In [None]:
import pandas as pd
import numpy as np
import re
import bisect
import seaborn as sns
import scipy
import scipy.cluster.hierarchy as sch
import plotly.express as px
import matplotlib.pyplot as plt
import networkx as nx
import plotly.figure_factory as ff
import plotly.graph_objs as go

from itertools import chain
from math import pi
from sklearn import preprocessing
from GGLasso.gglasso.problem import glasso_problem
from GGLasso.gglasso.helper.model_selection import ebic
from utils import transform_features, scale_array_by_diagonal
from utils import PCA

from matplotlib.colors import ListedColormap

from scipy import stats
from scipy.spatial import distance

from networkx.utils import cuthill_mckee_ordering

from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, HoverTool, LabelSet, PointDrawTool
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import RdBu, Blues8
from bokeh.models import HoverTool, Panel, Tabs, ColorBar, LinearColorMapper
from bokeh.layouts import row

# from latentcor import gen_data, get_tps, latentcor

In [None]:
def PCA(X, L, inverse=True):
    """Project the rows of ``X`` onto the scaled eigenvectors of ``L``.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to project. Only ``X.values`` is used, so it must have as many
        columns as ``L`` has rows.
    L : array_like
        Square matrix handed to ``np.linalg.eigh`` (which treats it as
        symmetric/Hermitian).
    inverse : bool, default True
        If True every kept eigenvector is scaled by ``1 / sqrt(eigenvalue)``,
        otherwise by ``sqrt(eigenvalue)``.

    Returns
    -------
    zu : numpy.ndarray
        ``X.values @ loadings``, one column per kept eigenvalue.
    loadings : numpy.ndarray
        Scaled eigenvectors, one column per kept eigenvalue.
    eigenvalues : numpy.ndarray
        Kept eigenvalues in descending order, rounded to 3 decimals
        (0-d when exactly one eigenvalue is kept, because of ``squeeze``).
    """
    sig, V = np.linalg.eigh(L)

    # eigh returns eigenvalues in ascending order; flip to descending
    sig = sig[::-1]
    V = V[:, ::-1]

    # Keep only numerically positive eigenvalues. A flat (1-D) index is
    # required here: the (k, 1) result of np.argwhere turned sig[ind] into a
    # 2-D array, so np.diag() extracted a single element and every eigenvector
    # ended up scaled by the largest eigenvalue only.
    ind = np.flatnonzero(sig > 1e-9)
    kept = sig[ind]

    # Scale each eigenvector (column) by its own eigenvalue factor.
    scale = np.sqrt(1 / kept) if inverse else np.sqrt(kept)
    loadings = V[:, ind] * scale

    # compute the projection
    zu = X.values @ loadings

    return zu, loadings, np.round(kept.squeeze(), 3)

In [None]:
def plotly_heatmap(z, x, y, title: str, x_label: str, y_label: str, zmin: int, zmax: int,
            height: int=1200, width: int=1200, colorscale: str='RdBu_r'):
    """Build a Plotly heatmap figure.

    ``z`` holds the cell values and ``x`` / ``y`` the tick labels. ``zmin`` and
    ``zmax`` fix the colour range, ``colorscale`` selects the palette, and
    ``height`` / ``width`` set the figure size. X tick labels are rotated by
    45 degrees. Returns the ``go.Figure``; nothing is displayed here.
    """
    trace = go.Heatmap(z=z, x=x, y=y, colorscale=colorscale, zmin=zmin, zmax=zmax)

    figure_layout = go.Layout(
        title=title,
        xaxis=dict(title=x_label),
        yaxis=dict(title=y_label),
        height=height,
        width=width,
        xaxis_tickangle=45,
    )

    return go.Figure(data=[trace], layout=figure_layout)

In [None]:
def _make_heatmap(data: pd.DataFrame, title: str = None, labels_dict: dict = None,
                  labels_dict_reversed: dict = None,
                  width: int = 1500, height: int = 1500, label_size: str = "5pt",
                  title_size: str = "24pt", not_low_rank: bool = True):
    """Draw a square matrix as a Bokeh heatmap made of unit quads.

    Parameters
    ----------
    data : pandas.DataFrame
        Matrix to plot. Values are coloured on a fixed [-1, 1] scale.
    title : str, optional
        Plot title.
    labels_dict : dict, optional
        Maps position (0..n-1) to the x-axis label. When omitted it is built
        from ``data.columns``.
    labels_dict_reversed : dict, optional
        Maps position to the y-axis label. When omitted it is ``labels_dict``
        in reverse order, matching the row reversal applied to ``data``.
    width, height : int
        Plot size in pixels.
    label_size, title_size : str
        Font sizes for the tick labels and the title.
    not_low_rank : bool
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    bokeh figure
    """
    # Both label dictionaries default to None; previously that crashed in
    # len()/.items(). Derive them from the data instead.
    if labels_dict is None:
        labels_dict = dict(enumerate(data.columns))
    if labels_dict_reversed is None:
        labels_dict_reversed = dict(enumerate(list(labels_dict.values())[::-1]))

    nlabels = len(labels_dict)
    # Ticks sit at cell centres (k + 0.5), so the label overrides use the same keys.
    shifted_labels_dict = {k + 0.5: v for k, v in labels_dict.items()}
    shifted_labels_dict_reversed = {k + 0.5: v for k, v in labels_dict_reversed.items()}

    df = data.iloc[::-1]  # reverse the row order so the first row is drawn at the top
    df = pd.DataFrame(df.stack(), columns=['covariance']).reset_index()
    df.columns = ["taxa_y", "taxa_x", "covariance"]
    df = df.replace({"taxa_x": labels_dict, "taxa_y": labels_dict})

    color_list, colors = _get_colors(df=df)
    # min_value = df['covariance'].min()
    # max_value = df['covariance'].max()
    # mapper = LinearColorMapper(palette=colors, low=min_value, high=max_value)
    mapper = LinearColorMapper(palette=colors, low=-1, high=1)
    color_bar = ColorBar(color_mapper=mapper, location=(0, 0))

    bottom, top, left, right = _get_bounds(nlabels=nlabels)

    source = ColumnDataSource(
        dict(top=top, bottom=bottom, left=left, right=right, color_list=color_list,
             taxa_x=df['taxa_x'], taxa_y=df['taxa_y'], covariance=df['covariance']))

    bokeh_tools = ["save, zoom_in, zoom_out, wheel_zoom, box_zoom, crosshair, reset, hover"]

    p = figure(plot_width=width, plot_height=height, x_range=(0, nlabels), y_range=(0, nlabels),
               title=title, title_location='above', x_axis_location="below",
               tools=bokeh_tools, toolbar_location='left')

    p.quad(top="top", bottom="bottom", left="left", right="right", line_color='white',
           color="color_list", source=source)
    p.xaxis.major_label_orientation = pi / 4
    p.yaxis.major_label_orientation = "horizontal"
    p.xaxis.major_label_text_font_size = label_size
    p.yaxis.major_label_text_font_size = label_size
    p.title.text_font_size = title_size
    p.add_layout(color_bar, 'right')
    p.toolbar.autohide = True
    # shift tick positions to the centre of each cell
    cell_centres = [x + 0.5 for x in range(nlabels)]
    p.xaxis.ticker = cell_centres
    p.yaxis.ticker = cell_centres
    p.xaxis.major_label_overrides = shifted_labels_dict
    p.yaxis.major_label_overrides = shifted_labels_dict_reversed

    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [
        ("taxa_x", "@taxa_x"),
        ("taxa_y", "@taxa_y"),
        ("covariance", "@covariance"),
    ]

    return p

In [None]:
def plot_network(G, title, width, height, node_size=None, amplify_x=10):
    """Render a networkx graph as an interactive Bokeh network plot.

    Parameters
    ----------
    G : networkx graph
        Every edge must carry a ``'covariance'`` attribute. Node attributes
        ``'color'`` (and ``'node_size'`` when requested) are written onto ``G``.
    title : str
        Plot title.
    width, height : int
        Plot size in pixels.
    node_size : optional
        When None, all nodes get a fixed size of 40. Any other value switches
        to degree-proportional sizes (15 * degree); the value itself is not used.
    amplify_x : number, default 10
        Factor applied to the absolute edge covariance to obtain the line width.

    Returns
    -------
    bokeh figure
    """
    # Establish which categories will appear when hovering over each node
    HOVER_TOOLTIPS = [("Character", "@index")]
    tools = ["save, zoom_in, zoom_out, wheel_zoom, box_zoom, crosshair, reset, hover, pan"]

    # Create a plot — set dimensions, toolbar, and title
    plot = figure(tooltips=HOVER_TOOLTIPS, plot_width=width, plot_height=height,
                  tools=tools, active_scroll='wheel_zoom',
                  x_range=Range1d(-10.1, 10.1),
                  y_range=Range1d(-10.1, 10.1), title=title)

    # Nodes whose name contains "ASV" are drawn in blue, all others in yellow.
    node_colors = {node: {'color': "#88CCEE" if "ASV" in node else "#DDCC77"}
                   for node in G.nodes()}
    nx.set_node_attributes(G, node_colors)

    if node_size is not None:
        n_degrees = {k: 15 * v for k, v in G.degree()}
        nx.set_node_attributes(G, n_degrees, 'node_size')
        node_size = 'node_size'
    else:
        node_size = 40

    network_graph = from_networkx(G, nx.spring_layout, scale=10, center=(0, 0))

    # Set node size and color
    network_graph.node_renderer.glyph = Circle(size=node_size, fill_color="color")

    # Edge width encodes the strength of the association, edge colour its sign
    # (green - positive, red - negative). The width uses the absolute value:
    # a raw negative covariance would yield a negative line width.
    edge_cov = [G.get_edge_data(a, b)['covariance'] for a, b in G.edges()]
    edge_data = network_graph.edge_renderer.data_source.data
    edge_data["line_width"] = [abs(cov) * amplify_x for cov in edge_cov]
    edge_data["line_color"] = ["#117733" if cov >= 0 else "#CC6677" for cov in edge_cov]
    network_graph.edge_renderer.glyph.line_width = {'field': 'line_width'}
    network_graph.edge_renderer.glyph.line_color = {'field': 'line_color'}

    # Add network graph to the plot
    plot.renderers.append(network_graph)

    # Place a text label next to every node, using the computed layout coordinates.
    x, y = zip(*network_graph.layout_provider.graph_layout.values())
    node_labels = list(G.nodes)
    source = ColumnDataSource({'x': x, 'y': y, 'asv': [node_labels[i] for i in range(len(x))]})
    labels = LabelSet(x='x', y='y', text='asv', x_offset=30, y_offset=-15, source=source, render_mode='canvas', text_font_size='12pt')

    plot.renderers.append(labels)

    return plot

In [None]:
def cluster_corr(corr_array, inplace=False):
    """
    Reorder a correlation matrix so that strongly correlated variables end up
    adjacent to each other.

    Rows are clustered hierarchically (complete linkage on their pairwise
    distances) and the tree is cut at half of the largest pairwise distance;
    rows and columns are then sorted by cluster label.

    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix
    inplace : bool, default False
        when False the reordering works on a copy of ``corr_array``

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix with the columns and rows rearranged
    """
    dists = sch.distance.pdist(corr_array)
    tree = sch.linkage(dists, method='complete')
    cut_height = dists.max() / 2
    cluster_labels = sch.fcluster(tree, cut_height, criterion='distance')
    order = np.argsort(cluster_labels)

    target = corr_array if inplace else corr_array.copy()

    if not isinstance(target, pd.DataFrame):
        return target[order, :][:, order]
    return target.iloc[order, :].T.iloc[order, :]


# fig = px.imshow(-1*cluster_corr(precision_SGL), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
# fig.update_layout(margin = dict(t=100,r=100,b=100,l=100), width = 1000, height = 1000,
#                  title='Clustered Estimated inverse covariance: ASVs', title_x=0.5)

In [None]:
def create_graph(corr_matrix: pd.DataFrame, threshold: float):
    """Build an undirected graph from the strong entries of a square matrix.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square matrix whose row/column labels become node names.
    threshold : float
        Minimum absolute value an entry needs to become an edge.

    Returns
    -------
    networkx.Graph
        One edge per retained off-diagonal entry, with the entry stored in the
        edge attribute ``'covariance'``.
    """
    # take the upper part only, so every pair of nodes is considered once
    upper = np.triu(np.ones(corr_matrix.shape)).astype(bool)
    masked = corr_matrix.where(upper)
    # Stack the masked frame (the original stacked the unmasked matrix, so the
    # mask had no effect). Masked cells are NaN; any that survive stacking fail
    # the threshold comparison below and are dropped there.
    df = pd.DataFrame(masked.stack(), columns=['covariance']).reset_index()
    df.columns = ["source", "target", "covariance"]

    # keep only entries whose magnitude reaches the threshold
    df = df[abs(df['covariance']) >= threshold]
    # remove diagonal entries
    df = df[df['source'] != df['target']]
    # remove zero entries (only relevant when threshold is 0)
    df = df[df['covariance'] != 0]

    # build graph
    G = nx.from_pandas_edgelist(df, edge_attr="covariance")

    return G

In [None]:
def project_covariates(counts=pd.DataFrame(), metadata=pd.DataFrame(), L=np.ndarray, y=str, PC=0):
    """Scatter one principal component of ``L`` against a metadata column.

    Parameters
    ----------
    counts : pandas.DataFrame
        Data that is projected; rows containing NaN are dropped first.
    metadata : pandas.DataFrame
        Covariates, joined onto the projection by index.
    L : numpy.ndarray
        Matrix passed to ``PCA`` to obtain the components.
    y : str
        Name of the metadata column shown on the y-axis.
    PC : int, default 0
        Zero-based index of the component shown on the x-axis.

    Returns
    -------
    bokeh layout
        The scatter plot next to a colour bar for sequencing depth.

    Notes
    -----
    Sequencing depth is computed from the notebook-global ``raw`` count table,
    so that variable must exist before this function is called.
    """
    counts_complete = counts.dropna()
    proj, loadings, eigv = PCA(counts_complete, L, inverse=True)
    # PCA squeezes its eigenvalues, so a single component arrives as a 0-d array
    eigv = np.atleast_1d(eigv)
    eigv_sum = np.sum(eigv)
    var_exp = [(value / eigv_sum) for value in sorted(eigv, reverse=True)]

    depth = pd.DataFrame(data=raw.sum(axis=0), columns=["sequencing depth"])
    metadata = depth.join(metadata)

    # Name one column per component actually returned by PCA. np.linalg.matrix_rank
    # uses a different tolerance and could disagree with that number.
    n_components = proj.shape[1]
    pc_columns = ['PC{0} ({1}%)'.format(i + 1, str(100 * var_exp[i])[:4]) for i in range(n_components)]
    # Index by the rows that were actually projected (NaN rows were dropped above).
    df_proj = pd.DataFrame(proj, columns=pc_columns, index=counts_complete.index)
    df = df_proj.join(metadata)

    varName1 = 'PC{0} ({1}%)'.format(PC+1, str(100 * var_exp[PC])[:4])
    varName2 = y
    # varName2 = 'PC{0} ({1}%)'.format(PC+2, str(100 * var_exp[1])[:4])
    df['x'] = df[varName1]
    df['y'] = df[varName2]

    source = ColumnDataSource(df)

    # Column names contain spaces and punctuation, so the tooltip field
    # references need the @{...} form to resolve.
    p0 = figure(tools='save, zoom_in, zoom_out, wheel_zoom, box_zoom, reset', plot_width=800, plot_height=800,
                active_scroll="wheel_zoom",
                x_axis_label=varName1, y_axis_label=varName2,
                tooltips=[(varName1, "@{" + varName1 + "}"),
                          (varName2, "@{" + varName2 + "}")
                          ],
                title=varName1 + " vs " + varName2)

    blues = plt.get_cmap('Blues_r')
    cmap = ListedColormap(blues(np.arange(256)))
    # Create a list of hex color codes from the colormap
    colors = [cmap(i)[:3] for i in range(256)]
    colors = ['#' + ''.join([format(int(c * 255), '02x') for c in color]) for color in colors]
    colors = colors[::-1]  # reverse the order of the 'Blues_r' palette
    exp_cmap = LinearColorMapper(palette=colors, low=depth.values.min(), high=depth.values.max())

    #exp_cmap = LinearColorMapper(palette=Blues8[::-1], low=min(df['sequencing depth'].values), high=max(df['sequencing depth'].values))
    p0.circle('x', 'y', source=source, size=15, line_color=None, fill_color={"field": "sequencing depth", "transform": exp_cmap}, fill_alpha=0.3)

    # A separate, empty figure only hosts the colour bar.
    color_bar_plot = figure(title='sequencing depth', title_location="right",
                            height=500, width=150, toolbar_location=None, min_border=0,
                            outline_line_color=None)

    bar = ColorBar(color_mapper=exp_cmap, location=(1, 1))

    color_bar_plot.add_layout(bar, 'right')
    color_bar_plot.title.align = "center"
    color_bar_plot.title.text_font_size = '12pt'

    layout = row(p0, color_bar_plot)

    return layout

In [None]:
def add_labels(df):
    """Rename 32-character column names of ``df`` to ``ASV_1``, ``ASV_2``, ...

    The frame is modified in place and also returned. For every renamed column
    the mapping ``new name -> original name`` is recorded in the module-level
    dictionary ``id_dict``, which has to exist before the call.
    """
    renames = {}
    counter = 1
    for original_name in df.columns:
        # 32 characters is the length of an ASV identifier
        if len(original_name) != 32:
            continue
        short_name = "ASV_{0}".format(counter)
        id_dict[short_name] = original_name
        # first assignment wins, as with one rename per column
        renames.setdefault(original_name, short_name)
        counter += 1

    df.rename(columns=renames, inplace=True)
    return df

In [None]:
def _get_bounds(nlabels: int):
    bottom = list(chain.from_iterable([[ii] * nlabels for ii in range(nlabels)]))
    top = list(chain.from_iterable([[ii + 1] * nlabels for ii in range(nlabels)]))
    left = list(chain.from_iterable([list(range(nlabels)) for ii in range(nlabels)]))
    right = list(chain.from_iterable([list(range(1, nlabels + 1)) for ii in range(nlabels)]))

    return bottom, top, left, right

In [None]:
def _get_colors(df: pd.DataFrame()):
    """Map every value in ``df.covariance`` to a hex colour.

    The palette is the 256-step matplotlib 'RdBu' colormap in reversed order,
    laid out over evenly spaced breakpoints starting at -1 with step 1/128.

    Returns
    -------
    color_list : list of str
        One hex colour per value of ``df.covariance``.
    colors : list of str
        The full 256-colour palette.
    """
    base_map = ListedColormap(plt.get_cmap('RdBu')(np.arange(256)))

    # Create a list of hex color codes from the colormap
    palette = []
    for level in range(256):
        rgb = base_map(level)[:3]
        palette.append('#' + ''.join([format(int(channel * 255), '02x') for channel in rgb]))
    palette.reverse()  # red - positive, blue - negative

    breakpoints = np.arange(-1, 1, 1 / (len(palette) / 2))

    color_list = []
    for value in df.covariance.values:
        slot = bisect.bisect_left(breakpoints, value)
        # values at or below the first breakpoint share the first colour
        color_list.append(palette[max(slot, 1) - 1])

    return color_list, palette

In [None]:
def create_label_dict(df):
    """Return two position -> column-name dictionaries for ``df``.

    The first follows the column order of ``df``; the second lists the same
    names in reverse order.
    """
    names = list(df.columns)
    labels_dict = dict(enumerate(names))
    labels_dict_reversed = dict(enumerate(reversed(names)))

    return labels_dict, labels_dict_reversed

In [None]:
def scater_plot(x, y, width=800, height=600, size=3):
    """Return a Bokeh scatter plot of ``y`` against ``x``.

    ``x`` and ``y`` must expose a ``name`` attribute, which is used as the axis
    label. Markers are drawn at three times ``size``; ``width`` and ``height``
    set the plot size.
    """
    points = ColumnDataSource({'x': x, 'y': y})
    toolbar_tools = ["save, zoom_in, zoom_out, wheel_zoom, box_zoom, crosshair, reset, hover"]

    plot = figure(plot_width=width, plot_height=height, tools=toolbar_tools, toolbar_location='left')
    plot.circle("x", "y", size=3*size, source=points, line_color=None)

    plot.xaxis.axis_label = x.name
    plot.yaxis.axis_label = y.name

    return plot

# Data preprocessing

In [None]:
# Load the count table and drop features that are almost always zero.
# The comments below treat the columns of the transposed table as ASVs, i.e.
# `raw` is presumably ASVs x samples — TODO confirm against the file.
# raw = pd.read_csv('data/composition_feature-table.tsv', sep='\t', index_col = 0)

raw = pd.read_csv("data/atacama_counts.tsv", sep='\t', index_col = 0)

# True if at least one column of `raw` consists of zeros only
print("Some columns contain only zeros:", (raw == 0).all().any())

raw_T = raw.T 

### zero-inflation per ASV across all samples
# fraction of zero entries in each column of raw_T
zero_perc = (raw_T == 0).mean()

# mask = zero_perc > 0.8

# flag columns of raw_T with more than 90% zeros
mask = zero_perc > 0.9

raw_filt_T = raw_T.drop(columns=zero_perc[mask].index)

# back to the original orientation
raw_filt = raw_filt_T.T

print("Some columns contain only zeros:", (raw_filt == 0).all().any())

# columns that became all-zero after the filtering above
zero_cols = [col for col in raw_filt.columns if all(raw_filt[col] == 0)]

raw_filt_final = raw_filt.drop(zero_cols, axis=1)

print("Some columns contain only zeros:", (raw_filt_final == 0).all().any())

raw_filt_final

In [None]:
# Read the taxonomy table and derive per-feature species and genus labels.
taxa = pd.read_csv("data/taxonomy.tsv", sep="\t", index_col=0)

# Rank names in order; only the keys are used below (as column names).
taxonomy_levels = {"domain": '^d__', 
                   "phylum": '^p__', 
                   "class": '^c__', 
                   "order": '^o__', 
                   "family": '^f__',
                   "genus": '^g__', 
                   "species": '^s__'
                  }

# split taxonomic ranks in different columns
taxa_sep = taxa['Taxon'].str.split(';', expand=True)

#rename taxonomic ranks with full names
# (requires the split to produce exactly as many columns as there are ranks)
taxa_sep.columns = taxonomy_levels.keys()

# drop missing species
taxa_sep = taxa_sep[taxa_sep.species.notnull()]

# remove blank spaces from taxonomic ranks
taxa_sep[taxa_sep.columns] = taxa_sep.apply(lambda x: x.str.strip())

# NOTE: not the last expression of the cell, so this value is not displayed
taxa_sep.shape

# strip the leading "s" from the species names
# (lstrip('s') removes every leading 's' character, not just the first one)
taxa_sep['species'] = taxa_sep['species'].map(lambda x: x.lstrip('s'))
# species label = genus string followed by the remaining species string
taxa_sep['species'] = taxa_sep['genus'] + taxa_sep['species']

# taxonomy-table index -> combined genus+species label
species = taxa_sep['species'].to_dict()

# taxonomy-table index -> genus string
species_names = dict(taxa_sep['genus'])

# test = df.copy()

# test = test.rename(columns=species_names)
# test.columns

# new_col_names = {col: f"ASV_{i}" for i, col in enumerate(test.columns) if len(col) == 32}

# # Use the `rename()` method to change the column names using the dictionary
# test = test.rename(columns=new_col_names)

# test.to_csv("data/asv_covariates.csv")

# taxa_dict = dict()

# for level in taxa_sep.columns:
#     df_level = raw.join(taxa_sep[level])
#     df_level = df_level.groupby(level).sum()
#     taxa_dict[level] = df_level
    
# taxa_dict["ASV"] = raw.copy()
    
# taxa_dict.keys()

In [None]:
# Replace the row labels with species names; labels missing from `species`
# become 'unknown species'.
# NOTE: this overwrites the index of raw_filt_final, so re-running the cell
# maps already-translated labels again.
raw_filt_final.index = raw_filt_final.index.map(species)
raw_filt_final.index = raw_filt_final.index.fillna('unknown species')

# Number the placeholder labels ("unknown species 1", "unknown species 2", ...)
# so they can be told apart; all other labels are kept as they are.
new_index = []
i = 1
for idx in raw_filt_final.index:
    if idx == "unknown species":
        new_idx = f"unknown species {i}"
        i += 1
    else:
        new_idx = idx
    new_index.append(new_idx)
raw_filt_final.index = new_index

In [None]:
# (rows, columns) of the filtered and relabelled count table
raw_filt_final.shape

In [None]:
# Two-stop colour scale: white at the low end, dark purple at the high end.
colorscale = [[0, 'rgb(255, 255, 255)'], [1, 'rgb(105, 0, 95)']]

# Heatmap of the filtered counts.
# NOTE(review): the colour range is fixed to [0, 1], so every value of 1 or more
# is drawn in the top colour — presumably intended as a presence/absence view; confirm.
fig_raw = plotly_heatmap(z=raw_filt_final, x=raw_filt_final.columns, y=raw_filt_final.index, zmin = 0, zmax = 1,
                   title='Data Heatmap', x_label='Samples', y_label ='Taxa', width=1400, height=900, colorscale= colorscale)

fig_raw.show()

# fig_raw.write_image("plots/raw_heatmap.png")

In [None]:
# fraction of zero entries for each row of raw_filt_final
(raw_filt_final.T == 0).mean()

In [None]:
# Static heatmap of the filtered counts, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(18, 8))

sns.heatmap(raw_filt_final, cmap='coolwarm')
plt.show()

In [None]:
# count_dict = dict()

# for key, item in taxa_dict.items():
#     print("\n", key)
    
#     df = taxa_dict[key]
#     if (df == 0).all().any():
        
#         zero_cols = df.columns[(df == 0).all()]
#         print("{0} samples are dropped since there is no variance". format(list(zero_cols)))

#         df = df.drop(zero_cols, axis=1)
#         print("Shape BEFORE aggregation:", item.shape)
#         print("Shape AFTER aggregation:", df.shape)
#     print("Some columns contain only zeros:", (df == 0).all().any())
    
#     # # # # calculate percentage of zeros for each row
#     percentage_zeros = (df == 0).sum(axis=1) / len(df.columns) * 100
        
#     # drop rows with more than 80% of zeros
#     df_filt = df[percentage_zeros <= 80]
#     if (df_filt == 0).all().any():
        
#         zero_cols = df_filt.columns[(df_filt == 0).all()]
#         print("{0} samples are dropped since there is no variance". format(list(zero_cols)))

#         df_filt = df_filt.drop(zero_cols, axis=1)
#         print("Shape BEFORE filtering:", df.shape)
#         print("Shape AFTER filtering:", df_filt.shape)
#     print("Some columns contain only zeros:", (df_filt == 0).all().any())
        
#     count_dict[key] = df_filt

In [None]:
# for key, item in count_dict.items():
#     if item.shape[0] > 1:
#         # Create the scatter matrix
#         fig = ff.create_scatterplotmatrix(
#             item.T, 
#             diag='histogram',
#             height=1600, width=1600,
#             title=key
#         )

#         # Show the plot
#         fig.write_image("plots/{}_scatter.png".format(key))
# fig = ff.create_scatterplotmatrix(
#             raw_filt_final.T, 
#             diag='histogram',
#             height=1600, width=1600,
#             title=key
#         )

# fig.show()

In [None]:
### clr-transformation
# Earlier variants applied the same transformation to other tables:
# clr = transform_features(raw, transformation="mclr")
# clr = transform_features(phylum, transformation="mclr")
# clr = transform_features(phylum_filt, transformation="mclr")
# clr = transform_features(count_dict['ASV'], transformation="mclr")

# "mclr" transformation of the filtered counts via the project helper in utils
clr = transform_features(raw_filt_final, transformation="mclr")
# clr.isna().any()

In [None]:
# colorscale = [[0, 'rgb(255, 255, 255)'], [1, 'rgb(20, 83, 145)']]

# fig_clr = plotly_heatmap(z=clr, x=clr.columns, y=clr.index, zmin = 0, zmax = 1,
#                    title='Data Heatmap', x_label='Samples', y_label ='Taxa', width=1400, height=900, colorscale= colorscale)

# fig_clr.show()

# # fig_raw.write_image("plots/raw_heatmap.png")

In [None]:
import plotly.offline as pyo

def heatmap(z, x, y, title: str, x_label: str, y_label: str, zmin: int, zmax: int,
            height: int=1200, width: int=1200):
    """Build a Plotly heatmap figure with the 'RdBu_r' colour scale.

    ``z`` holds the cell values and ``x`` / ``y`` the tick labels. ``zmin`` and
    ``zmax`` fix the colour range; ``height`` and ``width`` set the figure size.
    Returns the ``go.Figure``; nothing is displayed here.
    """
    trace = go.Heatmap(z=z, x=x, y=y, colorscale='RdBu_r', zmin=zmin, zmax=zmax)

    figure_layout = go.Layout(
        title=title,
        xaxis=dict(title=x_label),
        yaxis=dict(title=y_label),
        height=height,
        width=width,
    )

    return go.Figure(data=[trace], layout=figure_layout)

In [None]:
# smallest value in the transformed table
clr.values.min()

In [None]:
# Heatmap of the transposed transformed table, coloured over its full value range.
# NOTE(review): x uses clr.index but is titled 'Samples', y uses clr.columns but is
# titled 'Taxa'. A later cell joins clr.T with the metadata "by sample id", which
# suggests clr.columns are the samples — the two axis titles look swapped; confirm.
fig_X = heatmap(z=clr.T, x=clr.index, y=clr.columns, zmin = clr.values.min(), zmax = clr.values.max(),
                title='mclr-transformed data', x_label='Samples', y_label ='Taxa')

# Display the heatmap
pyo.iplot(fig_X)

In [None]:
# Static heatmap of the transformed table, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(18, 8))

sns.heatmap(clr, cmap='coolwarm')
plt.show()

In [None]:
# Load the sample metadata, scale the selected covariates, join them to the
# transformed counts and compute the two correlation matrices used below.
meta = pd.read_csv('data/acm_meta.tsv', sep='\t', index_col = 0)

# select only numeric features
# (keeps the columns whose first row — the header row dropped further down —
# is not 'categorical')
meta = meta.loc[:, meta.iloc[0, :] != 'categorical']
# values that cannot be parsed as numbers become NaN
meta = meta.apply(pd.to_numeric, errors='coerce')

### select most interesting columns
selected_columns = ['ph', 'toc', 'ec', 'average-soil-relative-humidity', 'average-soil-temperature']

meta = meta[selected_columns]

# drop QIIME2 header
meta = meta.iloc[1:]
# fill missing values with zeros
# NOTE: done before scaling, so the imputed zeros enter the means and standard deviations
meta = meta.fillna(0)

#scale data
scaler = preprocessing.StandardScaler().fit(meta)
meta_scaled = scaler.transform(meta)
meta_scaled = pd.DataFrame(meta_scaled, index=meta.index, columns=meta.columns)
# meta_scaled.to_csv("data/meta_scaled.csv")

# transpose count data
clr_T = clr.T
# join by sample id
# (index join; rows of clr_T without a match in meta_scaled get NaN covariates)
df = clr_T.join(meta_scaled)
# df.to_csv("data/asv_meta.csv", index=True)

# Rename long feature IDs with concise names
# (the renaming itself is commented out, so vis_df is currently a plain copy of df)
vis_df = df.copy()
# id_dict = dict()
# vis_df = add_labels(vis_df)

#calculate covariance
# the covariates are the last n_cov columns of df; everything before them is `asv`
n_cov = meta_scaled.shape[1]
asv = df.iloc[:, :-n_cov]
# asv.to_csv("data/asv.csv", index=True)

### latent corr
# clean_types_asv = get_tps(asv)
# print(clean_types_asv)
# lat_cor_asv = latentcor(asv, tps = clean_types_asv, method ='original', use_nearPD=False)
# with open('data/lat_corr.npy', 'rb') as f:
#     corr = np.load(f)

# covariance between the columns of `asv`, normalised by N (bias=True)
S = np.cov(asv.T.values, bias=True)

# # # correlation between ASVs ONLY
# scale_array_by_diagonal is a helper from utils; presumably it rescales the
# covariance matrix to unit diagonal — confirm in utils
corr = scale_array_by_diagonal(S)
# corr = lat_cor_asv["R"].values

# # #add labels
asv_names = vis_df.iloc[:, :-n_cov].columns
vis_S = pd.DataFrame(corr, columns=asv_names, index=asv_names)


# clean_types_all = get_tps(df)
# print(clean_types_all)
# lat_cor_all = latentcor(df, tps = clean_types_all, method ='original', use_nearPD=False)

# corr_meta = lat_cor_all["R"].values
# with open('data/lat_corr_meta.npy', 'rb') as f:
#     corr_meta = np.load(f)
# vis_S_meta = pd.DataFrame(corr_meta, columns=vis_df.columns, index=vis_df.columns)

# # # # # correlation between ASVs and covariates
# same computation on all columns of df (ASVs followed by covariates)
S_meta = np.cov(df.T.values, bias=True)
corr_meta = scale_array_by_diagonal(S_meta)
vis_S_meta = pd.DataFrame(corr_meta, columns=vis_df.columns, index=vis_df.columns)

In [None]:
# per-covariate mean and scale learned by the StandardScaler fitted on `meta`
mean_values = scaler.mean_
std_values = scaler.scale_

std_values

In [None]:
# One histogram panel per scaled covariate. The panel count follows the number
# of columns in meta_scaled instead of a hard-coded 5, so changing the list of
# selected covariates does not break this cell.
n_panels = meta_scaled.shape[1]
fig, axis = plt.subplots(n_panels, 1, figsize=(5, 12))
# meta.hist(ax=axis,  color='#5C1360')
meta_scaled.hist(ax=axis,  color='#5C1360')

# fig.savefig('plots/meta_unscaled.png')
# fig.savefig('plots/meta_scaled.png')

In [None]:
# Histogram of the second column of the transformed table, written to disk.
# Assumes the 'plots/' directory already exists.
clr.iloc[:, 1].plot.hist(bins=10, alpha=1, color='#5C1360').get_figure().savefig('plots/mclr_count.png')

In [None]:
# Histogram of the second column of the filtered counts, written to disk.
# Assumes the 'plots/' directory already exists.
raw_filt_final.iloc[:, 1].plot.hist(bins=10, alpha=1, color='#5C1360').get_figure().savefig('plots/raw_count.png')

In [None]:
# Static heatmap of the ASV-only matrix `corr`.
fig, ax = plt.subplots(figsize=(10, 10))

sns.heatmap(corr, cmap='coolwarm')
plt.show()

In [None]:
# Static heatmap of `corr_meta`, the matrix covering ASVs and covariates.
fig, ax = plt.subplots(figsize=(10, 10))

sns.heatmap(corr_meta, cmap='coolwarm')
plt.show()

In [None]:
# Build three Bokeh heatmaps: ASVs only (p0), covariates only (p1) and the full
# ASV + covariate matrix (p2). Nothing is shown here; see the show() calls below.
width = 1500
height = 1500
label_size = "36pt"
lables_0, re_labels_0 = create_label_dict(vis_S)

p0 = _make_heatmap(data=vis_S, labels_dict=lables_0, labels_dict_reversed=re_labels_0,
                       title="Correlation: ASVs", width=width, height=height,
                       label_size=label_size)

# bottom-right block of the full matrix: the last n_cov rows and columns (the covariates)
meta_corr = vis_S_meta.iloc[-n_cov:, -n_cov:]
lables_1, re_labels_1 = create_label_dict(meta_corr)

p1 = _make_heatmap(data=meta_corr, labels_dict=lables_1, labels_dict_reversed=re_labels_1,
                       title="Correlation: covariates", width=width, height=height,
                       label_size=label_size)

# drop highly correlated covariates
# hcorr_cov = ['relative-humidity-soil-high', 'relative-humidity-soil-low', 'percent-relative-humidity-soil-100', 'temperature-soil-high', 'temperature-soil-low']
# hcorr_cov = ['relative-humidity-soil-low', 'percent-relative-humidity-soil-100', 'temperature-soil-high', 'temperature-soil-low']


# for frame in [vis_S_meta, df, vis_df]:
#     frame.drop(hcorr_cov, axis=1, inplace=True)
#     frame.rename(columns={'average-soil-relative-humidity':'average humidity','average-soil-temperature': 'average temperature',}, inplace=True)

# NOTE: reassigns vis_S_meta to its own transpose, so each re-run of this cell flips it again
vis_S_meta = vis_S_meta.T
# vis_S_meta.drop(hcorr_cov, axis=1, inplace=True)
# vis_S_meta.rename(columns={'average-soil-relative-humidity':'average humidity','average-soil-temperature': 'average temperature',}, inplace=True)

# number of covariate columns, recomputed from the frame shapes
n_cov = df.shape[1] - asv.shape[1]
lables_2, re_labels_2 = create_label_dict(vis_S_meta)

p2 = _make_heatmap(data=vis_S_meta, labels_dict=lables_2, labels_dict_reversed=re_labels_2,
                       title="Correlation: ASVs + covariates", width=width, height=height,
                       label_size=label_size)

# show(p0)
# show(p1)
# show(p2)

In [None]:
# Cluster the ASV matrix with seaborn (average linkage) and reuse the resulting
# row/column order for a Bokeh heatmap.
g_vis_S = sns.clustermap(vis_S, method='average', cmap='RdBu', center=0, dendrogram_ratio=0.2, robust=True, cbar_pos=None)

# get the order of the rows and columns
row_order_vis_S = g_vis_S.dendrogram_row.reordered_ind
col_order_vis_S = g_vis_S.dendrogram_col.reordered_ind

vis_S_clust = vis_S.iloc[row_order_vis_S, col_order_vis_S]

lables_vis_S_clust, re_labels_vis_S_clust = create_label_dict(vis_S_clust)

# NOTE(review): an earlier comment on this call mentioned multiplying by 3 to make
# edges visible; no scaling is applied here, the matrix is plotted as is.
p_vis_S_clust = _make_heatmap(data=vis_S_clust, labels_dict=lables_vis_S_clust, labels_dict_reversed=re_labels_vis_S_clust,
                       title="Clustered correlation", width=width, height=height,
                       label_size=label_size)

show(p_vis_S_clust)

In [None]:
def remove_after_underscore(dct):
    """Cut every value of ``dct`` at its first underscore.

    Values without an underscore are left untouched. The dictionary is
    modified in place and also returned.
    """
    shortened = {key: value.split('_')[0] for key, value in dct.items() if '_' in value}
    dct.update(shortened)
    return dct

In [None]:
# Heatmap of a precomputed correlation matrix read from disk.
width = 1500
height = 1500
label_size = "24pt"

latcorr = pd.read_csv("data/showcase_latent_corr.csv", index_col=0)

lables_3, re_labels_3 = create_label_dict(latcorr)

# shorten the axis labels: keep only the part before the first underscore
lables_3 = remove_after_underscore(lables_3)
re_labels_3 = remove_after_underscore(re_labels_3)

p3 = _make_heatmap(data=latcorr, labels_dict=lables_3, labels_dict_reversed=re_labels_3,
                       title="Correlation: ASVs + covariates", width=width, height=height,
                       label_size=label_size)
show(p3)

# Models

In [None]:
# Fit three graphical-lasso problems with eBIC model selection:
#   P_SGL       - ASV correlation matrix, latent=False
#   P_SGL_low   - ASV correlation matrix, latent=True
#   P_SGL_adapt - ASV + covariate matrix, latent=False, with an entry-wise lambda1 mask
N = asv.shape[0]
p = asv.shape[1]
print("Shape of data without covariates: {0}, {1}".format(N, p))

N_meta = df.shape[0]
p_meta = df.shape[1]
print("Shape of data with covariates: {0}, {1}".format(N_meta, p_meta))

#hyperparameters
# lambda1_range = np.logspace(0, -4, 15)
# mu1_range = np.logspace(0.9, 0.4, 10)

# 50 log-spaced lambda1 values from 1 down to 1e-3
lambda1_range = np.logspace(0, -3, 50)
# mu1_range = np.logspace(-2, -2.5, 10)
### for 0.9
# 10 log-spaced mu1 values from 1 down to 1e-2
mu1_range = np.logspace(0, -2, 10)

### for 0.8
# mu1_range = np.logspace(-0.2, -0.5, 10)

# lambda1_range = np.logspace(0, -1, 15)
# mu1_range = np.logspace(-0.5, -4, 10)
# lambda1_range = np.logspace(0, -2, 30)
# mu1_range = np.logspace(-0.5, -2, 10)
# lambda1_range = np.logspace(-1, -4, 30)
# mu1_range = np.logspace(-0.5, -2, 10)
modelselect_params = {'lambda1_range': lambda1_range, 'mu1_range': mu1_range}

P_SGL = glasso_problem(corr, N, latent=False, do_scaling=False)
P_SGL.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.01, store_all=True)

P_SGL_low = glasso_problem(corr, N, latent=True, do_scaling=False)
P_SGL_low.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.01, store_all=True)

# create lambda matrix full of zeros
# NOTE: `mask` is reused here; in the preprocessing cell it held the zero-fraction filter
shape_meta = (p_meta, p_meta)
mask = np.zeros(shape_meta)
# add small constant, so ADMM could converge
mask = mask + 0.01
# heavy penalize species
# the ASV x ASV block ends up at 0.01 + (1 - 0.01) = 1.0; all other entries stay at 0.01
n_bugs = len(asv.columns)
bugs_block = np.ones((n_bugs, n_bugs))
mask[0:n_bugs, 0:n_bugs] += bugs_block - 0.01
lambda1_mask_exp = mask
df_mask_exp = pd.DataFrame(lambda1_mask_exp, columns=vis_df.columns, index=vis_df.columns)

# the mask is added to the shared parameter dict only now, after the first two
# problems have been solved, so it applies to P_SGL_adapt only
modelselect_params["lambda1_mask"] = lambda1_mask_exp
P_SGL_adapt = glasso_problem(vis_S_meta.values, N_meta, latent=False, do_scaling=False)
P_SGL_adapt.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.01, store_all=True)

In [None]:
# Report the regularisation parameters stored on each fitted problem.
print("SGL solution with lambda={lambda1} and mu={mu1}".format(**P_SGL.reg_params))
# P_SGL_adapt is created with latent=False, so it is the adaptive SGL model
# without a low-rank component; the label previously said "SGL+low-rank".
print("Adaptive SGL solution with lambda={lambda1} and mu={mu1}".format(**P_SGL_adapt.reg_params))
print("SGL+low-rank solution with lambda={lambda1} and mu={mu1}".format(**P_SGL_low.reg_params))

In [None]:
# model-selection statistics stored on the SGL problem
P_SGL.modelselect_stats

In [None]:
# eBIC of the selected SGL precision matrix, using the same gamma as the model selection above
ebic(S=corr, Theta=P_SGL.solution.precision_, N=N, gamma=0.01)

In [None]:
def calculate_ebic(lambda1_range, P_SGL, P_SGL_low, P_SGL_adapt, gamma_list=None):
    """
    Tabulate eBIC and sparsity for three solvers over a lambda path and a gamma grid.

    For every solver and every index ``i`` of ``lambda1_range`` the precision
    matrix ``sol._all_theta[i, 0]`` and the sparsity ``modelselect_stats["SP"][i][0]``
    are read, i.e. only the first entry along the second (mu) axis is used.
    The eBIC is then evaluated for each gamma.

    Args:
        lambda1_range (sequence): Lambda values, in the order used for model selection.
        P_SGL (glasso_problem obj): Solved problem, reported as "sgl".
        P_SGL_low (glasso_problem obj): Solved problem, reported as "low".
        P_SGL_adapt (glasso_problem obj): Solved problem, reported as "adapt".
        gamma_list (sequence, optional): Gamma values to evaluate. Defaults to
            [0.001, 0.01, 0.1, 0.15, 0.25, 0.5, 0.7, 1].

    Returns:
        pandas.DataFrame: One row per (solver, lambda, gamma) with columns
        'solver', 'lambda', 'gamma', 'eBIC' and 'SP'.

    Side effects:
        Sets a ``label`` attribute on each of the three solver objects.
    """
    if gamma_list is None:
        gamma_list = [0.001, 0.01, 0.1, 0.15, 0.25, 0.5, 0.7, 1]

    # The solver objects are tagged with the label that ends up in the 'solver' column.
    P_SGL.label = "sgl"
    P_SGL_low.label = "low"
    P_SGL_adapt.label = "adapt"

    columns = ['solver', 'lambda', 'gamma', 'eBIC', 'SP']
    records = []

    for sol in [P_SGL, P_SGL_low, P_SGL_adapt]:
        sigma = sol.solution.sample_covariance_
        n = sol.solution.n_samples

        for i, lambda1 in enumerate(lambda1_range):
            # index 0 on the second axis: first entry of the mu grid
            theta = sol._all_theta[i, 0]
            sparsity = sol.modelselect_stats["SP"][i][0]

            for gamma in gamma_list:
                records.append({
                    'solver': sol.label,
                    'lambda': lambda1,
                    'gamma': gamma,
                    'eBIC': ebic(S=sigma, Theta=theta, N=n, gamma=gamma),
                    'SP': sparsity,
                })

    # Explicit columns keep the schema stable even when no rows were produced.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Build the long-format eBIC table for all three solvers.
result_df = calculate_ebic(lambda1_range, P_SGL, P_SGL_low, P_SGL_adapt)
# Show the rows whose lambda equals exactly 1.
result_df[result_df["lambda"] == 1]

In [None]:
# l_list = []
# g_list = []
# ebic_list = []

# for l1 in np.unique(test["lambda"]):
#     a = test[test['lambda'] == l1]
    
#     b = np.array(a["lambda"])
#     d = np.array(a["gamma"])
#     e = np.array(a["eBIC"])
    
#     l_list.append(b)
#     g_list.append(b)
#     ebic_list.append(b)

# l_list = np.array(l_list)
# g_list = np.array(g_list)
# ebic_list = np.array(ebic_list)
# l_list.shape, g_list.shape, ebic_list.shape

In [None]:
# One interactive 3-D scatter (lambda, gamma, eBIC) per solver, written to an HTML file.
for solver in result_df["solver"].unique():
    solver_rows = result_df[result_df["solver"] == solver]
    fig = px.scatter_3d(solver_rows, x='lambda', y='gamma', z='eBIC')

    # Axis titles; the lambda axis is drawn on a log scale.
    scene_settings = dict(
        xaxis_title='log(lambda)',
        yaxis_title='gamma',
        zaxis_title='eBIC',
        xaxis_type="log",
    )
    # Figure size is given in pixels.
    fig.update_layout(scene=scene_settings, title=solver, width=1000, height=1000)

    # Save the figure (nothing is displayed inline here).
    output_path = "plots/{0}_ebic_3D.html".format(solver)
    fig.write_html(output_path)

In [None]:
# Rows of the plain SGL solver; defined here so this cell does not depend on a later cell.
test = result_df[result_df["solver"] == 'sgl']

gamma_list = []
lambda_list = []

for l1 in test['lambda'].unique():
    # Rows belonging to the current lambda only
    filtered_df = test[test['lambda'] == l1]

    # gamma at which eBIC is smallest for this lambda (not over the whole table)
    min_gamma = filtered_df.loc[filtered_df['eBIC'].idxmin(), 'gamma']
    lambda_list.append(l1)
    gamma_list.append(min_gamma)

In [None]:
# Number of collected gamma values (one per lambda).
len(gamma_list)

In [None]:
# Keep only the rows of the plain SGL solver.
test = result_df[result_df["solver"] == 'sgl']

# Smallest eBIC (over all gammas) for each lambda of the SGL solver
min_eBIC_values = test.groupby(["lambda"])['eBIC'].min()
min_eBIC_values

In [None]:
# Number of distinct lambda values in the SGL table.
len(test["lambda"].unique())

In [None]:
# Per-solver heatmap of log(eBIC) over log(lambda) (x) and gamma (y).
for solver in result_df["solver"].unique():
    df = result_df[result_df["solver"] == solver]

    ebic_heatmap = go.Heatmap(
        x=np.log(df['lambda']),
        y=df['gamma'],
        z=np.log(df['eBIC']),
        colorscale='Viridis',
        colorbar=dict(title='log(eBIC)'),
    )
    fig = go.Figure(data=ebic_heatmap)

    # Title, axis labels and figure size (pixels)
    fig.update_layout(
        title=solver,
        xaxis_title='log(lambda)',
        yaxis_title='gamma',
        width=900,
        height=600,
    )

    fig.show()

In [None]:
# Per-solver heatmap of log(eBIC) with a red line through the eBIC-minimizing gamma of every lambda.
for solver in result_df["solver"].unique():
    df = result_df[result_df["solver"] == solver]
    # Create heatmap plot
    fig = go.Figure(data=go.Heatmap(
        x=np.log(df['lambda']),
        y=df['gamma'],
        z=np.log(df['eBIC']),
        colorscale='Viridis',
        colorbar=dict(title='log(eBIC)')
    ))
    # Set labels and title
    fig.update_layout(
        title = solver,
        xaxis_title='log(lambda)',
        yaxis_title='gamma',
        width=900,  # Set the width of the plot (in pixels)
        height=600  # Set the height of the plot (in pixels)
    )

    gamma_list = []
    lambda_list = []

    # Use the rows of the *current* solver (df), not a frame left over from another cell.
    for l1 in df['lambda'].unique():
        # Rows belonging to the current lambda only
        filtered_df = df[df['lambda'] == l1]

        # gamma at which eBIC is smallest for this lambda
        min_gamma = filtered_df.loc[filtered_df['eBIC'].idxmin(), 'gamma']
        lambda_list.append(l1)
        gamma_list.append(min_gamma)

    # Connect the per-lambda minimizers with a line on top of the heatmap
    fig.add_trace(go.Scatter(
        x=np.log(lambda_list),
        y=gamma_list,
        mode='lines',
        marker=dict(
            size=20,
            color='red'
        ),
        showlegend=False
    ))
    # Add annotation (anchored at the 6th lambda and the first collected gamma)
    fig.add_annotation(
        x=np.log(lambda_list[5]),
        y=gamma_list[0],
        text="Minimum eBIC",
        font=dict(size=18, color ="white"),
        ax=-10,
        ay=-10
    )
    
    fig.show()
    # Export the figure as PNG and PDF
    fig.write_image("plots/{0}_ebic_heatmap.png".format(solver))
    fig.write_image("plots/{0}_ebic_heatmap.pdf".format(solver))

In [None]:
# Per-solver heatmap of the sparsity (SP) over lambda (x) and gamma (y); shown and exported.
for solver in result_df["solver"].unique():
    df = result_df[result_df["solver"] == solver]

    sp_heatmap = go.Heatmap(
        x=df['lambda'],
        y=df['gamma'],
        z=df['SP'],
        colorscale='Viridis',
        colorbar=dict(title='SP'),
    )
    fig = go.Figure(data=sp_heatmap)

    # Title, axis labels and figure size (pixels)
    fig.update_layout(
        title=solver,
        xaxis_title='lambda',
        yaxis_title='gamma',
        width=900,
        height=600,
    )

    fig.show()
    # Export as PNG and PDF
    for extension in ("png", "pdf"):
        fig.write_image("plots/{0}_SP_heatmap.".format(solver) + extension)

In [None]:
# One line plot per key of the stored BIC dict: eBIC (log-scaled y-axis) against lambda for the SGL problem.
for key in P_SGL.modelselect_stats['BIC'].keys():
    ebic_values = P_SGL.modelselect_stats['BIC'][key]
    lambda_values = P_SGL.modelselect_stats['LAMBDA']

    data = [go.Scatter(x=lambda_values.flatten(), y=ebic_values.flatten(), mode='lines')]

    layout = go.Layout(
        title='Lambda path vs. eBIC ({0})'.format(key),
        xaxis_title='LAMBDA',
        yaxis_title='log(eBIC)',
        yaxis_type='log',
    )
    fig = go.Figure(data=data, layout=layout)
    fig.show()

In [None]:
# Sparsity of the SGL solutions along the lambda path.
sparsity_values = P_SGL.modelselect_stats['SP']
lambda_values = P_SGL.modelselect_stats['LAMBDA']

data = [go.Scatter(x=lambda_values.flatten(), y=sparsity_values.flatten(), mode='lines')]

layout = go.Layout(
    title='Lambda path vs. Sparsity',
    xaxis_title='LAMBDA',
    yaxis_title='sparsity',
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# eBIC (BIC entry stored under key 0.1) of the SGL+low-rank problem along the lambda path,
# taking column 0 of the stored grids.
low_stats = P_SGL_low.modelselect_stats
ebic_values = low_stats['BIC'][0.1][:, 0]
lambda_values = low_stats['LAMBDA'][:, 0]

data = [go.Scatter(x=lambda_values.flatten(), y=ebic_values.flatten(), mode='lines')]

layout = go.Layout(
    title='Lambda path vs. eBIC',
    xaxis_title='LAMBDA',
    yaxis_title='log(eBIC)',
    yaxis_type='log',
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Display the mu grid used for model selection.
mu1_range

In [None]:
# Sparsity of the SGL+low-rank solutions along the lambda path, taking column 0 of the stored grids.
low_stats = P_SGL_low.modelselect_stats
sparsity_values = low_stats['SP'][:, 0]
lambda_values = low_stats['LAMBDA'][:, 0]

data = [go.Scatter(x=lambda_values.flatten(), y=sparsity_values.flatten(), mode='lines')]

layout = go.Layout(
    title='Lambda path vs. Sparsity',
    xaxis_title='LAMBDA',
    yaxis_title='sparsity',
)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Display the full sparsity grid stored for the SGL+low-rank problem.
P_SGL_low.modelselect_stats['SP']

In [None]:
# Numerical rank of the low-rank component of the selected SGL+low-rank solution.
L = P_SGL_low.solution.lowrank_
r = np.linalg.matrix_rank(L)
r

In [None]:
# Display the stored 'RANK' statistics of the SGL+low-rank problem as a flat array.
P_SGL_low.modelselect_stats['RANK'].flatten()

In [None]:
# Display all model-selection statistics of the SGL+low-rank problem.
P_SGL_low.modelselect_stats
# P_SGL.modelselect_stats['LAMBDA']
# P_SGL.modelselect_stats['MU']
# P_SGL.modelselect_stats['SP']

# Analysis

In [None]:
# Figure size and label font size shared by the three heatmaps of this cell.
width, height = 1500, 1500
label_size = "8pt"

# For visualization the inverse covariance matrices are negated (multiplied by -1).
sgl = pd.DataFrame(P_SGL.solution.precision_, columns=asv_names, index=asv_names) * -1
adapt = pd.DataFrame(P_SGL_adapt.solution.precision_, columns=vis_df.columns, index=vis_df.columns) * -1
low = pd.DataFrame(P_SGL_low.solution.precision_, columns=asv_names, index=asv_names) * -1

# Label dictionaries (forward and reversed) for each matrix.
lables_sgl, re_labels_sgl = create_label_dict(sgl)
lables_adapt, re_labels_adapt = create_label_dict(adapt)
lables_low, re_labels_low = create_label_dict(low)

p_sgl = _make_heatmap(
    data=sgl,
    labels_dict=lables_sgl,
    labels_dict_reversed=re_labels_sgl,
    title="SGL estimated (negative) inverse covariance",
    width=width,
    height=height,
    label_size=label_size,
)

p_adapt = _make_heatmap(
    data=adapt,
    labels_dict=lables_adapt,
    labels_dict_reversed=re_labels_adapt,
    title="Adaptive estimated (negative) inverse covariance",
    width=width,
    height=height,
    label_size=label_size,
)

p_low = _make_heatmap(
    data=low,
    labels_dict=lables_low,
    labels_dict_reversed=re_labels_low,
    title="SGL+low-rank estimated (negative) inverse covariance",
    width=width,
    height=height,
    label_size=label_size,
)

# Render the three heatmaps in order: SGL, adaptive, SGL+low-rank.
show(p_sgl)
show(p_adapt)
show(p_low)

In [None]:
# g_sgl = sns.clustermap(sgl, method='average', cmap='RdBu', center=0, dendrogram_ratio=0.2, robust=True, cbar_pos=None)

# # get the order of the rows and columns
# row_order_sgl = g_sgl.dendrogram_row.reordered_ind
# col_order_sgl = g_sgl.dendrogram_col.reordered_ind

# Reorder the SGL matrix with the positional row/column orders row_order_vis_S / col_order_vis_S
# (defined outside this cell).
sgl_clust = sgl.iloc[row_order_vis_S, col_order_vis_S]

lables_sgl_clust, re_labels_sgl_clust = create_label_dict(sgl_clust)

p_sgl_clust = _make_heatmap(data=sgl_clust, labels_dict=lables_sgl_clust, labels_dict_reversed=re_labels_sgl_clust, # NOTE(review): data is passed unscaled here
                       title="Clustered SGL estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)

show(p_sgl_clust)

In [None]:
# Map each entry of row_order_vis_S to the corresponding item of `dictionary`.
# NOTE(review): `dictionary` is not defined in the visible part of the notebook — confirm where it comes from.
labels_dict = dict(zip(row_order_vis_S, dictionary))

In [None]:
# ASV-by-ASV block of the adaptive solution (everything except the last n_cov rows/columns),
# reordered with the same positional orders used for the clustered heatmaps.
asv_part_adapt = adapt.iloc[:-n_cov, :-n_cov].iloc[row_order_vis_S, col_order_vis_S]

# (bare expression in the middle of a cell: evaluated but not displayed)
asv_part_adapt.columns

# ASV-by-covariate block, rows reordered like the ASV block.
covs_adapt = adapt.iloc[:-n_cov, -n_cov:].iloc[row_order_vis_S, :]
covs_adapt

# Covariate-by-covariate block (left in its original order).
only_cov = adapt.iloc[-n_cov:, -n_cov:]

a = asv_part_adapt.values
b = covs_adapt.values
c = only_cov.values

# Reassemble the full matrix from its blocks: [[a, b], [b.T, c]]
res = np.block([
    [a, b],
    [b.T, c]])

# NOTE(review): both row and column labels are taken from asv_part_adapt.columns; this is only
# correct if row_order_vis_S and col_order_vis_S describe the same ordering — confirm.
adapt_clust = pd.DataFrame(res, index=list(asv_part_adapt.columns) + list(only_cov.columns), columns=list(asv_part_adapt.columns) + list(only_cov.columns))
adapt_clust
# lables_adapt_clust, re_labels_adapt_clust = create_label_dict(adapt_clust)

# p_adapt_clust = _make_heatmap(data=adapt_clust, labels_dict=lables_adapt_clust, labels_dict_reversed=re_labels_adapt_clust, # multiply by 3 for making edge visible on the heatmao
#                        title="Clustered Adaptive SGL estimated (negative) inverse covariance", width=width, height=height,
#                        label_size=label_size)

# show(p_adapt_clust)

In [None]:
# g_lsp = sns.clustermap(low, method='average', cmap='RdBu', center=0, dendrogram_ratio=0.2, robust=True, cbar_pos=None)

# # get the order of the rows and columns
# row_order_lsp = g_lsp.dendrogram_row.reordered_ind
# col_order_lsp = g_lsp.dendrogram_col.reordered_ind

# Sparse part of the SGL+low-rank solution, reordered with the shared positional row/column orders.
lsp_clust = low.iloc[row_order_vis_S, col_order_vis_S]

lables_lsp_clust, re_labels_lsp_clust = create_label_dict(lsp_clust)

# The title names the SGL+low-rank model so this plot is distinguishable from the plain SGL one.
p_lsp_clust = _make_heatmap(data=lsp_clust, labels_dict=lables_lsp_clust, labels_dict_reversed=re_labels_lsp_clust,
                       title="Clustered SGL+low-rank estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)

show(p_lsp_clust)

In [None]:
# Negated low-rank component of the SGL+low-rank solution, labelled with the ASV names.
lrp = -1 * pd.DataFrame(P_SGL_low.solution.lowrank_, columns=asv_names, index=asv_names)

# g_lrp = sns.clustermap(lrp, method='average', cmap='RdBu', center=0, dendrogram_ratio=0.2, robust=True, cbar_pos=None)

# # get the order of the rows and columns
# row_order_lrp = g_lrp.dendrogram_row.reordered_ind
# col_order_lrp = g_lrp.dendrogram_col.reordered_ind

# Reorder with the shared positional orders (row_order_vis_S / col_order_vis_S) instead of re-clustering.
lrp_clust = lrp.iloc[row_order_vis_S, col_order_vis_S]

lables_lrp_clust, re_labels_lrp_clust = create_label_dict(lrp_clust)

p_sgl_lrp = _make_heatmap(data=lrp_clust, labels_dict=lables_lrp_clust, labels_dict_reversed=re_labels_lrp_clust, # NOTE(review): data is passed unscaled here
                       title="Clustered Low Rank", width=width, height=height,
                       label_size=label_size)

show(p_sgl_lrp)

In [None]:
### for 0.9
# Names of the last n_cov columns of the adaptive matrix (the covariates).
meta_cols = list(adapt.iloc[:, -n_cov:].columns)
# Hand-picked node subsets for the network plots.
asv18_edges = ["ASV_18", "ASV_25", "ASV_27", "ASV_32", "ASV_34"]
asv25_32 =["ASV_25", "ASV_32"]
asv18_27 =["ASV_18", "ASV_27"]
asv18_edges_adapt = ['ph','average-soil-temperature', 'average-soil-relative-humidity'] + asv18_27
# asv18_edges_adapt = ['average-soil-temperature'] + asv18_edges

# Square sub-matrices restricted to the selected nodes
# (columns via intersection with the list, rows via .loc in list order).
sgl_edges = sgl[sgl.columns.intersection(asv18_edges)].loc[asv18_edges]
adapt_edges = adapt[adapt.columns.intersection(asv18_edges_adapt)].loc[asv18_edges_adapt]
low_edges = low[low.columns.intersection(asv18_edges)].loc[asv18_edges]


# Graphs built from the sub-matrices with threshold 0.0.
G_SGL = create_graph(sgl_edges, threshold=0.0)
G_adapt = create_graph(adapt_edges, threshold=0.0)
G_low = create_graph(low_edges, threshold=0.0)


# G_SGL = create_graph(sgl, threshold=0.01)
# G_adapt = create_graph(adapt, threshold=0.01)
# G_low = create_graph(low, threshold=0.01)


# NOTE: this rebinds the global width/height used by the heatmap cells.
width, height= 1000, 1000

network_sgl = plot_network(G_SGL, title="SGL", height=height, width=width, amplify_x=10)
network_adapt = plot_network(G_adapt, title="Adaptive",  height=height, width=width, amplify_x=10)
network_low = plot_network(G_low, title="Low-rank",  height=height, width=width, amplify_x=10)

show(network_sgl)
show(network_adapt)
show(network_low)

In [None]:
### for 0.8
# meta_cols = list(adapt.iloc[:, -n_cov:].columns)
# asv18_edges = ["ASV_3", "ASV_8", "ASV_9", "ASV_13"]
# # asv18_51 =["ASV_6", "ASV_17", "ASV_20"]
# # asv18_edges_adapt = ['ph','average-soil-temperature', 'average-soil-relative-humidity'] + asv18_edges
# asv18_edges_adapt = ['average-soil-temperature'] + asv18_edges

# sgl_edges = sgl[sgl.columns.intersection(asv18_edges)].loc[asv18_edges]
# adapt_edges = adapt[adapt.columns.intersection(asv18_edges_adapt)].loc[asv18_edges]
# low_edges = low[low.columns.intersection(asv18_edges)].loc[asv18_edges]


# G_SGL = create_graph(sgl_edges, threshold=0.0)
# G_adapt = create_graph(adapt_edges, threshold=0.0)
# G_low = create_graph(low_edges, threshold=0.0)


# # G_SGL = create_graph(sgl, threshold=0.01)
# # G_adapt = create_graph(adapt, threshold=0.01)
# # G_low = create_graph(low, threshold=0.01)


# width, height= 1000, 1000

# network_sgl = plot_network(G_SGL, title="SGL", height=height, width=width, amplify_x=50)
# network_adapt = plot_network(G_adapt, title="Adaptive",  height=height, width=width, amplify_x=10)
# network_low = plot_network(G_low, title="Low-rank",  height=height, width=width, amplify_x=50)

# show(network_sgl)
# show(network_adapt)
# show(network_low)

In [None]:
# Scatter plots of two ASVs against soil pH.
# The pH series is aligned with vis_df's own index. The global `df` must not be used here:
# the eBIC/sparsity heatmap loops rebind it to slices of result_df, whose index is unrelated
# to the index of vis_df.
# p_18_51 = scater_plot(vis_df["ASV_27"], vis_df["ASV_18"])
p_18_temp = scater_plot(vis_df["ASV_27"], meta['ph'].loc[vis_df.index])
p_51_temp = scater_plot(vis_df["ASV_18"], meta['ph'].loc[vis_df.index])

# show(p_18_51)
show(p_18_temp)
show(p_51_temp)

In [None]:
# Compare the shapes of the ASV table and the low-rank matrix.
# NOTE(review): L_1 is first assigned in the following cell, so this cell fails on a fresh
# top-to-bottom run — consider moving it below that cell.
asv.shape, L_1.shape

In [None]:
# ASV-by-covariate block of the (negated) adaptive precision matrix.
inv_cov = adapt.iloc[:-n_cov, -n_cov:]

# Product of that block with its own transpose (ASV-by-ASV).
L_adapt = inv_cov @ inv_cov.T
# (bare expression in the middle of a cell: evaluated but not displayed)
L_adapt.shape

# L_1: low-rank component estimated by the SGL+low-rank problem.
# L_2: the matrix derived from the adaptive solution above.
L_1 = pd.DataFrame(P_SGL_low.solution.lowrank_, columns=asv_names, index=asv_names)
L_2 = pd.DataFrame(L_adapt, columns=asv_names, index=asv_names)

r1 = np.linalg.matrix_rank(L_1)
r2 = np.linalg.matrix_rank(L_2)

print("L1-rank: {0}".format(r1))
print("L2-rank: {0}".format(r2))

# Project the ASV data using L_1 (PCA called with inverse=True).
proj_1, loadings_1, eigv_1 = PCA(asv, L_1, inverse=True)

# Share of each returned eigenvalue in their sum, largest first.
eigv_sum_1 = np.sum(eigv_1)
var_exp_1 = [(value / eigv_sum_1) for value in sorted(eigv_1, reverse=True)]

# Same for L_2.
proj_2, loadings_2, eigv_2 = PCA(asv, L_2, inverse=True)

eigv_sum_2 = np.sum(eigv_2)
var_exp_2 = [(value / eigv_sum_2) for value in sorted(eigv_2, reverse=True)]

# Plot produced by project_covariates for L_1 and the soil-temperature covariate.
pca_plot = project_covariates(asv, metadata=meta, L=L_1, y='average-soil-temperature')
# pca_plot = project_covariates(asv, metadata=meta, L=L_1, y='ph', PC=0)
# pca_plot = project_covariates(asv, metadata=meta, L=L_1, y='PC2', PC=0)
# pca_plot = project_covariates(asv, metadata=meta, L=L_1, y='ph')
show(pca_plot)

### Project PCs on ASVs and covariates

In [None]:
# First column of the L_1 projection as a Series indexed like the ASV table.
pc_1 = pd.Series(proj_1[:, 0], index=asv.index, name='PC1')

# Scatter plots of two ASVs against PC1 (this rebinds p_18_temp / p_51_temp).
p_18_temp = scater_plot(vis_df["ASV_27"], pc_1)
p_51_temp = scater_plot(vis_df["ASV_18"], pc_1)

show(p_18_temp)
show(p_51_temp)

In [None]:
# PC1 against average soil temperature, with the covariate aligned to PC1's index.
p_temp = scater_plot(pc_1, meta["average-soil-temperature"].loc[pc_1.index])

show(p_temp)

In [None]:
width = 1500
height = 1500
label_size = "8pt"

# Work on a copy so the cell can be re-run without accumulating 'l1' rows/columns in `adapt`.
adapt_theta = adapt.copy()

# ASV-by-covariate block (all rows except the last n_cov, last n_cov columns).
asv_cov = adapt_theta.iloc[:-n_cov, -n_cov:]

# Row-wise l1-norm (sum of absolute values); ord=1 is required because
# np.linalg.norm defaults to the Euclidean norm for vectors.
l1_norm = np.linalg.norm(asv_cov.values, ord=1, axis=1)

# Append the norm as an extra column; covariate rows get 0.
adapt_theta['l1'] = np.append(l1_norm, np.zeros(n_cov))

adapt_theta = adapt_theta.T

# After the transpose there is one more row (the 'l1' row), hence n_cov + 1 zeros.
adapt_theta['l1'] = np.append(l1_norm, np.zeros(n_cov+1))
# Stable sort: rows tied at 0 keep their original relative order, so ASVs stay ahead of the
# covariates and the 'l1' entry, and rows and columns end up in the same order.
adapt_theta = adapt_theta.sort_values(by=['l1'], ascending=False, kind='stable')
adapt_theta = adapt_theta.T
adapt_theta = adapt_theta.sort_values(by=['l1'], ascending=False, kind='stable')

lables_l1, re_labels_l1 = create_label_dict(adapt_theta)

p_l1 = _make_heatmap(data=adapt_theta, labels_dict=lables_l1, labels_dict_reversed=re_labels_l1,
                       title="Esatimated inverse covariance sorted by l1-norm of the covariates", width=width, height=height,
                       label_size=label_size)
show(p_l1)

In [None]:
# Row labels of the sorted matrix without its last n_cov + 1 entries.
# NOTE(review): assumes those trailing entries are the covariates plus the 'l1' row — confirm.
sorted_order = adapt_theta.index[:-n_cov-1].values
sorted_order

In [None]:
# Figure size and label font size for the sorted heatmaps.
width, height = 1500, 1500
label_size = "8pt"

# For visualization the inverse covariance matrices are negated (multiplied by -1).
sgl = pd.DataFrame(P_SGL.solution.precision_, columns=asv_names, index=asv_names) * -1
adapt = pd.DataFrame(P_SGL_adapt.solution.precision_, columns=vis_df.columns, index=vis_df.columns) * -1
low = pd.DataFrame(P_SGL_low.solution.precision_, columns=asv_names, index=asv_names) * -1

# Put rows and columns of the SGL and SGL+low-rank matrices into the order given by sorted_order.
sorted_sgl = sgl.reindex(index=sorted_order, columns=sorted_order)
sorted_low = low.reindex(index=sorted_order, columns=sorted_order)

sorted_lables_sgl, sorted_re_labels_sgl = create_label_dict(sorted_sgl)
sorted_lables_low, sorted_re_labels_low = create_label_dict(sorted_low)

sorted_p_sgl = _make_heatmap(
    data=sorted_sgl,
    labels_dict=sorted_lables_sgl,
    labels_dict_reversed=sorted_re_labels_sgl,
    title="SGL estimated (negative) inverse covariance sorted by l1",
    width=width,
    height=height,
    label_size=label_size,
)

sorted_p_low = _make_heatmap(
    data=sorted_low,
    labels_dict=sorted_lables_low,
    labels_dict_reversed=sorted_re_labels_low,
    title="SGL+low-rank estimated (negative) inverse covariance sorted by l1",
    width=width,
    height=height,
    label_size=label_size,
)

show(sorted_p_sgl)
show(sorted_p_low)

In [None]:
# Loadings of the low-rank PCA as a frame indexed by ASV.
pc_components = pd.DataFrame(loadings_1, index=low.index)
pc_components = pc_components.iloc[::-1]
# Name the components PC1..PCk from the actual number of loading columns
# instead of assuming exactly six of them.
pc_components.columns = ["PC{0}".format(k + 1) for k in range(pc_components.shape[1])]

# low-rank solution: r1 components (r1 is the numerical rank of L_1)
identity = pd.DataFrame(np.eye(r1, r1), index=pc_components.columns, columns = pc_components.columns)
# PCs are linearly independent by the definition
pc_columns = pd.concat([pc_components, identity], axis=0)

# inverse cov matrix extended by PCs
asv_pc = pd.concat([low, pc_components], axis=1)
asv_pc = pd.concat([asv_pc.T, pc_columns], axis=1)

# ASV-by-PC block
asv_low = asv_pc.iloc[:-r1, -r1:]
# l1-norm of partial correlation between ASVs and PCs
# (ord=1 is required: np.linalg.norm defaults to the Euclidean norm for vectors)
l1_norm_pc = np.linalg.norm(asv_low.values, ord=1, axis=1)

# Append the norm as an extra column, transpose, and append it again so that the
# matrix carries an 'l1' row and an 'l1' column; PC entries (and the 'l1' entry) get 0.
asv_pc['l1'] = np.append(l1_norm_pc, np.zeros(r1))
asv_pc = asv_pc.T
asv_pc['l1'] = np.append(l1_norm_pc, np.zeros(r1 + 1))

#sorting by the order of adaptive l1-norm sorted solution
n_asvs = len(vis_S)
sorted_asv = asv_pc.iloc[:n_asvs, :].reindex(index=adapt_theta.iloc[:n_asvs, :].index)
# the trailing r1 + 1 columns are the PCs plus the 'l1' column
sorted_asv_pc = sorted_asv.T.join(asv_pc.iloc[:, -(r1 + 1):])
sorted_asv = sorted_asv_pc.iloc[:n_asvs, :].reindex(index=adapt_theta.iloc[:n_asvs, :].index)
sorted_l1_low = pd.concat([sorted_asv, sorted_asv_pc.iloc[n_asvs:, :]], axis=0)

In [None]:
# Heatmap of the PC-extended sparse + low-rank matrix in the adaptive l1 ordering.
lables_l1_low, re_labels_l1_low = create_label_dict(sorted_l1_low)

p_l1_low = _make_heatmap(data=sorted_l1_low, labels_dict=lables_l1_low, labels_dict_reversed=re_labels_l1_low,
                       title="Esatimated inverse covariance (sparse + low-rank) sorted by l1-norm of the PCs", width=width, height=height,
                       label_size=label_size)
show(p_l1_low)

### Analysis 2

In [None]:
# Figure size and label font size shared by the three heatmaps of this cell.
width, height = 1500, 1500
label_size = "8pt"

# For visualization the inverse covariance matrices are negated (multiplied by -1).
sgl = pd.DataFrame(P_SGL.solution.precision_, columns=asv_names, index=asv_names) * -1
adapt = pd.DataFrame(P_SGL_adapt.solution.precision_, columns=vis_df.columns, index=vis_df.columns) * -1
low = pd.DataFrame(P_SGL_low.solution.precision_, columns=asv_names, index=asv_names) * -1

# Label dictionaries (forward and reversed) for each matrix.
lables_sgl, re_labels_sgl = create_label_dict(sgl)
lables_adapt, re_labels_adapt = create_label_dict(adapt)
lables_low, re_labels_low = create_label_dict(low)

p_sgl = _make_heatmap(
    data=sgl,
    labels_dict=lables_sgl,
    labels_dict_reversed=re_labels_sgl,
    title="SGL estimated (negative) inverse covariance",
    width=width,
    height=height,
    label_size=label_size,
)

p_adapt = _make_heatmap(
    data=adapt,
    labels_dict=lables_adapt,
    labels_dict_reversed=re_labels_adapt,
    title="Adaptive estimated (negative) inverse covariance",
    width=width,
    height=height,
    label_size=label_size,
)

p_low = _make_heatmap(
    data=low,
    labels_dict=lables_low,
    labels_dict_reversed=re_labels_low,
    title="SGL+low-rank estimated (negative) inverse covariance",
    width=width,
    height=height,
    label_size=label_size,
)

# Render the three heatmaps in order: SGL, adaptive, SGL+low-rank.
show(p_sgl)
show(p_adapt)
show(p_low)

In [None]:
# NOTE(review): meta_cols is used below but its definition is commented out here, so this cell
# depends on the earlier network cell having been run.
# meta_cols = list(adapt.iloc[:, -n_cov:].columns)

# Hand-picked node subsets for the second set of network plots.
asv69_edges = ["ASV_18", "ASV_51", "ASV_46", "ASV_13", "ASV_7", "ASV_5"]
asv69_edges_adapt = meta_cols + ["ASV_18", "ASV_51", "ASV_5", "ASV_46"]
asv69_edges_low = asv69_edges

# Square sub-matrices restricted to the selected nodes
# (columns via intersection with the list, rows via .loc in list order).
sgl_edges = sgl[sgl.columns.intersection(asv69_edges)].loc[asv69_edges]
adapt_edges = adapt[adapt.columns.intersection(asv69_edges_adapt)].loc[asv69_edges_adapt]
low_edges = low[low.columns.intersection(asv69_edges_low)].loc[asv69_edges_low]


# Graphs built from the sub-matrices with threshold 0.01.
G_SGL = create_graph(sgl_edges, threshold=0.01)
G_adapt = create_graph(adapt_edges, threshold=0.01)
G_low = create_graph(low_edges, threshold=0.01)


# NOTE: this rebinds the global width/height used by the heatmap cells.
width, height= 1000, 1000

network_sgl = plot_network(G_SGL, title="SGL", height=height, width=width)
network_adapt = plot_network(G_adapt, title="Adaptive",  height=height, width=width)
network_low = plot_network(G_low, title="Low-rank",  height=height, width=width)

show(network_sgl)
show(network_adapt)
show(network_low)