In [None]:
import pandas as pd
import numpy as np
import re
import bisect
import seaborn as sns
import scipy
import scipy.cluster.hierarchy as sch
import plotly.express as px
import matplotlib.pyplot as plt
import networkx as nx

from itertools import chain
from math import pi
from sklearn import preprocessing
from GGLasso.gglasso.problem import glasso_problem
from utils import transform_features, scale_array_by_diagonal
from utils import PCA

from scipy import stats
from scipy.spatial import distance

from networkx.utils import cuthill_mckee_ordering

from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, HoverTool, LabelSet, PointDrawTool
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import RdBu, Blues8
from bokeh.models import HoverTool, Panel, Tabs, ColorBar, LinearColorMapper
from bokeh.layouts import row

In [None]:
def PCA(X, L, inverse=True):
    """
    Project the rows of ``X`` onto the eigenvectors of the symmetric matrix ``L``.

    Note: this definition shadows the ``PCA`` imported from ``utils`` at the
    top of the notebook.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to project; ``X.values`` must have as many columns as ``L`` has rows.
    L : array-like, shape (p, p)
        Symmetric matrix (it is passed to ``np.linalg.eigh``).
    inverse : bool, default True
        If True each eigenvector is scaled by ``1 / sqrt(eigenvalue)``,
        otherwise by ``sqrt(eigenvalue)``.

    Returns
    -------
    zu : numpy.ndarray, shape (n, k)
        Projection ``X.values @ loadings``.
    loadings : numpy.ndarray, shape (p, k)
        Scaled eigenvectors belonging to the ``k`` eigenvalues above 1e-9.
    eigenvalues : numpy.ndarray, shape (k,)
        The retained eigenvalues in descending order, rounded to 3 decimals.
        Always 1-D, so it can be iterated even when ``k == 1``.
    """
    sig, V = np.linalg.eigh(L)

    # eigh returns eigenvalues in ascending order; flip to descending
    sig = sig[::-1]
    V = V[:, ::-1]

    # 1-D positions of the numerically non-zero eigenvalues.
    # (np.argwhere would give a (k, 1) index; np.diag on the resulting (k, 1)
    # array returns a length-1 diagonal, so every column ended up scaled by the
    # first eigenvalue only.)
    ind = np.flatnonzero(sig > 1e-9)

    # scale each eigenvector (column) by its own eigenvalue
    if inverse:
        scale = np.sqrt(1 / sig[ind])
    else:
        scale = np.sqrt(sig[ind])
    loadings = V[:, ind] * scale

    # compute the projection
    zu = X.values @ loadings

    return zu, loadings, np.round(sig[ind], 3)

In [None]:
def _make_heatmap(data: pd.DataFrame, title: str = None, labels_dict: dict = None, labels_dict_reversed: dict = None,
                  width: int = 1500, height: int = 1500, label_size: str = "5pt", not_low_rank: bool = True):
    """
    Draw a matrix as a bokeh heatmap made of unit squares.

    Parameters
    ----------
    data : pandas.DataFrame
        Matrix to plot; cells are coloured by ``_get_colors`` on a [-1, 1] scale.
    title : str, optional
        Figure title.
    labels_dict : dict, optional
        Maps tick position -> label for the x axis. Its length sets the axis
        range; when omitted the number of columns of ``data`` is used.
    labels_dict_reversed : dict, optional
        Maps tick position -> label for the y axis (reverse order of ``labels_dict``).
    width, height : int
        Figure size in pixels.
    label_size : str
        Font size of the tick labels, e.g. ``"8pt"``.
    not_low_rank : bool
        When True the label dictionaries are applied to the data and the axes.

    Returns
    -------
    bokeh figure
    """
    # Fall back to the matrix width so the function also works without labels
    nlabels = len(labels_dict) if labels_dict is not None else data.shape[1]
    # Reverse the row order: quads are stacked bottom-up (see _get_bounds),
    # so the first row of `data` ends up at the top of the plot.
    df = data.iloc[::-1]
    df = pd.DataFrame(df.stack(), columns=['covariance']).reset_index()
    df.columns = ["taxa_y", "taxa_x", "covariance"]
    if not_low_rank and labels_dict is not None:
        df = df.replace({"taxa_x": labels_dict, "taxa_y": labels_dict})

    color_list, colors = _get_colors(df=df)
    mapper = LinearColorMapper(palette=colors, low=-1, high=1)
    color_bar = ColorBar(color_mapper=mapper, location=(0, 0))

    bottom, top, left, right = _get_bounds(nlabels=nlabels)

    source = ColumnDataSource(dict(top=top, bottom=bottom, left=left, right=right, color_list=color_list,
                                   taxa_x=df['taxa_x'], taxa_y=df['taxa_y'], covariance=df['covariance']))

    bokeh_tools = ["save, zoom_in, zoom_out, wheel_zoom, box_zoom, crosshair, reset, hover"]

    p = figure(plot_width=width, plot_height=height, x_range=(0, nlabels), y_range=(0, nlabels),
               title=title, title_location='above', x_axis_location="below",
               tools=bokeh_tools, toolbar_location='left')

    # one quad per matrix cell
    p.quad(top="top", bottom="bottom", left="left", right="right", line_color='white', color="color_list",
           source=source)
    p.xaxis.major_label_orientation = pi / 4
    p.yaxis.major_label_orientation = "horizontal"
    p.title.text_font_size = "24pt"
    p.add_layout(color_bar, 'right')
    p.toolbar.autohide = True

    p.xaxis.ticker = list(range(0, nlabels))
    p.yaxis.ticker = list(range(0, nlabels))
    if not_low_rank:
        # replace the integer ticks with the supplied names
        if labels_dict is not None:
            p.xaxis.major_label_overrides = labels_dict
        if labels_dict_reversed is not None:
            p.yaxis.major_label_overrides = labels_dict_reversed
    p.xaxis.major_label_text_font_size = label_size
    p.yaxis.major_label_text_font_size = label_size

    # configure the hover tool that was requested through `bokeh_tools`
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [
        ("taxa_x", "@taxa_x"),
        ("taxa_y", "@taxa_y"),
        ("covariance", "@covariance"),
    ]

    return p

In [None]:
def plot_network(G, title, width, height, node_size=None):
    """
    Render a networkx graph as an interactive bokeh figure.

    Side effect: a ``color`` attribute (and, with degree scaling, a
    ``node_size`` attribute) is written onto the nodes of ``G``.

    Parameters
    ----------
    G : networkx graph
        Node names must be strings; every edge needs a ``covariance`` attribute.
    title : str
        Figure title.
    width, height : int
        Figure size in pixels.
    node_size : optional
        If None every node is drawn with size 40. Any other value switches to
        degree scaling (15 * degree); the value itself is not used.

    Returns
    -------
    bokeh figure
    """
    # Tooltip shown when hovering over a node
    HOVER_TOOLTIPS = [("Character", "@index")]
    tools = ["save, zoom_in, zoom_out, wheel_zoom, box_zoom, crosshair, reset, hover, pan"]

    # Create a plot — set dimensions, toolbar, and title
    plot = figure(tooltips=HOVER_TOOLTIPS, plot_width=width, plot_height=height,
                  tools=tools, active_scroll='wheel_zoom',
                  x_range=Range1d(-10.1, 10.1),
                  y_range=Range1d(-10.1, 10.1), title=title)

    # light blue for nodes whose name contains "ASV", sand for all other nodes
    color_map = ["#88CCEE" if "ASV" in j else "#DDCC77" for j in G.nodes()]
    nx.set_node_attributes(G, {j: {'color': color_map[i]} for i, j in enumerate(G.nodes())})

    if node_size is not None:
        n_degrees = {k: 15 * v for k, v in G.degree()}
        nx.set_node_attributes(G, n_degrees, 'node_size')
        node_size = 'node_size'
    else:
        node_size = 40

    network_graph = from_networkx(G, nx.spring_layout, scale=10, center=(0, 0))

    # Set node size and color
    network_graph.node_renderer.glyph = Circle(size=node_size, fill_color="color")

    # Edge width encodes the magnitude (amplified by 2), edge colour the sign.
    # abs() is required: a negative covariance would otherwise yield a negative
    # line width for exactly the edges that are coloured as negative.
    edge_cov = [G.get_edge_data(a, b)['covariance'] for a, b in G.edges()]
    network_graph.edge_renderer.data_source.data["line_width"] = [abs(c) * 2 for c in edge_cov]
    network_graph.edge_renderer.data_source.data["line_color"] = ["#117733" if c >= 0 else "#CC6677" for c in edge_cov]
    network_graph.edge_renderer.glyph.line_width = {'field': 'line_width'}
    network_graph.edge_renderer.glyph.line_color = {'field': 'line_color'}

    # Add network graph to the plot
    plot.renderers.append(network_graph)

    # Text labels next to every node, positioned from the computed layout
    x, y = zip(*network_graph.layout_provider.graph_layout.values())
    node_labels = list(G.nodes)
    source = ColumnDataSource({'x': x, 'y': y, 'asv': [node_labels[i] for i in range(len(x))]})
    labels = LabelSet(x='x', y='y', text='asv', x_offset=30, y_offset=-15, source=source, render_mode='canvas', text_font_size='12pt')

    plot.renderers.append(labels)

    return plot

In [None]:
def cluster_corr(corr_array, inplace=False):
    """
    Reorder a correlation matrix so that hierarchically clustered variables
    sit next to each other.

    Rows are clustered with complete linkage on their pairwise distances and
    the tree is cut at half of the largest pairwise distance.

    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        NxN correlation matrix.
    inplace : bool, default False
        When False the input is copied before it is reordered.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        NxN matrix with rows and columns rearranged by cluster label.
    """
    dists = sch.distance.pdist(corr_array)
    tree = sch.linkage(dists, method='complete')
    cut_height = dists.max()/2
    cluster_labels = sch.fcluster(tree, cut_height, criterion='distance')
    order = np.argsort(cluster_labels)

    matrix = corr_array if inplace else corr_array.copy()

    if not isinstance(matrix, pd.DataFrame):
        return matrix[order, :][:, order]
    return matrix.iloc[order, :].T.iloc[order, :]


# fig = px.imshow(-1*cluster_corr(precision_SGL), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
# fig.update_layout(margin = dict(t=100,r=100,b=100,l=100), width = 1000, height = 1000,
#                  title='Clustered Estimated inverse covariance: ASVs', title_x=0.5)

In [None]:
def create_graph(corr_matrix: pd.DataFrame, threshold: float):
    """
    Build an undirected graph from the entries of a labelled matrix.

    Every off-diagonal, non-zero entry whose absolute value is at least
    ``threshold`` becomes an edge between its row label and column label,
    carrying the value as the ``covariance`` edge attribute.

    All entries are passed on (no triangle mask): the undirected graph keeps a
    single edge per node pair, and callers also pass slices whose rows and
    columns are ordered differently, where a positional triangle would drop
    real edges.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Matrix with node names as index and columns.
    threshold : float
        Minimum absolute value for an entry to become an edge.

    Returns
    -------
    networkx.Graph
    """
    # long format: one row per (row label, column label) pair
    df = pd.DataFrame(corr_matrix.stack(), columns=['covariance']).reset_index()
    df.columns = ["source", "target", "covariance"]

    # keep entries whose magnitude reaches the threshold
    df = df[abs(df['covariance']) >= threshold]
    # remove diagonal entries (self-loops)
    df = df[df['source'] != df['target']]
    # remove zero entries (only relevant for a non-positive threshold)
    df = df[df['covariance'] != 0]

    # build graph
    G = nx.from_pandas_edgelist(df, edge_attr="covariance")

    return G

In [None]:
def project_covariates(counts=pd.DataFrame(), metadata=pd.DataFrame(), L=np.ndarray, y=str):
    """
    Scatter the first principal component of ``L`` against one covariate,
    coloured by sequencing depth.

    The defaults in the signature are placeholders only; all four arguments
    must be supplied. The function also reads the notebook-level ``raw`` count
    table to compute the sequencing depth (column sums).

    Parameters
    ----------
    counts : pandas.DataFrame
        Data to project (rows with missing values are dropped first).
    metadata : pandas.DataFrame
        Covariates, joined to the projection on the index.
    L : array-like
        Symmetric matrix handed to ``PCA``.
    y : str
        Name of the metadata column to plot on the y axis.

    Returns
    -------
    bokeh layout
        The scatter plot next to a colour bar for the sequencing depth.
    """
    counts_clean = counts.dropna()
    proj, loadings, eigv = PCA(counts_clean, L, inverse=True)
    # PCA may hand back a 0-d array when a single component is retained
    eigv = np.atleast_1d(eigv)
    eigv_sum = np.sum(eigv)
    var_exp = [(value / eigv_sum) for value in sorted(eigv, reverse=True)]

    depth = pd.DataFrame(data=raw.sum(axis=0), columns=["sequencing depth"])
    metadata = depth.join(metadata)

    # one label per projected component, taken from the projection itself so the
    # number of names always matches the number of columns
    pc_columns = ['PC{0} ({1}%)'.format(i + 1, str(100 * var_exp[i])[:4]) for i in range(proj.shape[1])]
    # index must come from the rows that were actually projected
    df_proj = pd.DataFrame(proj, columns=pc_columns, index=counts_clean.index)
    df = df_proj.join(metadata)

    varName1 = 'PC1 ({0}%)'.format(str(100 * var_exp[0])[:4])
    varName2 = y
    df['x'] = df[varName1]
    df['y'] = df[varName2]

    source = ColumnDataSource(df)

    # Column names with spaces or punctuation must be wrapped in braces in a
    # bokeh tooltip ("@{name}"), otherwise only the leading word is looked up.
    p0 = figure(tools='save, zoom_in, zoom_out, wheel_zoom, box_zoom, reset', plot_width=800, plot_height=800,
                active_scroll="wheel_zoom",
                x_axis_label=varName1, y_axis_label=varName2,
                tooltips=[(varName1, "@{" + varName1 + "}"),
                          (varName2, "@{" + varName2 + "}")
                          ],
                title=varName1 + " vs " + varName2)

    exp_cmap = LinearColorMapper(palette=Blues8[::-1], low=min(df['sequencing depth'].values), high=max(df['sequencing depth'].values))
    p0.circle('x', 'y', source=source, size=15, line_color=None, fill_color={"field": "sequencing depth", "transform": exp_cmap}, fill_alpha=0.3)

    # separate, toolbar-less figure that only hosts the colour bar
    color_bar_plot = figure(title='sequencing depth', title_location="right",
                            height=500, width=150, toolbar_location=None, min_border=0,
                            outline_line_color=None)

    bar = ColorBar(color_mapper=exp_cmap, location=(1, 1))

    color_bar_plot.add_layout(bar, 'right')
    color_bar_plot.title.align = "center"
    color_bar_plot.title.text_font_size = '12pt'

    layout = row(p0, color_bar_plot)

    return layout

In [None]:
def add_labels(df, id_map=None):
    """
    Rename 32-character column names to short ``ASV_<n>`` labels, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed; it is modified in place.
    id_map : dict, optional
        Receives ``{short label: original name}`` for every renamed column.
        Defaults to the notebook-level ``id_dict``, which must then exist.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with renamed columns.
    """
    if id_map is None:
        id_map = id_dict

    renames = {}
    for col in df.columns:
        # length of ASVs identifier; non-string labels are left untouched
        if not isinstance(col, str) or len(col) != 32:
            continue
        if col in renames:
            continue
        asv_name = "ASV_{0}".format(len(renames) + 1)
        renames[col] = asv_name
        id_map[asv_name] = col

    # a single rename instead of one per column
    df.rename(columns=renames, inplace=True)
    return df

In [None]:
def _get_bounds(nlabels: int):
    bottom = list(chain.from_iterable([[ii] * nlabels for ii in range(nlabels)]))
    top = list(chain.from_iterable([[ii + 1] * nlabels for ii in range(nlabels)]))
    left = list(chain.from_iterable([list(range(nlabels)) for ii in range(nlabels)]))
    right = list(chain.from_iterable([list(range(1, nlabels + 1)) for ii in range(nlabels)]))

    return bottom, top, left, right

In [None]:
def _get_colors(df: pd.DataFrame(), n_colors: int = 9):
    """
    Pick a palette colour for every value in ``df.covariance``.

    The interval [-1, 1) is cut into equally wide bins, one per palette entry;
    each value gets the colour of the bin found by a left bisection, and values
    at or below -1 get the first colour.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``covariance`` column.
    n_colors : int
        Size of the ``RdBu`` palette to use.

    Returns
    -------
    color_list : list
        One colour per value of ``df.covariance``.
    colors : list
        The full palette.
    """
    palette = list(RdBu[n_colors])
    bin_edges = np.arange(-1, 1, 1 / (len(palette) / 2))
    # max(..., 1) keeps the index from wrapping to -1 for the lowest values
    color_list = [palette[max(bisect.bisect_left(bin_edges, value), 1) - 1]
                  for value in df.covariance.values]
    return color_list, palette

In [None]:
def create_label_dict(df):
    """
    Map tick positions to the column names of ``df``.

    Returns
    -------
    labels_dict : dict
        ``{position: column name}`` in column order.
    labels_dict_reversed : dict
        ``{position: column name}`` with the names in reverse order.
    """
    names = list(df.columns)
    forward = dict(enumerate(names))
    backward = dict(enumerate(reversed(names)))

    return forward, backward

In [None]:
def scater_plot(x, y, width=800, height=600, size=3):
    """
    Scatter plot of two named series as a bokeh figure.

    Parameters
    ----------
    x, y : objects with a ``name`` attribute (e.g. pandas.Series)
        Values for the two axes; the names become the axis labels.
    width, height : int
        Figure size in pixels.
    size : int
        Marker size.

    Returns
    -------
    bokeh figure
    """
    points = ColumnDataSource({'x': x, 'y': y})

    fig = figure(plot_width=width, plot_height=height,
                 tools=["save, zoom_in, zoom_out, wheel_zoom, box_zoom, crosshair, reset, hover"],
                 toolbar_location='left')
    fig.circle("x", "y", size=size, source=points, line_color=None)

    # label the axes with the names of the input series
    fig.xaxis.axis_label = x.name
    fig.yaxis.axis_label = y.name

    return fig

# Data preprocessing

In [None]:
# Count table, indexed by its first column.
# NOTE(review): later cells treat columns as samples (column sums are used as
# sequencing depth) and rows as features — confirm against the file.
raw = pd.read_csv('data/composition_feature-table.tsv', sep='\t', index_col = 0)

# pattern = r'(\w+\.\d+\.\d+)' 
# columns = [col for col in raw.columns if re.match(pattern, col)]

# for prefix in set(col[:-4] for col in columns):
#     pref1 = f"{prefix}.1.1"
#     pref2 = f"{prefix}.1.2"
#     pref3 = f"{prefix}.1.3"
#     pref_avg = f"{prefix}.agg"

#     if pref1 in raw.columns and pref2 in raw.columns and pref3 in raw.columns:
#         # create a boolean mask to identify rows where pref1 has value zero and pref2 or pref3 has non-zero
#         mask = (raw[pref1] == 0) & ((raw[pref2] != 0) | (raw[pref3] != 0))
        
#         # create a new column pref_avg with default value np.nan
#         raw[pref_avg] = 0

#         # assign the maximum value among pref2 and pref3 to pref_avg on rows where the mask is True
#         raw.loc[mask, pref_avg] = raw.loc[mask, [pref2, pref3]].max(axis=1)
        
#         # drop pref1, pref2, and pref3
#         raw.drop([pref1, pref2, pref3], axis=1, inplace=True)

#     elif pref2 in raw.columns and pref3 in raw.columns:
#         # create a boolean mask to identify rows where pref2 has value zero and pref3 has non-zero
#         mask = (raw[pref2] == 0) & (raw[pref3] != 0)

#         # create a new column pref_avg with default value np.nan
#         raw[pref_avg] = 0

#         # assign the value of pref3 to pref_avg on rows where the mask is True
#         raw.loc[mask, pref_avg] = raw.loc[mask, pref3]
        
#         # drop pref2 and pref3
#         raw.drop([pref2, pref3], axis=1, inplace=True)

# # rename YUN1005.1.1 to YUN1005.1
# raw.rename(columns={"YUN1005.1.1": "YUN1005.agg"}, inplace=True)

# Sanity check: is there any column that is zero in every row?
print("Some columns contain only zeros:", (raw == 0).all().any())


# percentage of zero entries in each row
percentage_zeros = (raw == 0).sum(axis=1) / len(raw.columns) * 100


# drop rows with more than 80% of zeros
raw_filt = raw[percentage_zeros <= 80]
print("Some columns contain only zeros:", (raw_filt == 0).all().any())

# Removing rows can leave columns that are zero everywhere; drop those as well
zero_cols = raw_filt.columns[(raw_filt == 0).all()]
print("{0} samples are dropped since there is no variance". format(zero_cols))
raw_filt = raw_filt.drop(zero_cols, axis=1)
# should print False now
print("Some columns contain only zeros:", (raw_filt == 0).all().any())
# print(percentage_zeros.to_string())

In [None]:
# Taxonomy table, indexed by its first column.
# NOTE(review): the path points into a home directory; consider a configurable data dir.
taxa = pd.read_csv("~/q2-gglasso/data/atacama-taxa/taxonomy.tsv", index_col=0, sep='\t')

# rank name -> prefix pattern of that rank in the taxon string
taxonomy_levels = {"domain": '^d__',
                   "phylum": '^p__',
                   "class": '^c__',
                   "order": '^o__',
                   "family": '^f__',
                   "genus": '^g__',
                   "species": '^s__'
                  }

# split taxonomic ranks in different columns
taxa_sep = taxa['Taxon'].str.split(';', expand=True)

#rename taxonomic ranks with full names
taxa_sep.columns = taxonomy_levels.keys()

# drop missing species; copy so the frame is independent of the unfiltered one
# (avoids assigning into a slice)
taxa_sep = taxa_sep[taxa_sep.species.notnull()].copy()

# remove blank spaces from taxonomic ranks
taxa_sep = taxa_sep.apply(lambda x: x.str.strip())


# counts aggregated (summed) per taxon, one table per taxonomic rank
taxa_dict = dict()

for level in taxa_sep.columns:
    df_level = raw.join(taxa_sep[level])
    df_level = df_level.groupby(level).sum()

    taxa_dict[level] = df_level

phylum = taxa_dict['phylum']
print("Some columns contain only zeros:", (phylum == 0).all().any())

zero_cols = phylum.columns[(phylum == 0).all()]
print("{0} samples are dropped since there is no variance". format(zero_cols))
# select columns containing only zeros
zero_df = phylum[zero_cols]

phylum = phylum.drop(zero_cols, axis=1)

print("Some columns contain only zeros:", (phylum == 0).all().any())

In [None]:
# Transform the filtered count table with the "mclr" option of transform_features.
# The commented lines are alternative inputs (unfiltered counts, phylum-level counts).
# clr = transform_features(raw, transformation="mclr")
# clr = transform_features(phylum, transformation="mclr")
clr = transform_features(raw_filt, transformation="mclr")

In [None]:
meta = pd.read_csv('data/acm_meta.tsv', sep='\t', index_col = 0)

# select only numeric features: the first row holds the column type,
# columns marked 'categorical' are removed
meta = meta.loc[:, meta.iloc[0, :] != 'categorical']
# non-numeric entries (including the type row) become NaN
meta = meta.apply(pd.to_numeric, errors='coerce')
# drop QIIME2 header (the type row)
meta = meta.iloc[1:]
# fill missing values with zeros
meta = meta.fillna(0)

# meta_agg = meta.loc[columns]

# # Group the rows by the first part of the index
# grouped = meta_agg.groupby(meta_agg.index.str.split('.').str[0])

# # Calculate the mean of each group
# means = grouped.mean()

# # # Create a new DataFrame with the means and the old indices
# meta_agg = pd.DataFrame(means.values, index=means.index, columns=means.columns)
# meta_agg.index = meta_agg.index + '.agg'

# meta  = pd.concat([meta, meta_agg])

# standardise every covariate with StandardScaler
scaler = preprocessing.StandardScaler().fit(meta)
meta_scaled = scaler.transform(meta)
meta_scaled = pd.DataFrame(meta_scaled, index=meta.index, columns=meta.columns)

# transpose count data
clr_T = clr.T
# join by sample id (left join on the index of the transposed counts)
df = clr_T.join(meta_scaled)

# Rename long feature IDs with concise names.
# add_labels fills the module-level id_dict, so it has to exist before the call.
vis_df = df.copy()
id_dict = dict()
vis_df = add_labels(vis_df)

# covariance of the count columns only: the covariates are the last n_cov columns of df
n_cov = meta_scaled.shape[1]
asv = df.iloc[:, :-n_cov]
S = np.cov(asv.T.values, bias=True)

# correlation between ASVs ONLY
corr = scale_array_by_diagonal(S)

# same matrix, labelled with the short names for plotting
asv_names = vis_df.iloc[:, :-n_cov].columns
vis_S = pd.DataFrame(corr, columns=asv_names, index=asv_names)

# correlation between ASVs and covariates (all columns of df)
S_meta = np.cov(df.T.values, bias=True)
corr_meta = scale_array_by_diagonal(S_meta)
vis_S_meta = pd.DataFrame(corr_meta, columns=vis_df.columns, index=vis_df.columns)

In [None]:
# NOTE(review): this cell mutates vis_S_meta, df and vis_df in place and is not
# re-runnable: a second run fails because the dropped columns no longer exist.
width = 1500
height = 1500
label_size = "8pt"
lables_0, re_labels_0 = create_label_dict(vis_S)

# ASV-ASV correlations
p0 = _make_heatmap(data=vis_S, labels_dict=lables_0, labels_dict_reversed=re_labels_0,
                       title="Correlation: ASVs", width=width, height=height,
                       label_size=label_size)

# covariate-covariate block: the last n_cov rows and columns
meta_corr = vis_S_meta.iloc[-n_cov:, -n_cov:]
lables_1, re_labels_1 = create_label_dict(meta_corr)

p1 = _make_heatmap(data=meta_corr, labels_dict=lables_1, labels_dict_reversed=re_labels_1,
                       title="Correlation: covariates", width=width, height=height,
                       label_size=label_size)

# drop highly correlated covariates
hcorr_cov = ['relative-humidity-soil-high', 'relative-humidity-soil-low', 'percent-relative-humidity-soil-100', 'temperature-soil-high', 'temperature-soil-low']

# columns are dropped/renamed in place on all three frames
for frame in [vis_S_meta, df, vis_df]:
    frame.drop(hcorr_cov, axis=1, inplace=True)
    frame.rename(columns={'average-soil-relative-humidity':'average humidity','average-soil-temperature': 'average temperature',}, inplace=True)

# repeat on the transposed matrix so the same labels are also removed/renamed
# along the other axis
vis_S_meta = vis_S_meta.T
vis_S_meta.drop(hcorr_cov, axis=1, inplace=True)
vis_S_meta.rename(columns={'average-soil-relative-humidity':'average humidity','average-soil-temperature': 'average temperature',}, inplace=True)

# number of covariates that are left after the drop
n_cov = df.shape[1] - asv.shape[1]
lables_2, re_labels_2 = create_label_dict(vis_S_meta)

p2 = _make_heatmap(data=vis_S_meta, labels_dict=lables_2, labels_dict_reversed=re_labels_2,
                       title="Correlation: ASVs + covariates", width=width, height=height,
                       label_size=label_size)

show(p0)
show(p1)
show(p2)

# Models

In [None]:
# number of rows / columns of the ASV-only data
N = asv.shape[0]
p = asv.shape[1]
print("Shape of data without covariates: {0}, {1}".format(N, p))

# same for the data including the remaining covariates
N_meta = df.shape[0]
p_meta = df.shape[1]
print("Shape of data with covariates: {0}, {1}".format(N_meta, p_meta))

# hyperparameter grids (commented lines are earlier alternatives)
# lambda1_range = np.logspace(0, -4, 15)
# mu1_range = np.logspace(0.9, 0.4, 10)
# lambda1_range = np.logspace(-2, -4, 20)
# mu1_range = np.logspace(-2, -4, 20)
lambda1_range = np.logspace(0, -1, 15)
mu1_range = np.logspace(-0.5, -4, 10)
modelselect_params = {'lambda1_range': lambda1_range, 'mu1_range': mu1_range}

# model without latent variables on the ASV correlation matrix
P_SGL = glasso_problem(corr, N, latent=False, do_scaling=False)
P_SGL.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.1)

# model with latent variables (latent=True) on the same matrix
P_SGL_low = glasso_problem(corr, N, latent=True, do_scaling=False)
P_SGL_low.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.1)

# penalty mask: start from a matrix of zeros
shape_meta = (p_meta, p_meta)
mask = np.zeros(shape_meta)
# add small constant, so ADMM could converge
mask = mask + 0.01
# heavy penalize species: the ASV-ASV block ends up at 1, everything else stays at 0.01
n_bugs = len(asv.columns)
bugs_block = np.ones((n_bugs, n_bugs))
mask[0:n_bugs, 0:n_bugs] += bugs_block - 0.01
lambda1_mask_exp = mask
df_mask_exp = pd.DataFrame(lambda1_mask_exp, columns=vis_df.columns, index=vis_df.columns)

# The mask is added to the shared parameter dict only now, i.e. after the two
# model selections above have run; it is passed to the adaptive model only.
modelselect_params["lambda1_mask"] = lambda1_mask_exp
P_SGL_adapt = glasso_problem(vis_S_meta.values, N_meta, latent=False, do_scaling=False)
P_SGL_adapt.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.1)

In [None]:
# Selected regularisation parameters of the three models.
# P_SGL_adapt is fitted with latent=False, so it is an adaptive SGL model
# without a low-rank component; only P_SGL_low uses latent=True.
print("SGL solution with lambda={lambda1} and mu={mu1}".format(**P_SGL.reg_params))
print("Adaptive SGL solution with lambda={lambda1} and mu={mu1}".format(**P_SGL_adapt.reg_params))
print("SGL+low-rank solution with lambda={lambda1} and mu={mu1}".format(**P_SGL_low.reg_params))

In [None]:
# display the model-selection statistics stored on the latent-variable model
P_SGL_low.modelselect_stats

# Analysis

In [None]:
width = 1500
height = 1500
label_size = "8pt"

# for visualization reasons we transform inverse covariance to negative inverse covariance, i.e., multiply by -1
sgl = -1 * pd.DataFrame(P_SGL.solution.precision_, columns=asv_names, index=asv_names)
adapt = -1 * pd.DataFrame(P_SGL_adapt.solution.precision_, columns=vis_df.columns, index=vis_df.columns)
low = -1 * pd.DataFrame(P_SGL_low.solution.precision_, columns=asv_names, index=asv_names)


# tick labels for each of the three matrices
lables_sgl, re_labels_sgl = create_label_dict(sgl)
lables_adapt, re_labels_adapt = create_label_dict(adapt)
lables_low, re_labels_low = create_label_dict(low)

p_sgl = _make_heatmap(data=sgl, labels_dict=lables_sgl, labels_dict_reversed=re_labels_sgl,
                       title="SGL estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)

p_adapt = _make_heatmap(data=adapt, labels_dict=lables_adapt, labels_dict_reversed=re_labels_adapt,
                       title="Adaptive estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)

p_low = _make_heatmap(data=low, labels_dict=lables_low, labels_dict_reversed=re_labels_low,
                       title="SGL+low-rank estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)
show(p_sgl)
show(p_adapt)
show(p_low)

In [None]:
# Earlier variant restricted to a hand-picked set of nodes (kept for reference):
# meta_cols = list(adapt.iloc[:, -n_cov:].columns)
# asv18_edges = ["ASV_18", "ASV_51", "ASV_46", "ASV_13", "ASV_7", "ASV_5"]
# asv18_51 =["ASV_18", "ASV_51"]
# asv18_edges_adapt = meta_cols + asv18_51

# sgl_edges = sgl[sgl.columns.intersection(asv18_edges)].loc[asv18_edges]
# adapt_edges = adapt[adapt.columns.intersection(asv18_edges_adapt)].loc[asv18_51]
# low_edges = low[low.columns.intersection(asv18_edges)].loc[asv18_edges]


# G_SGL = create_graph(sgl_edges, threshold=0.1)
# G_adapt = create_graph(adapt_edges, threshold=0.1)
# G_low = create_graph(low_edges, threshold=0.1)


# graphs over all nodes; entries with absolute value below 0.0001 are not drawn as edges
G_SGL = create_graph(sgl, threshold=0.0001)
G_adapt = create_graph(adapt, threshold=0.0001)
G_low = create_graph(low, threshold=0.0001)


width, height= 1000, 1000

network_sgl = plot_network(G_SGL, title="SGL", height=height, width=width)
network_adapt = plot_network(G_adapt, title="Adaptive",  height=height, width=width)
network_low = plot_network(G_low, title="Low-rank",  height=height, width=width)

show(network_sgl)
show(network_adapt)
show(network_low)

In [None]:
# Scatter of two transformed ASV columns.
# NOTE(review): the variable is called p_18_51 but the plot shows ASV_1 vs ASV_10.
p_18_51 = scater_plot(vis_df["ASV_1"], vis_df["ASV_10"])
# p_18_temp = scater_plot(vis_df["p__Gemmatimonadota"], vis_df["ph"])
# p_51_temp = scater_plot(vis_df["p__Actinobacteriota"], vis_df["ph"])

show(p_18_51)
# show(p_18_temp)
# show(p_51_temp)

In [None]:
# ASV x covariate block of the adaptive estimate (rows: ASVs, columns: last n_cov columns)
inv_cov = adapt.iloc[:-n_cov, -n_cov:]

# ASV x ASV matrix built from that block
L_adapt = inv_cov @ inv_cov.T
L_adapt.shape

# L_1: low-rank component of the latent-variable model; L_2: the matrix built above
L_1 = pd.DataFrame(P_SGL_low.solution.lowrank_, columns=asv_names, index=asv_names)
L_2 = pd.DataFrame(L_adapt, columns=asv_names, index=asv_names)

r1 = np.linalg.matrix_rank(L_1)
r2 = np.linalg.matrix_rank(L_2)

print("L1-rank: {0}".format(r1))
print("L2-rank: {0}".format(r2))

# projection of the ASV data onto the eigenvectors of L_1
proj_1, loadings_1, eigv_1 = PCA(asv, L_1, inverse=True)

# share of each eigenvalue in the eigenvalue sum
eigv_sum_1 = np.sum(eigv_1)
var_exp_1 = [(value / eigv_sum_1) for value in sorted(eigv_1, reverse=True)]

# same for L_2
proj_2, loadings_2, eigv_2 = PCA(asv, L_2, inverse=True)

eigv_sum_2 = np.sum(eigv_2)
var_exp_2 = [(value / eigv_sum_2) for value in sorted(eigv_2, reverse=True)]

# first component of L_1 against the (scaled) average soil temperature;
# meta_scaled still carries the original column names
pca_plot = project_covariates(asv, metadata=meta_scaled, L=L_1, y='average-soil-temperature')
show(pca_plot)

In [None]:
width = 1500
height = 1500
label_size = "8pt"

adapt_theta = adapt.copy()

# ASV x covariate block of the adaptive estimate
asv_cov = adapt_theta.iloc[:-n_cov, -n_cov:]

# l1-norm (sum of absolute values) of every ASV row over the covariates.
# ord=1 is required: without it np.linalg.norm returns the Euclidean norm,
# which contradicts the variable name and the plot title.
l1_norm = np.linalg.norm(asv_cov.values, ord=1, axis=1)

# append the norm as an extra column (covariate rows get 0) ...
adapt_theta['l1'] = np.append(l1_norm, np.zeros(n_cov))

adapt_theta = adapt_theta.T

# ... and, after transposing, as an extra column again; the frame has one more
# row now (the 'l1' row), hence n_cov + 1 zeros
adapt_theta['l1'] = np.append(l1_norm, np.zeros(n_cov+1))
# sort both axes by the norm, largest first
adapt_theta = adapt_theta.sort_values(by=['l1'], ascending=False)
adapt_theta = adapt_theta.T
adapt_theta = adapt_theta.sort_values(by=['l1'], ascending=False)

lables_l1, re_labels_l1 = create_label_dict(adapt_theta)

p_l1 = _make_heatmap(data=adapt_theta, labels_dict=lables_l1, labels_dict_reversed=re_labels_l1,
                       title="Esatimated inverse covariance sorted by l1-norm of the covariates", width=width, height=height,
                       label_size=label_size)
show(p_l1)

In [None]:
pc_components = pd.DataFrame(loadings_1, index=low.index)
pc_components = pc_components.iloc[::-1]
# one name per principal component; derived from the rank r1 of the low-rank
# solution instead of a fixed list of six names
pc_components.columns = ["PC{0}".format(i + 1) for i in range(r1)]

# r1 x r1 identity block for the PC-PC part
identity = pd.DataFrame(np.eye(r1, r1), index=pc_components.columns, columns = pc_components.columns)
# PCs are linearly independent by the definition
pc_columns = pd.concat([pc_components, identity], axis=0)

# inverse cov matrix extended by PCs
asv_pc = pd.concat([low, pc_components], axis=1)
asv_pc = pd.concat([asv_pc.T, pc_columns], axis=1)

# ASV x PC block
asv_low = asv_pc.iloc[:-r1, -r1:]
# l1-norm of partial correlation between ASVs and PCs
# (ord=1 is required; the default would be the Euclidean norm)
l1_norm_pc = np.linalg.norm(asv_low.values, ord=1, axis=1)

# append the norm as an extra column, transpose, and append it again; the
# transposed frame has one more row (the 'l1' row), hence r1 + 1 zeros
asv_pc['l1'] = np.append(l1_norm_pc, np.zeros(r1))
asv_pc = asv_pc.T
asv_pc['l1'] = np.append(l1_norm_pc, np.zeros(r1 + 1))

#sorting by the order of adaptive l1-norm sorted solution
n_asvs = len(vis_S)
sorted_asv = asv_pc.iloc[:n_asvs, :].reindex(index=adapt_theta.iloc[:n_asvs, :].index)
# the trailing r1 + 1 columns are the PCs plus the 'l1' column
sorted_asv_pc = sorted_asv.T.join(asv_pc.iloc[:, -(r1 + 1):])
sorted_asv = sorted_asv_pc.iloc[:n_asvs, :].reindex(index=adapt_theta.iloc[:n_asvs, :].index)
sorted_l1_low = pd.concat([sorted_asv, sorted_asv_pc.iloc[n_asvs:, :]], axis=0)

In [None]:
# heatmap of the PC-extended matrix, ordered like the adaptive solution
lables_l1_low, re_labels_l1_low = create_label_dict(sorted_l1_low)

p_l1_low = _make_heatmap(data=sorted_l1_low, labels_dict=lables_l1_low, labels_dict_reversed=re_labels_l1_low,
                       title="Esatimated inverse covariance (sparse + low-rank) sorted by l1-norm of the PCs", width=width, height=height,
                       label_size=label_size)
show(p_l1_low)

### Analysis 2

In [None]:
# NOTE(review): this cell recomputes sgl / adapt / low and redraws the same three
# heatmaps as the corresponding cell in the "Analysis" section.
width = 1500
height = 1500
label_size = "8pt"

# for visualization reasons we transform inverse covariance to negative inverse covariance, i.e., multiply by -1
sgl = -1 * pd.DataFrame(P_SGL.solution.precision_, columns=asv_names, index=asv_names)
adapt = -1 * pd.DataFrame(P_SGL_adapt.solution.precision_, columns=vis_df.columns, index=vis_df.columns)
low = -1 * pd.DataFrame(P_SGL_low.solution.precision_, columns=asv_names, index=asv_names)


# tick labels for each of the three matrices
lables_sgl, re_labels_sgl = create_label_dict(sgl)
lables_adapt, re_labels_adapt = create_label_dict(adapt)
lables_low, re_labels_low = create_label_dict(low)

p_sgl = _make_heatmap(data=sgl, labels_dict=lables_sgl, labels_dict_reversed=re_labels_sgl,
                       title="SGL estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)

p_adapt = _make_heatmap(data=adapt, labels_dict=lables_adapt, labels_dict_reversed=re_labels_adapt,
                       title="Adaptive estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)

p_low = _make_heatmap(data=low, labels_dict=lables_low, labels_dict_reversed=re_labels_low,
                       title="SGL+low-rank estimated (negative) inverse covariance", width=width, height=height,
                       label_size=label_size)
show(p_sgl)
show(p_adapt)
show(p_low)

In [None]:
# Covariate labels: the last n_cov columns of the adaptive estimate.
# This assignment has to be live: meta_cols is used below and is not defined
# anywhere else, so the cell fails with a NameError on a fresh kernel otherwise.
meta_cols = list(adapt.iloc[:, -n_cov:].columns)

# hand-picked nodes for the sub-networks
asv69_edges = ["ASV_18", "ASV_51", "ASV_46", "ASV_13", "ASV_7", "ASV_5"]
asv69_edges_adapt = meta_cols + ["ASV_18", "ASV_51", "ASV_5", "ASV_46"]
asv69_edges_low = asv69_edges

# restrict every estimate to the selected nodes (columns first, then rows);
# .loc raises a KeyError if one of the labels is missing from the estimate
sgl_edges = sgl[sgl.columns.intersection(asv69_edges)].loc[asv69_edges]
adapt_edges = adapt[adapt.columns.intersection(asv69_edges_adapt)].loc[asv69_edges_adapt]
low_edges = low[low.columns.intersection(asv69_edges_low)].loc[asv69_edges_low]


G_SGL = create_graph(sgl_edges, threshold=0.01)
G_adapt = create_graph(adapt_edges, threshold=0.01)
G_low = create_graph(low_edges, threshold=0.01)


width, height= 1000, 1000

network_sgl = plot_network(G_SGL, title="SGL", height=height, width=width)
network_adapt = plot_network(G_adapt, title="Adaptive",  height=height, width=width)
network_low = plot_network(G_low, title="Low-rank",  height=height, width=width)

show(network_sgl)
show(network_adapt)
show(network_low)