In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import plotly.express as px

from gglasso.helper.utils import sparsity, zero_replacement, normalize, log_transform
from gglasso.problem import glasso_problem
from gglasso.helper.basic_linalg import scale_array_by_diagonal

from sklearn import preprocessing

In [None]:
def robust_PCA(X, L, inverse=True):
    """
    Project the data onto the principal axes of the low-rank matrix ``L``.

    Parameters
    ----------
    X : pandas.DataFrame
        Data whose ``.values`` array has shape (p, N); its rows must line up
        with the rows/columns of ``L``.
    L : array-like of shape (p, p)
        Symmetric matrix that is eigen-decomposed with ``np.linalg.eigh``.
    inverse : bool, default True
        If True every eigenvector is scaled by ``1/sqrt(eigenvalue)``,
        otherwise by ``sqrt(eigenvalue)``.

    Returns
    -------
    zu : ndarray of shape (N, k)
        Projection ``X.values.T @ loadings``.
    loadings : ndarray of shape (p, k)
        Scaled eigenvectors belonging to the k eigenvalues larger than 1e-9.
    eigv : ndarray of shape (k,)
        The retained eigenvalues in descending order, rounded to 3 decimals.
    """
    sig, V = np.linalg.eigh(L)

    # eigh returns ascending eigenvalues -> flip to descending order
    sig = sig[::-1]
    V = V[:, ::-1]

    # 1-D positions of the eigenvalues that are numerically non-zero.
    # (np.argwhere would give a (k, 1) index; np.diag of the resulting (k, 1)
    # array only keeps its first entry, so all columns got the same scale.)
    ind = np.flatnonzero(sig > 1e-9)
    kept = sig[ind]

    # one scale factor per retained eigenvector, broadcast over the columns
    scale = np.sqrt(1 / kept) if inverse else np.sqrt(kept)
    loadings = V[:, ind] * scale

    # compute the projection
    zu = X.values.T @ loadings

    # always 1-D, so callers can index eigv[0] even when only one value is kept
    return zu, loadings, np.round(kept, 3)

In [None]:
# Read the processed table (first CSV column becomes the index) and transpose it.
soil = pd.read_csv(
    '~/GGLasso/data/soil/processed/soil_116.csv',
    index_col=0,   # sep is left at its default ','
).T
print(soil.head())

# gglasso helpers applied back to back: normalize first, then log-transform
X = log_transform(normalize(soil))
p, N = X.shape

print("Shape of the transformed data: (p,N)={0},{1}".format(p,N))

In [None]:
# pH table, re-ordered so that its rows follow the columns of `soil`
ph = (
    pd.read_csv('~/GGLasso/data/soil/processed/ph.csv', index_col=0)
    .reindex(soil.columns)
)
print(ph.head())

# column totals of the untransformed table
depth = soil.sum(axis=0)

# full metadata sheet; only the annual temperature column is kept, aligned with `ph`
metadata = pd.read_table(
    '~/GGLasso/data/soil/original/88soils_modified_metadata.txt',
    index_col=0,
)
temperature = metadata["annual_season_temp"].reindex(ph.index)

In [None]:
# Empirical covariance of the transformed data (bias=True -> divide by N),
# then passed through gglasso's scale_array_by_diagonal.
S0 = np.cov(X.values, bias=True)
S = scale_array_by_diagonal(S0)

# Regularization grids handed to the model selection below.
lambda1_range = np.logspace(0.5, -1.5, num=8)
mu1_range = np.logspace(1.5, -0.2, num=6)
modelselect_params = dict(lambda1_range=lambda1_range, mu1_range=mu1_range)

# Problem with a latent (low-rank) component; do_scaling=False since S is passed already scaled.
P = glasso_problem(S, N, latent=True, do_scaling=False)
print(P)

P.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.25)

print(P.reg_params)

In [None]:
# Low-rank part of the selected solution and its numerical rank.
L = P.solution.lowrank_
r = np.linalg.matrix_rank(L)

# Projection of the data, scaled eigenvectors and retained eigenvalues of L.
proj, loadings, eigv = robust_PCA(X, L, inverse=True)

print("RANK:{0}".format(r))

In [None]:
# First projected component against pH, coloured by the per-sample totals in `depth`.
fig, ax = plt.subplots(1,1)
im = ax.scatter(proj[:,0], ph, c = depth, cmap = plt.cm.Blues, vmin = 0)
cbar = fig.colorbar(im)
cbar.set_label("Sampling depth")
ax.set_xlabel(f"PCA component 1 with eigenvalue {eigv[0]}")
ax.set_ylabel("pH")

# Run the rank correlation once and reuse it: [0] is the correlation, [1] the p-value.
spearman_ph = stats.spearmanr(ph, proj[:,0])
print("Spearman correlation between pH and 1st component: {0}, p-value: {1}".format(spearman_ph[0],
                                                                              spearman_ph[1]))

In [None]:
# Second projected component against temperature, coloured by the per-sample totals in `depth`.
fig, ax = plt.subplots(1,1)
im = ax.scatter(proj[:,1], temperature, c = depth, cmap = plt.cm.Blues, vmin = 0)
cbar = fig.colorbar(im)
cbar.set_label("Sampling depth")
ax.set_xlabel(f"PCA component 2 with eigenvalue {eigv[1]}")
ax.set_ylabel("Temperature")


# Run the rank correlation once and reuse it: [0] is the correlation, [1] the p-value.
spearman_temperature = stats.spearmanr(temperature, proj[:,1])
print("Spearman correlation between temperature and 2nd component: {0}, p-value: {1}".format(spearman_temperature[0],
                                                                              spearman_temperature[1]))


In [None]:
# Sanity check: show the shapes of both covariates (temperature was reindexed with ph.index).
temperature.shape, ph.shape

In [None]:
# Attach the temperature series to the pH table; the join is index-based and keeps
# every row of `ph` (how="left" is the pandas default, spelled out here).
meta = ph.join(temperature, how="left")
meta.head(5)

# Adaptive lasso

In [None]:
# Transpose X so the covariates in `meta` can be joined on the shared index ...
X0 = X.transpose().join(meta)
# ... then transpose back, so the covariates end up as additional rows below those of X.
X_meta = X0.transpose()
X_meta.shape

In [None]:
# Same two steps as for X alone: biased (divide-by-N) covariance, then gglasso's diagonal scaling.
# NOTE(review): a missing covariate value would propagate NaN into the covariance - confirm there are none.
S0_meta = np.cov(X_meta.to_numpy(), bias=True)
S_meta = scale_array_by_diagonal(S0_meta)

In [None]:
# Show the shape of S_meta, the output of scale_array_by_diagonal for the extended data X_meta.
S_meta.shape

In [None]:
# Heatmap of S_meta on a diverging colour scale clipped to [-1, 1].
fig = px.imshow(S_meta, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
fig.update_layout(
    title='Covariance: ASVs and covariates',
    title_x=0.5,
    width=800,
    height=800,
    margin={'t': 100, 'r': 100, 'b': 100, 'l': 100},
)

In [None]:
# Penalty mask with one entry per entry of S_meta.
p_meta = S_meta.shape[0]
shape_meta = (p_meta, p_meta)

# Small constant everywhere (instead of exact zeros), so ADMM could converge.
mask = np.full(shape_meta, 0.001)

# The leading block (as many rows/columns as X has rows, i.e. the species) gets weight 1.
n_bugs = X.shape[0]
bugs_block = np.ones((n_bugs, n_bugs))
mask[:n_bugs, :n_bugs] = bugs_block

# same array object under the name used further down
lambda1_mask_exp = mask

In [None]:
# Regularization grids for the run with the penalty mask. These rebind the names
# lambda1_range / mu1_range that were used for the first model selection; the values
# here reach further down (to 10**-2.5 and 10**-1.2 respectively).
# (A bare `lambda1_mask_exp` expression used to sit above these lines; a notebook cell
# only displays its last expression, so it had no effect and was removed.)
lambda1_range = np.logspace(0.5,-2.5,8)
mu1_range = np.logspace(1.5,-1.2,6)

In [None]:
# Update the existing parameter dict in place: add the penalty mask and
# swap in the current lambda1/mu1 grids.
modelselect_params.update(
    lambda1_mask=lambda1_mask_exp,
    lambda1_range=lambda1_range,
    mu1_range=mu1_range,
)

modelselect_params

In [None]:
# Second problem instance, built on S_meta (species plus covariates) and without a
# latent component (latent=False). N is the second entry of X.shape from the loading cell.
# do_scaling=False: presumably because S_meta already went through scale_array_by_diagonal.
P_SGL_adapt = glasso_problem(S_meta, N, latent=False, do_scaling=False)
print(P_SGL_adapt)

In [None]:
# eBIC-based model selection using modelselect_params (which at this point also carries
# the 'lambda1_mask' entry); gamma is 0.1 here, versus 0.25 in the first model selection.
P_SGL_adapt.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.1)
# print the regularization parameters stored on the problem after the selection
print(P_SGL_adapt.reg_params)

In [None]:
# Print a few entries of the model-selection statistics, each preceded by its key.
for stat in ('AIC', 'SP', 'LAMBDA', 'BEST'):
    print(stat)
    stat_values = P_SGL_adapt.modelselect_stats[stat]
    print(stat_values)

In [None]:
# Wrap the estimated precision matrix in a DataFrame whose rows and columns are
# both labelled with the row labels of X_meta.
precision_adapt = pd.DataFrame(
    P_SGL_adapt.solution.precision_,
    index=X_meta.index,
    columns=X_meta.index,
)
precision_adapt.head()

In [None]:
# Heatmap of the estimated precision matrix; note that the sign is flipped (-1 * ...)
# before plotting. Colour scale clipped to [-1, 1].
fig = px.imshow(-1*precision_adapt, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
# title typo fixed ("Esatimated" -> "Estimated")
fig.update_layout(margin = dict(t=100,r=100,b=100,l=100), width = 1000, height = 1000,
                 title='Estimated inverse covariance (adaptive)', title_x=0.5)

In [None]:
# Off-diagonal block of the precision matrix: all rows except the last two, against the
# last two columns (presumably the pH and temperature covariates joined from `meta` - confirm).
inv_cov = precision_adapt.iloc[:-2, -2:]

# The title announces the *negative* inverse covariance and the full-matrix heatmap
# plots -1 * precision, so the sign is flipped here as well. This is for display only:
# `inv_cov` itself keeps the original sign.
fig = px.imshow(-1*inv_cov, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
fig.update_layout(margin = dict(t=100,r=100,b=100,l=100), width = 600, height = 3000,
                 title='Negative inverse covariance between ASVs and covariates', title_x=0.5)

In [None]:
# plot PCs vs covariates
#
# For every covariate column of `inv_cov` and every retained component, draw two panels:
#   left  - loading vector vs. the covariate's column of the precision block (with a
#           least-squares line and the Spearman correlation in the title)
#   right - projected samples vs. the measured covariate, coloured by sequencing depth

# Share of the summed eigenvalues per component, largest first.
# np.atleast_1d keeps a single-eigenvalue result iterable.
eigv_sum = np.sum(eigv)
var_exp = [(value / eigv_sum) for value in sorted(np.atleast_1d(eigv), reverse=True)]

# Sequencing depth has to come from the raw table `soil`: X went through normalize()
# and log_transform(), so its column sums are no longer read totals.
seq_depth = soil.sum(axis=0).rename("sequencing depth").to_frame()

# samples as rows: transformed ASVs, covariates and the sequencing depth side by side
test_df = X_meta.T.join(seq_depth)

rank = r
# matrix_rank and the eigenvalue threshold in robust_PCA use different tolerances,
# so never index past the components that are actually available
n_components = min(rank, loadings.shape[1], len(var_exp))

for col in inv_cov.columns:

    for i in range(n_components):

        x = loadings[:, i]
        y = inv_cov[col]

        # rank correlation between loading vector and precision entries
        r_2 = stats.spearmanr(x, y)[0]
        title_1 = "Spearman correlation {0}".format(np.round(r_2, 3))

        # Find the slope and intercept of the best fit line
        slope, intercept = np.polyfit(x, y, 1)

        # Values of the best fit line at the observed x positions
        abline_values = slope * x + intercept

        fig, ax = plt.subplots(nrows = 1, ncols = 2, sharex=False, sharey = False, squeeze=False, figsize=(15, 7))
        ax_left, ax_right = ax[0]

        ax_left.scatter(x, y)
        ax_left.plot(x, abline_values, 'b')
        ax_left.set_xlabel("Principal axis (loading vector) {0}".format(i+1))
        ax_left.set_ylabel("estimated {0}".format(col))
        ax_left.title.set_text(title_1)

        # make both axes symmetric around zero
        yabs_max = abs(max(ax_left.get_ylim(), key=abs))
        xabs_max = abs(max(ax_left.get_xlim(), key=abs))
        ax_left.set_ylim(-yabs_max, yabs_max)
        ax_left.set_xlim(-xabs_max, xabs_max)

        # rank correlation between the measured covariate and the projected samples
        spearman_corr = stats.spearmanr(test_df[col], proj[:, i])[0]

        title_2 = "Spearman correlation {0}".format(np.round(spearman_corr, 3))
        im = ax_right.scatter(proj[:, i], test_df[col], c=test_df['sequencing depth'], cmap=plt.cm.Blues, vmin=0)
        cbar = fig.colorbar(im, ax=ax_right)

        cbar.set_label("Sampling depth")
        # fixed two-decimal formatting (string slicing produced labels such as "100.")
        ax_right.set_xlabel("PC{0} ({1}%)".format(i + 1, "{0:.2f}".format(100 * var_exp[i])))
        ax_right.set_ylabel("{0}".format(col))
        ax_right.title.set_text(title_2)

        # flush each figure as it is finished, so open figures do not pile up in the loop
        plt.show()