In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn import preprocessing
from latentcor import get_tps, latentcor
%matplotlib inline

In [None]:
# NOTE(review): this cell is disabled on purpose. It uninstalled `latentcor`, the
# package imported in the first cell and called by the latent-correlation cells
# further down, so the next "Restart Kernel -> Run All" would fail at the import.
# Without `-y`, pip also asks for an interactive confirmation.
# Manage the environment from a terminal instead of from the analysis notebook.
# !pip uninstall latentcor

In [None]:
def normalize(X):
    """
    Rescale every column of X so that it sums to one (projection onto the simplex).

    X is expected to be a pd.DataFrame of form (p, N); each entry is divided
    by the total of the column it belongs to.
    """
    column_totals = X.sum(axis=0)
    return X / column_totals

In [None]:
def geometric_mean(x, positive=False):
    """
    Return the geometric mean of the entries of x.

    With positive=True only the strictly positive entries of x are used.
    An x that consists of zeros only trips the assertion.
    """
    assert not np.all(x == 0)

    values = x[x > 0] if positive else x
    log_values = np.log(values)
    # exp of the average log equals the geometric mean
    return np.exp(log_values.sum() / len(log_values))

In [None]:
def log_transform(X, transformation: str = "clr", eps=0.1):
    """
    Log-transform compositional data, scaled with the geometric mean of each column.

    Parameters
    ----------
    X: pd.DataFrame
        Data of form (p, N). The geometric mean is taken per column.
    transformation: str, optional
        "clr":  log(X / g), where g is the geometric mean of each column.
                X must not contain zeros (add a pseudo count first).
        "mclr": log(X / g) on the strictly positive entries only, where g is the
                geometric mean of the positive entries of each column. The result
                is shifted by |min| + eps, so every transformed entry is positive,
                and all other entries are set to 0.
    eps: float, optional
        Offset added after the shift; only used for "mclr".

    Returns
    -------
    Z: pd.DataFrame
        Transformed data with the same shape and labels as X.

    Raises
    ------
    AssertionError
        If transformation is "clr" and X contains zeros.
    ValueError
        If transformation is neither "clr" nor "mclr".
    """
    if transformation == "clr":
        assert not np.any(X.values == 0), "Add pseudo count before using clr"
        g = X.apply(geometric_mean)
        Z = np.log(X / g)
    elif transformation == "mclr":
        g = X.apply(geometric_mean, positive=True)
        # entries that are not strictly positive become NaN and are skipped by the log
        X_pos = X[X > 0]
        Z = np.log(X_pos / g)
        # shift by the global minimum so that transformed entries are >= eps
        Z = Z + abs(np.nanmin(Z.values)) + eps
        # masked entries are mapped back to zero
        Z = Z.fillna(0)
    else:
        # previously an unknown name fell through and failed on the unbound `Z`
        raise ValueError(
            "Unknown transformation name, use 'clr' or 'mclr' and not {!r}".format(transformation)
        )
    return Z

In [None]:
def transform_features(X: pd.DataFrame, transformation: str = "clr", pseudo_count: int = 1) -> pd.DataFrame:
    """
    Project compositional data to Euclidean space.

    Parameters
    ----------
    X: pd.DataFrame
        A table with count microbiome data.
    transformation: str
        If 'clr' the data is transformed with center log-ratio method by Aitchison (1982).
        If 'mclr' the data is transformed with modified center log-ratio method by Yoon et al. (2019).
    pseudo_count: int, optional
        Add pseudo count, only necessary for transformation = "clr".

    Returns
    -------
    X: pd.DataFrame
        Count data projected to Euclidean space.

    Raises
    ------
    ValueError
        If transformation is neither 'clr' nor 'mclr'.

    """
    if transformation not in ("clr", "mclr"):
        raise ValueError(
            "Unknown transformation name, use 'clr' or 'mclr' and not %r" % (transformation,)
        )

    columns = X.columns

    if transformation == "clr":
        # clr cannot handle zeros, so they are replaced before normalising.
        # NOTE(review): `zero_imputation` is not defined in any cell visible here —
        # confirm it is available before calling this with transformation="clr".
        X = zero_imputation(X, pseudo_count=pseudo_count)

    X = normalize(X)
    X = log_transform(X, transformation=transformation)

    return pd.DataFrame(X, columns=columns)

In [None]:
def corr_heatmap(corr_df, title=None, show_plot=False, width=2400, height=2400):
    """
    Visualize a correlation matrix as a heatmap.

    Parameters
    ----------
    corr_df: pd.DataFrame
        Correlation matrix (symmetric). Its column labels are used for both axes.
    title: str, optional
        Title of the plot.
    show_plot: bool, optional
        Show the plot as an output.
    width: int, optional
        Figure width in pixels.
    height: int, optional
        Figure height in pixels.

    Returns
    -------
    fig: plotly.graph_objects.Figure
        Heatmap figure.

    """
    labels = corr_df.columns.values

    heat = go.Heatmap(
        z=corr_df,
        x=labels,
        y=labels,
        zmin=-1,  # Sets the lower bound of the color domain
        zmax=1,
        xgap=1,  # Sets the horizontal gap (in pixels) between bricks
        ygap=1,
        colorscale='RdBu_r'
    )

    layout = go.Layout(
        title_text=title,
        title_x=0.5,
        width=width,
        height=height,
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        # reversed y axis puts the first row at the top, like a printed matrix
        yaxis_autorange='reversed'
    )

    fig = go.Figure(data=[heat], layout=layout)
    if show_plot:
        fig.show()

    return fig

# Atacama soil microbiome

## Import count data

Original data has been published by [Christian L Lauber](https://pubmed.ncbi.nlm.nih.gov/19502440/).

We preprocessed the data with [Qiime2](https://github.com/Vlasovets/q2-gglasso/blob/master/example/atacama/atacama_example.ipynb), which resulted in 53 samples and 130 ASVs.

In [None]:
# Tab-separated count table; the first column of the file becomes the index.
acm_soil_counts = pd.read_csv(
    '~/q2-gglasso/data/atacama-table_org/composition_feature-table.tsv',
    sep='\t',
    index_col=0,
)
acm_soil_counts

Microbial count data is zero-inflated.

In [None]:
# Histogram of the first column of the raw count table, written to 'org_count.png'.
raw_hist_ax = acm_soil_counts.iloc[:, 0].plot.hist(bins=24, alpha=1)
raw_hist_ax.get_figure().savefig('org_count.png')

It is also compositional, so we apply the mclr transformation to remove the unit-sum constraint on the sample vectors.

In [None]:
# Run the raw counts through the "mclr" branch of transform_features.
acm = transform_features(acm_soil_counts, transformation="mclr")

In [None]:
# Histogram of the first column of the transformed table, written to 'mclr_count.png'.
mclr_hist_ax = acm.iloc[:, 0].plot.hist(bins=24, alpha=1)
mclr_hist_ax.get_figure().savefig('mclr_count.png')

## Covariates

There are 15 numeric covariates associated with microbial count data.

In [None]:
# Load the sample covariates; the first column of the file becomes the index.
meta = pd.read_csv('~/GGLasso/data/soil/processed/acm_meta.tsv', sep='\t', index_col = 0)

# Keep only the columns whose first row is not the string 'categorical'
# (presumably that row holds per-column type labels — confirm against the file).
meta = meta.loc[:, meta.iloc[0, :] != 'categorical']
# Coerce every column to a numeric dtype; entries that cannot be parsed become NaN.
meta = meta.apply(pd.to_numeric, errors='coerce')
# NOTE: dropna(how='all') uses the default axis=0, so this removes ROWS in which
# every value is NaN, not columns. Pass axis=1 if all-empty columns must go too.
meta = meta.dropna(how='all')

# NOTE(review): the explicit `meta = meta.iloc[1:]` slice was left disabled here;
# presumably the first row is already removed by the dropna above — confirm.
meta

In [None]:
# A 5 x 3 grid of axes handed to DataFrame.hist, one histogram per column.
fig, axis = plt.subplots(nrows=5, ncols=3, figsize=(15, 20))
meta.hist(ax=axis)
# fig.savefig('meta_hist.png')

The covariates have different scales, so we standardize them to zero mean and unit variance ($\mu=0$, $\sigma=1$).

In [None]:
# Fit the scaler on the covariates, then apply it to the same frame.
scaler = preprocessing.StandardScaler()
scaler.fit(meta)
scaled = scaler.transform(meta)

In [None]:
# Wrap the scaled array back into a frame carrying the original row/column labels.
meta_scaled = pd.DataFrame(data=scaled, index=meta.index, columns=meta.columns)

# Same 5 x 3 histogram grid as for the unscaled covariates.
fig, axis = plt.subplots(nrows=5, ncols=3, figsize=(15, 20))
meta_scaled.hist(ax=axis)

# fig.savefig('meta_hist_scaled.png')

## Merge counts and covariates

Now, we merge counts and covariates by sample ID.

In [None]:
# Join by sample id: transpose the transformed counts so both frames share an index.
acm_T = acm.transpose()

# Left join on the index: every row of acm_T is kept.
df = acm_T.join(meta_scaled, how="left")

df.isnull().sum().any()  # check missing values

There are missing values, which would prevent us from calculating the latent correlation, so we impute them with zeros.

In [None]:
# Replace every remaining NaN in the merged frame with 0.
df = df.fillna(0)

df.isnull().sum().any() # check missing values: evaluates to False once all NaNs are filled

We also need to drop any features with zero variance in order to be able to calculate the correlation.

In [None]:
# Drop covariates with zero variance.
# The columns to remove are collected first and dropped in one step, so the
# frame is never modified while its own columns are being iterated.
zero_variance_cols = [var for var in df.columns if df[var].var() == 0]
for var in zero_variance_cols:
    print("'{0}' covariate has been dropped".format(var))
df = df.drop(columns=zero_variance_cols)

In [None]:
# (N, p): number of rows and number of columns of the merged frame
df.shape

We rename the ASV feature identifiers with shorter names for visualization purposes.

In [None]:
# Separate copy used for visualisation, so that df itself keeps its original labels.
vis_df = df.copy()

In [None]:
# Rename long feature IDs with concise names.
# The mapping is built from df (which still carries the original IDs) and applied
# in a single rename, so re-running this cell gives the same vis_df and id_dict.
ASV_ID_LENGTH = 32  # length of ASVs identifier

asv_cols = [col for col in df.columns if len(col) == ASV_ID_LENGTH]
rename_map = {col: "ASV_{0}".format(i) for i, col in enumerate(asv_cols, start=1)}

# short name -> original identifier, to look the full IDs up again later
id_dict = {asv_name: col for col, asv_name in rename_map.items()}

vis_df = df.rename(columns=rename_map)

vis_df.head()

# Latent correlation with latentcor

We automatically extract the variable types from the data using the `get_tps()` function.

In [None]:
# N, p: number of rows and number of columns of df
N, p = df.shape

# Infer a type for each column of vis_df; the result is passed as `tps` to latentcor below.
clean_types = get_tps(vis_df)

The count table is supposed to be of truncated type, but since we have few samples and the data are strongly zero-inflated, some of the features are considered binary or ternary. As expected, the covariates are mostly of continuous type.

In [None]:
# Show the types returned by get_tps for the columns of vis_df.
print(clean_types)

## Original method

In [None]:
### N, p input
# Latent correlation computed with method='original' and use_nearPD=False.
org_lat_cor = latentcor(
    vis_df,
    tps=clean_types,
    method='original',
    use_nearPD=False,
)

In [None]:
# Keep the entry stored under 'R'; it is used below as the correlation matrix.
R_org = org_lat_cor['R']

## Approximate method

In [None]:
# Approximate method with use_nearPD=True.
approx_pdTRUE = latentcor(
    vis_df,
    tps=clean_types,
    method='approx',
    use_nearPD=True,
    nu=0.001,
    tol=1e-4,
)

In [None]:
# Entry stored under 'R' of the approximate result computed with use_nearPD=True.
R_pdTRUE = approx_pdTRUE['R']

In [None]:
# Approximate method with use_nearPD=False; all other arguments as in the PD run.
approx_pdFALSE = latentcor(
    vis_df,
    tps=clean_types,
    method='approx',
    use_nearPD=False,
    nu=0.001,
    tol=1e-4,
)

In [None]:
# Entry stored under "R" of the approximate result computed with use_nearPD=False.
R_pdFALSE = approx_pdFALSE["R"]

# Heatmaps

The latent correlation matrices differ markedly from the Kendall correlation matrix.

In [None]:
# Pairwise Kendall correlation between the columns of vis_df.
kendall = vis_df.corr(method='kendall')

In [None]:
# Heatmap of the Kendall correlation matrix, shown inline.
kendall_fig = corr_heatmap(
    corr_df=kendall,
    title="Kendall correlation",
    show_plot=True,
)
# kendall_fig.write_image("kendall.png")

In [None]:
# Heatmap of the latent correlation estimated with the original method.
# The title previously read "Orgiginal latent correlation correlation".
R_org_fig = corr_heatmap(R_org, title="Original latent correlation", show_plot=True)
# R_org_fig.write_image("original_latent_correlation.png")

In [None]:
# Heatmap of the approximate latent correlation computed with use_nearPD=True.
# The title previously repeated the word "correlation".
R_pdTRUE_fig = corr_heatmap(R_pdTRUE, title="Approximate latent correlation with PD constraint", show_plot=True)
# R_pdTRUE_fig.write_image("approx_latent_correlation_PD.png")

In [None]:
# Heatmap of the approximate latent correlation computed with use_nearPD=False.
# The title previously repeated the word "correlation".
R_pdFALSE_fig = corr_heatmap(R_pdFALSE, title="Approximate latent correlation WITHOUT PD constraint", show_plot=True)
# R_pdFALSE_fig.write_image("approx_latent_correlation_noPD.png")

# Difference between latent correlation and Kendall correlation

In [None]:
# Element-wise gap between the Kendall matrix and the original latent correlation.
diff_ken_org = kendall - R_org

diff_fig_org = corr_heatmap(
    corr_df=diff_ken_org,
    title="Difference between Kendall and original latent correlation",
    show_plot=True,
)
# diff_fig_org.write_image("diff_ken_org.png")

In [None]:
# Element-wise gap between the Kendall matrix and the approximate result (use_nearPD=True).
diff_ken_apx_PD = kendall - R_pdTRUE

diff_fig_apx_PD = corr_heatmap(
    corr_df=diff_ken_apx_PD,
    title="Difference between Kendall and approximate latent correlation with PD constraint",
    show_plot=True,
)
# diff_fig_apx_PD.write_image("diff_ken_apx_PD.png")

In [None]:
# Element-wise gap between the Kendall matrix and the approximate result (use_nearPD=False).
diff_ken_apx_noPD = kendall - R_pdFALSE

diff_fig_apx_noPD = corr_heatmap(
    corr_df=diff_ken_apx_noPD,
    title="Difference between Kendall and approximate latent correlation WITHOUT PD constraint",
    show_plot=True,
)
# diff_fig_apx_noPD.write_image("diff_ken_apx_noPD.png")

# Comparison of eigenvalue decomposition

In [None]:
# Eigenvalues of each correlation matrix (eigvalsh treats its input as symmetric).
eig_val_ken = np.linalg.eigvalsh(kendall)
eig_val_R_org = np.linalg.eigvalsh(R_org)
eig_val_R_pdTRUE = np.linalg.eigvalsh(R_pdTRUE)
eig_val_R_pdFALSE = np.linalg.eigvalsh(R_pdFALSE)


# Report the smallest and largest eigenvalue of every matrix.
for report_template, eig_vals in (
    ("Kendall eigenvalue range:[{0}; {1}]", eig_val_ken),
    ("Original eigenvalue range:[{0}; {1}]", eig_val_R_org),
    ("Approximate_PD eigenvalue range:[{0}; {1}]", eig_val_R_pdTRUE),
    ("Approximate_noPD eigenvalue range:[{0}; {1}]", eig_val_R_pdFALSE),
):
    print(report_template.format(eig_vals.min(), eig_vals.max()))

In [None]:
# Overlay the eigenvalue histograms of the four correlation matrices in one figure.
eigen_fig = go.Figure()
for eig_vals, trace_name in (
    (eig_val_ken, 'kendall'),
    (eig_val_R_org, 'original'),
    (eig_val_R_pdTRUE, 'approx_PD'),
    (eig_val_R_pdFALSE, 'approx_noPD'),
):
    eigen_fig.add_trace(go.Histogram(x=eig_vals, name=trace_name, nbinsx=50))

# Semi-transparent bars keep overlapping histograms visible.
eigen_fig.update_traces(opacity=0.55)

# barmode is set once here (it used to be set twice), and the legend title is
# no longer the "Legend Title" placeholder.
eigen_fig.update_layout(
    title="Comparison of eigenvalues produced by different correlation methods.",
    xaxis_title="Eigenvalues",
    yaxis_title="Count",
    legend_title="Correlation method",
    barmode='overlay',
    width=800,
    height=500
)

eigen_fig.show()
# eigen_fig.write_image("eigen_fig.png")