In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx

from sklearn import preprocessing
from matplotlib import pyplot as plt
from pyvis.network import Network
from matplotlib.colors import ListedColormap
from scipy.stats import pearsonr

from GGLasso.gglasso.problem import glasso_problem
from utils import transform_features, scale_array_by_diagonal, rename_index_with_sum
from utils import load_data, save_dataframe, process_taxonomy
from utils import filter_zero_features, filter_zero_samples, update_index
from utils import plotly_heatmap, plot_network, _make_heatmap, create_graph, plot_covariates
from utils import plot_ordered_heatmap, hierarchical_clustering, _get_order, create_label_dict
from utils import clean_meta_data, select_covariates, scale_meta_data, merge_data
from utils import calculate_covariance, PCA, create_lambda_mask, create_network_visualization
from utils import project_covariates, process_clust_order, concatenate_PC, scatter_plot

from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, HoverTool, LabelSet, PointDrawTool
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import RdBu, Blues8
from bokeh.models import HoverTool, Panel, Tabs, ColorBar, LinearColorMapper
from bokeh.layouts import row

# from latentcor import gen_data, get_tps, latentcor
# processed_taxa = (
#     load_data("data/taxonomy.tsv")
#     .pipe(process_taxonomy)
# )
# processed_taxa.head()

### Data preprocessing

In [None]:
# Preview the transposed filtered count table.
# NOTE(review): data/filtered_counts.csv is written by the preprocessing pipeline in the
# following cell, so on a fresh checkout this cell presumably has to run after it — confirm.
load_data("data/filtered_counts.csv").T

In [None]:
# Preprocess the raw count table step by step and persist each intermediate stage to disk.
filtered_counts=(
    load_data("data/atacama_counts.tsv")
    # thresholds are forwarded to the utils helpers; presumably a maximum zero fraction — TODO confirm
    .pipe(filter_zero_features, threshold=0.8)
    .pipe(filter_zero_samples, threshold=0.95)
    #.pipe(update_index, processed_taxa)
    .pipe(save_dataframe, filename="data/org_named_counts.csv")
    .pipe(rename_index_with_sum)
    .pipe(save_dataframe, filename="data/filtered_counts.csv")
    .pipe(transform_features, transformation="mclr")
    .pipe(save_dataframe, filename="data/mclr_counts.csv")
)
# NOTE(review): despite its name, `filtered_counts` is bound to the result of the whole chain,
# i.e. whatever the final save_dataframe call returns (presumably the mclr-transformed table).
print(filtered_counts.shape)
filtered_counts

In [None]:
# Heatmap of the filtered count table (re-read from disk), exported as pdf/png/svg.
filtered_counts = pd.read_csv("data/filtered_counts.csv", index_col=0)
fig_X = plotly_heatmap(
    z=filtered_counts,
    x=filtered_counts.columns,
    y=filtered_counts.index,
    x_label='Samples',
    y_label='Taxa',
    width=1400,
    height=500,
)
fig_X.show()
for raw_path in ("plots/acm_raw.pdf", "plots/acm_raw.png", "plots/acm_raw.svg"):
    fig_X.write_image(raw_path)

# Same heatmap for the mclr-transformed table, exported in the same three formats.
mclr = pd.read_csv("data/mclr_counts.csv", index_col=0)
fig_mclr = plotly_heatmap(
    z=mclr,
    x=mclr.columns,
    y=mclr.index,
    x_label='Samples',
    y_label='Taxa',
    width=1400,
    height=500,
)
fig_mclr.show()
for mclr_path in ("plots/acm_mclr.pdf", "plots/acm_mclr.png", "plots/acm_mclr.svg"):
    fig_mclr.write_image(mclr_path)

In [None]:
# Export the mclr table as a tab-separated file (index included).
mclr.to_csv('data/mclr_count_table.tsv', sep='\t', index=True)

In [None]:
# Clean the sample metadata, keep four covariates, scale them, and save every stage to disk.
# NOTE: the variable name and file names spell "covaraites"; later cells load the files under
# exactly these names, so the spelling must stay consistent.
scaled_covaraites=(
    load_data("data/acm_meta.tsv")
    .pipe(clean_meta_data)
    .pipe(save_dataframe, filename="data/clean_metadata.csv")
    .pipe(select_covariates, ['ph', 'average-soil-relative-humidity', 'elevation', 'average-soil-temperature'])
    .pipe(save_dataframe, filename="data/unscaled_selected_covaraites.csv")
    .pipe(scale_meta_data)
    .pipe(save_dataframe, filename="data/scaled_selected_covaraites.csv")
)
print(scaled_covaraites.shape)
scaled_covaraites

In [None]:
# Reload the selected covariates from disk (before and after scaling) and plot both.
unscaled_covariates = load_data("data/unscaled_selected_covaraites.csv")
scaled_covariates = load_data("data/scaled_selected_covaraites.csv")

plot_covariates(unscaled_covariates, scaled_covariates)

In [None]:
# Number of covariate columns in the scaled metadata table.
n_cov = scaled_covariates.shape[1]

# Merge the mclr table with the scaled covariates, save the merged table, then compute the
# matrix passed to the ASV-only models.
# NOTE(review): passing n_cov presumably excludes the covariate columns — confirm in utils.
vis_S = (
    merge_data(mclr, scaled_covariates)
    .pipe(save_dataframe, filename="data/merged_data.csv")
    .pipe(calculate_covariance, n_cov, method="corr")
    .pipe(save_dataframe, filename="data/asv_covariance.csv")
    )

# Same computation on the merged table with n_cov=None (used for the ASV + covariate model).
vis_S_meta = (
    load_data("data/merged_data.csv")
    .pipe(calculate_covariance, n_cov=None, method="corr")
    .pipe(save_dataframe, filename="data/asv_meta_covariance.csv")
    )

vis_S.shape, vis_S_meta.shape

In [None]:
# Ordering derived from vis_S; it is reused for every ordered heatmap below.
clust_order = _get_order(vis_S)

p_vis_S_clust = plot_ordered_heatmap(vis_S, order=clust_order)
p_clust_meta = plot_ordered_heatmap(vis_S_meta, order=clust_order, n_covariates=n_cov)

# Only the heatmap that includes the covariates is displayed.
# show(p_vis_S_clust)
show(p_clust_meta)

In [None]:
# Fit three graphical-lasso problems with eBIC model selection:
# plain SGL, SGL with a latent (low-rank) component, and SGL on the ASV + covariate matrix
# with a lambda mask.
counts = load_data("data/merged_data.csv")

# Round to 10 decimals (presumably to remove floating-point noise — TODO confirm the intent).
vis_S = vis_S.round(10)

N = counts.shape[0]              # number of rows of the merged table
p = counts.shape[1] - n_cov      # number of columns without the covariates
p_meta = counts.shape[1]         # number of columns including the covariates
print("Shape of data without covariates: {0}, {1}".format(N, p))
print("Shape of data with covariates: {0}, {1}".format(N, p_meta))

# 50 log-spaced candidate values each: lambda1 from 10^0 down to 10^-3, mu1 from 10^-0.1 down to 10^-0.3.
lambda1_range = np.logspace(0, -3, 50)
# mu1_range = np.logspace(-0.2, -0.5, 10)
mu1_range = np.logspace(-0.1, -0.3, 50)
gamma = 0.01  # passed as `gamma` to the eBIC model selection

modelselect_params = {'lambda1_range': lambda1_range, 'mu1_range': mu1_range}

P_SGL = glasso_problem(vis_S.values, N, latent=False, do_scaling=False)
P_SGL.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=gamma)

P_SGL_low = glasso_problem(vis_S.values, N, latent=True, do_scaling=False)
P_SGL_low.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=gamma)

# The mask is added to the shared parameter dict only now, so it applies to the adaptive fit alone.
lambda_mask = create_lambda_mask(counts, p, p_meta)
modelselect_params["lambda1_mask"] = lambda_mask

P_SGL_adapt = glasso_problem(vis_S_meta.values, N, latent=False, do_scaling=False)
P_SGL_adapt.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=gamma)

In [None]:
# Report the regularisation parameters stored in each problem's reg_params after model selection.
print("SGL solution with lambda={lambda1} and mu={mu1}".format(**P_SGL.reg_params))
print("Adaptive SGL+low-rank solution with lambda={lambda1} and mu={mu1}".format(**P_SGL_adapt.reg_params))
print("SGL+low-rank solution with lambda={lambda1} and mu={mu1}".format(**P_SGL_low.reg_params))

In [None]:
# Shared plot settings and the label sets of the two input matrices.
width = 1500
height = 1500
label_size = "16pt"
asv_names = vis_S.columns
meta_names = vis_S_meta.columns


# For visualization we plot the negative inverse covariance, i.e., the precision matrix multiplied by -1.
# NOTE: `sgl` is scaled by 3.5 instead of 1; the same scaled frame is reused later to build the SGL network edges.
sgl = -3.5 * pd.DataFrame(P_SGL.solution.precision_, columns=asv_names, index=asv_names) # we scale the color by 3.5 for the example
adapt = -1 * pd.DataFrame(P_SGL_adapt.solution.precision_, columns=meta_names, index=meta_names)
low = -1 * pd.DataFrame(P_SGL_low.solution.precision_, columns=asv_names, index=asv_names)
lrp = -1 * pd.DataFrame(P_SGL_low.solution.lowrank_, columns=asv_names, index=asv_names)


p_sgl = plot_ordered_heatmap(sgl, order=clust_order, 
                     title="SGL estimated (negative) inverse covariance")
p_adapt = plot_ordered_heatmap(adapt, order=clust_order, n_covariates=n_cov, 
                     title="Adaptive estimated (negative) inverse covariance")
p_low = plot_ordered_heatmap(low, order=clust_order, 
                     title="SGL+low-rank estimated (negative) inverse covariance")

# The low-rank component is multiplied by 5 for display only; `lrp` itself is unchanged.
p_lrp = plot_ordered_heatmap(5*lrp, order=clust_order, title="Low-rank")

show(p_sgl)
show(p_adapt)
show(p_low)
show(p_lrp)

# a= lrp.iloc[:, 4].values

# # matrix = np.outer(a, -1*a)
# print(matrix)

In [None]:
# # Define the labels dictionary
# toy_labels = {'ASV2': 'A', 'ASV7': 'B', 'ASV8': 'C', 'ASV11': 'D'}

# # Perform renaming for each DataFrame
# dfs = [sgl, adapt, low, lrp, vis_S]

# for df in dfs:
#     # Rename the index labels
#     df.rename(index=toy_labels, inplace=True)
    
#     # Rename the column labels
#     df.rename(columns=toy_labels, inplace=True)

In [None]:
# Apply the shared ordering to vis_S; the result feeds the "original" network below.
vis_S_clust = hierarchical_clustering(vis_S, clust_order)

In [None]:
# Select a handful of ASVs (plus the four covariates for the adaptive model), slice the
# corresponding sub-matrices and turn each into a graph.
### for 0.9
# test = ['g__Nitriliruptoraceae', "ASV7", 'g__wb1-P19']
#test = ["ASV8", "ASV16", 'g__Nitriliruptoraceae', "ASV20", "ASV7", "ASV12", 'g__wb1-P19', 'g__Nitrosococcus']
# test = ["ASV16", 'g__Nitriliruptoraceae', "ASV8", "ASV23", "ASV12", 'g__wb1-P19']
# test = [ "A", "B", "C", "D"]
test = [ "ASV-6", "ASV-11", "ASV-1", "ASV-5"]
#test = ['g__Nitriliruptoraceae', 'g__Gaiella', 'g__Rokubacteriales', 'g__Candidatus_Nitrososphaera', 'g__wb1-P19', 'g__Nitrosococcus']
# asv_meta = list(meta_names[-n_cov:]) + test
asv_meta = ['average-soil-temperature', 'average-soil-relative-humidity', 'elevation', 'ph'] + test

# Columns are restricted to the labels present in each frame; rows are the ASVs (`test`),
# except for the adaptive model, whose rows also include the covariates (`asv_meta`).
sgl_edges = sgl[sgl.columns.intersection(asv_meta)].loc[test]
adapt_edges = adapt[adapt.columns.intersection(asv_meta)].loc[asv_meta]
low_edges = low[low.columns.intersection(asv_meta)].loc[test]
org_edges = vis_S_clust[vis_S_clust.columns.intersection(asv_meta)].loc[test]

G_SGL = create_graph(sgl_edges, threshold=0.0)
G_adapt = create_graph(adapt_edges, threshold=0.0)
G_low = create_graph(low_edges, threshold=0.0)
G_org = create_graph(org_edges, threshold=0.0)

In [None]:
# Network built from the clustered input matrix (G_org).
# Bound to its own name instead of `test`, which holds the ASV selection list that the
# edge-extraction cell needs when it is re-run.
# Written to a dedicated HTML file so the output is not overwritten by the other network cells.
net_org = create_network_visualization(G_org, height=500, width=800, show_labels=False, size_degree=True, scale_edge=20, scale_node=17)
net_org.show('plots/network_org.html')

In [None]:
# Network built from the SGL solution (G_SGL).
# Bound to its own name instead of `test`, which holds the ASV selection list that the
# edge-extraction cell needs when it is re-run.
# Written to a dedicated HTML file so the output is not overwritten by the other network cells.
net_sgl = create_network_visualization(G_SGL, height=500, width=800, show_labels=False, size_degree=True, scale_edge=20, scale_node=17)
net_sgl.show('plots/network_sgl.html')

In [None]:
# G_low.add_node("A")
# G_low.add_node("B")

# Network built from the SGL+low-rank solution (G_low).
# Bound to its own name instead of `test`, which holds the ASV selection list that the
# edge-extraction cell needs when it is re-run.
# Written to a dedicated HTML file so the output is not overwritten by the other network cells.
net_low = create_network_visualization(G_low, height=500, width=800, show_labels=False, size_degree=True, scale_edge=50, scale_node=17)
net_low.show('plots/network_low.html')

In [None]:
# Network built from the adaptive solution (G_adapt), which also contains the covariate nodes.
# Bound to its own name instead of `test`, which holds the ASV selection list that the
# edge-extraction cell needs when it is re-run.
# Written to a dedicated HTML file so the output is not overwritten by the other network cells.
net_adapt = create_network_visualization(G_adapt, height=500, width=800, show_labels=False, size_degree=True, scale_edge=2, scale_node=7)
net_adapt.show('plots/network_adapt.html')

In [None]:
def project_covariates(transformed_counts=pd.DataFrame(), raw_counts = pd.DataFrame(), metadata=pd.DataFrame(), L=np.ndarray, y=str, PC=0):
    """
    Plot one principal component of the low-rank matrix against a metadata variable.

    Samples are projected with ``PCA(transformed_counts, L, inverse=True)``; each point is
    coloured by its sequencing depth (column sums of ``raw_counts``).

    Note: this notebook-level definition shadows the ``project_covariates`` imported from
    ``utils`` at the top of the notebook.

    Parameters:
        transformed_counts (pandas.DataFrame): Transformed count data; its index labels the
            rows of the projection.
        raw_counts (pandas.DataFrame): Raw count data; summed over axis 0 to obtain the
            "sequencing depth" column.
        metadata (pandas.DataFrame): Metadata joined to the sequencing depth and to the projection.
        L (numpy.ndarray): Low-rank matrix; its rank sets the number of principal-component
            columns. The declared default is only a placeholder, a real array must be passed.
        y (str): Name of the column to plot on the y-axis. The declared default is only a
            placeholder, a real column name must be passed.
        PC (int): Zero-based index of the principal component on the x-axis. Default is 0.

    Returns:
        bokeh.layouts.row: A row layout containing the scatter plot and the color bar figure.

    """
    r = np.linalg.matrix_rank(L)
    proj, loadings, eigv = PCA(transformed_counts, L, inverse=True)

    # Share of the total eigenvalue sum per component, largest first.
    eigv_sum = np.sum(eigv)
    var_exp = [(value / eigv_sum) for value in sorted(eigv, reverse=True)]

    counts_sum = raw_counts.sum(axis=0)
    depth = pd.DataFrame(data=counts_sum, columns=["sequencing depth"])
    metadata = depth.join(metadata)

    # Column labels carry the explained variance, truncated to four characters.
    pc_columns = list('PC{0} ({1}%)'.format(i+1, str(100 * var_exp[i])[:4]) for i in range(0, r))
    # Index by the function's own input rather than a notebook-level global, so the function
    # works for any `transformed_counts` and does not depend on cell execution order.
    df_proj = pd.DataFrame(proj, columns=pc_columns, index=transformed_counts.index)
    df = df_proj.join(metadata)

    varName1 = 'PC{0} ({1}%)'.format(PC+1, str(100 * var_exp[PC])[:4])
    varName2 = y
    # varName2 = 'PC{0} ({1}%)'.format(PC+2, str(100 * var_exp[1])[:4])
    # Plain 'x'/'y' columns give the glyph simple field names to reference.
    df['x'] = df[varName1]
    df['y'] = df[varName2]

    source = ColumnDataSource(df)

    p0 = figure(tools='save, zoom_in, zoom_out, wheel_zoom, box_zoom, reset', plot_width=800, plot_height=800,
                active_scroll="wheel_zoom",
                x_axis_label=varName1, y_axis_label=varName2,
                tooltips=[(varName1, "@" + varName1),
                          (varName2, "@" + varName2)
                          ],
                title=varName1 + " vs " + varName2)

    # Build a 256-step hex palette from matplotlib's 'RdPu_r' colormap.
    base_cmap = plt.get_cmap('RdPu_r')
    cmap = ListedColormap(base_cmap(np.arange(256)))
    colors = [cmap(i)[:3] for i in range(256)]
    colors = ['#' + ''.join([format(int(c * 255), '02x') for c in color]) for color in colors]
    colors = colors[::-1]  # reverse the palette order before mapping low -> high depth
    exp_cmap = LinearColorMapper(palette=colors, low=depth.values.min(), high=depth.values.max())

    #exp_cmap = LinearColorMapper(palette=Blues8[::-1], low=min(df['sequencing depth'].values), high=max(df['sequencing depth'].values))
    p0.circle('x', 'y', source=source, size=15, line_color=None, fill_color={"field": "sequencing depth", "transform": exp_cmap}, fill_alpha=0.3)

    # A separate, toolbar-less figure hosts the color bar next to the scatter plot.
    color_bar_plot = figure(title='sequencing depth', title_location="right",
                            height=500, width=150, toolbar_location=None, min_border=0,
                            outline_line_color=None)

    bar = ColorBar(color_mapper=exp_cmap, location=(1, 1))

    color_bar_plot.add_layout(bar, 'right')
    color_bar_plot.title.align = "center"
    color_bar_plot.title.text_font_size = '12pt'

    layout = row(p0, color_bar_plot)

    return layout

In [None]:
# PCA on the low-rank component of the SGL+low-rank solution.
L = P_SGL_low.solution.lowrank_
Z_mclr = counts.iloc[:, :-n_cov]  # merged table without its last n_cov (covariate) columns
r = np.linalg.matrix_rank(L)
print("Low rank:", r)
proj, loadings, eigv = PCA(Z_mclr, L, inverse=True)

# Share of the total eigenvalue sum per component, largest first.
eigv_sum = np.sum(eigv)
var_exp = [(value / eigv_sum) for value in sorted(eigv, reverse=True)]
print("Variance explained by PCs:", var_exp)

In [None]:
# First principal component against the unscaled soil temperature.
# NOTE(review): `filtered_counts` is expected to be the table re-read from
# data/filtered_counts.csv in the heatmap cell, not the pipeline result — confirm.
pca_plot = project_covariates(transformed_counts=Z_mclr, raw_counts=filtered_counts, metadata=unscaled_covariates, L=L, y='average-soil-temperature')
show(pca_plot)

In [None]:
# Attach the first two (sign-flipped) loading vectors to the adaptive solution and plot the result.
adapt_pc3 = concatenate_PC(solution=adapt, pc_components=-1 * loadings[:, :2], number_pc=2, index=asv_names, clust_order=clust_order)

sorted_lables_pc3, sorted_re_labels_pc3 = create_label_dict(adapt_pc3)


p_adapt_pc3 = _make_heatmap(data=adapt_pc3, labels_dict=sorted_lables_pc3, labels_dict_reversed=sorted_re_labels_pc3,
                       title="Clustered Adaptive solution with attached principal components", width=width, height=height,
                       label_size=label_size)

show(p_adapt_pc3)

In [None]:
# First two columns of the projection as Series indexed like Z_mclr.
pc_1 = pd.Series(proj[:, 0], index=Z_mclr.index, name='PC1')
pc_2 = pd.Series(proj[:, 1], index=Z_mclr.index, name='PC2')

# Unscaled covariate values, looked up by the row labels of the merged table.
org_temp = unscaled_covariates.loc[counts['average-soil-temperature'].index, 'average-soil-temperature']
org_elevation = unscaled_covariates.loc[counts['elevation'].index, 'elevation']

show(scatter_plot(pc_1, org_temp))
show(scatter_plot(pc_1, org_elevation, color='#FA4665'))

In [None]:
# PC1 against two ASV columns of the merged table, then the two ASVs against each other.
show(scatter_plot(pc_1, counts['ASV-8']))
show(scatter_plot(pc_1, counts['ASV-11']))
show(scatter_plot(counts['ASV-8'], counts['ASV-11']))

In [None]:
# ASV-8 against the temperature and elevation columns of the merged table.
show(scatter_plot(counts['ASV-8'], counts['average-soil-temperature']))
show(scatter_plot(counts['ASV-8'], counts['elevation']))

In [None]:
# ASV-11 against the temperature and elevation columns of the merged table.
show(scatter_plot(counts['ASV-11'], counts['average-soil-temperature']))
show(scatter_plot(counts['ASV-11'], counts['elevation']))

In [None]:
# 2x2 correlation matrix of PC1 and the unscaled soil temperature.
np.corrcoef(pc_1, org_temp)

In [None]:
# Pearson correlation (with p-value) between PC1 and the unscaled soil temperature.
pearsonr(pc_1, org_temp)

In [None]:
# Pearson correlation (with p-value) between PC1 and the unscaled elevation.
pearsonr(pc_1, org_elevation)

### Robust PCA

In [None]:
# plotly.express is imported here because no other cell of this notebook binds `px`;
# without it the scatter call below raises NameError.
import plotly.express as px

# Share of the total eigenvalue sum per component, largest first.
eigv_sum = np.sum(eigv)
var_exp = [(value / eigv_sum) for value in sorted(eigv, reverse=True)]

# Per-sample scores on the first two components plus the unscaled soil temperature.
robust_data = {
    'pc_1': pc_1.values,
    'pc_2': pc_2.values,
    'org_temp': org_temp.values
}

plot_df = pd.DataFrame(robust_data)

# Scatter plot of PC1 vs PC2, coloured by soil temperature
fig = px.scatter(plot_df, x='pc_1', y='pc_2', color='org_temp', labels={'org_temp': 'Average Soil Temperature'})

# Axis titles carry the explained variance of each component in percent
fig.update_layout(title='Scatter Plot of PC1 and PC2 with Color-coded Average Soil Temperature',
                  xaxis_title='PC1 ({0}%)'.format(np.round(var_exp[0] * 100, 2)),
                  yaxis_title='PC2 ({0}%)'.format(np.round(var_exp[1] * 100, 2)),
                  width=1000, height=800,    coloraxis_colorbar=dict(
        title='Average Soil Temperature',
        tickformat='0.2f',  # format for colorbar ticks
    ),
    coloraxis_colorbar_tickfont=dict(color='black'),  # color for colorbar tick labels
    coloraxis=dict(
        colorscale=[[0, '#abe4ff'], [1, '#f1ac8b']],  # two-stop scale from light blue to light red
    ))

fig.update_traces(
    marker=dict(size=17, line=dict(color='black', width=2))  # marker size and outline width
)

# Show the plot and export it
fig.show()
fig.write_image("plots/robust_pca.svg")

# q2-classo

In [None]:
# Build the taxonomy file for q2-classo: keep the taxonomy rows of the features present
# in the count table and replace the species level with the renamed feature label.
# NOTE: this rebinds `counts`, which earlier cells use for the merged data table.
counts = load_data("data/org_named_counts.csv")
counts['Feature ID'] = counts.index          # keep the original IDs before the index is renamed
counts = rename_index_with_sum(counts)
# One 'name' column ("s__" + new index label), indexed by the original Feature ID.
names = pd.DataFrame("s__" + counts['Feature ID'].index, columns=['name'], index=counts['Feature ID'])

taxonomy = pd.read_csv("data/taxonomy.tsv", sep= '\t')
selected_rows = taxonomy[taxonomy['Feature ID'].isin(counts['Feature ID'])]

# Split the ';'-separated Taxon string into one column per rank.
ranks = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
taxon = selected_rows['Taxon'].str.split(';', expand=True)
taxon.columns = ranks
taxon.index = selected_rows['Feature ID']
# Fill missing lower ranks with their empty prefixes. A single DataFrame-level fillna is
# used because `taxon[col].fillna(..., inplace=True)` acts on a temporary column object:
# it is deprecated and leaves `taxon` unchanged under pandas Copy-on-Write.
taxon = taxon.fillna({'order': 'o__', 'family': 'f__', 'genus': 'g__', 'species': 's__'})

# Add the 'name' column (aligned on Feature ID) and drop the original species level.
taxon = taxon.assign(**names)
taxon = taxon.drop(columns=['species'])

# Re-assemble the Taxon string from all remaining non-None values of each row.
taxon['Taxon'] = taxon.apply(lambda row: ';'.join(str(val) for val in row if val is not None), axis=1)
# taxon = taxon.drop(columns=[0,1,2,3,4,5, 'name'])
taxon['Confidence'] = selected_rows['Confidence'].values
taxon[['Taxon','Confidence']].to_csv("data/edited_taxonomy.tsv", sep='\t', index=True)

In [None]:
# !pip install biom-format

In [None]:
# !pip install --upgrade c-lasso
# !pip install c-lasso
# !pip install zarr
# !pip install plotly

!python /../../data/setup.py install
!pip install -e /../../data/.

In [None]:
!qiime dev refresh-cache

In [None]:
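# NOTE(review): the import cell that follows reads data/mclr_count_table.biom; uncomment this
# conversion if that file does not exist yet.
# !biom convert -i data/mclr_count_table.tsv -o data/mclr_count_table.biom --to-hdf5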

In [None]:
!qiime tools import \
    --input-path data/mclr_count_table.biom \
    --type 'FeatureTable[Design]' \
    --input-format BIOMV210Format \
    --output-path data/mclr_count_table.qza

## Regression

In [None]:
!qiime classo add-covariates \
    --i-features data/mclr_count_table.qza \
    --m-covariates-file data/manifest_acm.tsv \
    --p-to-add ph \
    --o-new-features data/lc_xcovariates \
    --o-new-c data/lc_ccovariates \
    --o-new-w data/lc__wcovariates

In [None]:
!qiime sample-classifier split-table \
	--i-table data/mclr_count_table.qza \
	--m-metadata-file data/manifest_acm.tsv \
	--m-metadata-column average-soil-temperature  \
	--p-test-size 0.2 \
	--p-random-state 42 \
	--p-stratify False \
	--o-training-table data/regress-xtraining \
	--o-test-table data/regress-xtest

### Sparse log-contrast regression

In [None]:
!qiime classo regress  \
    --i-features data/regress-xtraining.qza \
    --m-y-file data/manifest_acm.tsv \
    --m-y-column average-soil-temperature  \
    --p-do-yshift False \
    --p-concomitant False \
    --p-huber False \
    --p-stabsel \
    --p-cv \
    --p-path \
    --p-lamfixed \
    --p-stabsel-threshold 0.7 \
    --p-cv-seed 1 \
    --p-no-cv-one-se \
    --o-result data/regresstaxa
# --m-y-column average-soil-temperature  \

In [None]:
### add covariates
# !qiime sample-classifier split-table \
# 	--i-table data/lc_xcovariates.qza \
# 	--m-metadata-file data/manifest_acm.tsv \
# 	--m-metadata-column average-soil-temperature  \
# 	--p-test-size 0.2 \
# 	--p-random-state 42 \
# 	--p-stratify False \
# 	--o-training-table data/regress-xtraining \
# 	--o-test-table data/regress-xtest

In [None]:
# !qiime classo regress  \
#     --i-features data/regress-xtraining.qza \
#     --i-c data/lc_ccovariates.qza \
#     --i-weights data/lc__wcovariates.qza \
#     --m-y-file data/manifest_acm.tsv \
#     --m-y-column average-soil-temperature  \
#     --p-concomitant \
#     --p-stabsel \
#     --p-cv \
#     --p-path \
#     --p-lamfixed \
#     --p-stabsel-threshold 0.5 \
#     --p-cv-seed 1 \
#     --p-no-cv-one-se \
#     --o-result data/regresstaxa

In [None]:
!qiime classo predict \
    --i-features data/regress-xtest.qza \
    --i-problem data/regresstaxa.qza \
    --o-predictions data/regress-predictions.qza

In [None]:
!qiime classo summarize \
  --i-problem data/regresstaxa.qza \
  --i-predictions data/regress-predictions.qza \
  --o-visualization data/regresstaxa.qzv

### trac

In [None]:
# Convert the TSV count table to BIOM (HDF5) for the trac workflow.
# NOTE(review): data/org_mclr_count_table.tsv is not written by any cell visible in this
# notebook (only data/mclr_count_table.tsv is) — confirm where this input comes from.
!biom convert -i data/org_mclr_count_table.tsv -o data/org_mclr_count_table.biom --to-hdf5

In [None]:
!qiime tools import \
    --input-path data/org_mclr_count_table.biom \
    --type 'FeatureTable[Design]' \
    --input-format BIOMV210Format \
    --output-path data/org_mclr_count_table.qza

In [None]:
!qiime tools import \
  --type FeatureData[Taxonomy] \
  --input-path data/edited_taxonomy.tsv \
  --output-path data/taxonomy.qza

In [None]:
!qiime metadata tabulate \
    --m-input-file data/taxonomy.qza \
    --o-visualization data/taxonomy.qzv

In [None]:
!qiime classo add-taxa \
	--i-features data/org_mclr_count_table.qza  \
	--i-taxa data/taxonomy.qza \
	--o-x data/xtaxa \
    --o-aweights data/wtaxa

In [None]:
!qiime classo add-covariates \
    --i-features data/xtaxa.qza \
    --i-weights data/wtaxa.qza \
    --m-covariates-file data/manifest_acm.tsv \
    --p-to-add ph \
    --o-new-features data/trac_xcovariates \
    --o-new-c data/trac_ccovariates \
    --o-new-w data/trac_wcovariates

In [None]:
!qiime sample-classifier split-table \
	--i-table data/trac_xcovariates.qza \
	--m-metadata-file data/manifest_acm.tsv \
	--m-metadata-column average-soil-temperature  \
	--p-test-size 0.2 \
	--p-random-state 42 \
	--p-stratify False \
	--o-training-table data/trac-xtraining \
	--o-test-table data/trac-xtest

In [None]:
!qiime classo regress  \
    --i-features data/trac-xtraining.qza \
    --i-c data/trac_ccovariates.qza \
    --i-weights data/trac_wcovariates.qza \
    --m-y-file data/manifest_acm.tsv \
    --m-y-column elevation  \
    --p-do-yshift False \
    --p-concomitant False \
    --p-huber False \
    --p-stabsel \
    --p-cv \
    --p-path \
    --p-lamfixed \
    --p-stabsel-threshold 0.7 \
    --p-cv-seed 1 \
    --p-no-cv-one-se \
    --o-result data/tractaxa

In [None]:
!qiime classo predict \
    --i-features data/trac-xtest.qza \
    --i-problem data/tractaxa.qza \
    --o-predictions data/trac-predictions.qza

In [None]:
!qiime classo summarize \
    --i-problem data/tractaxa.qza \
    --i-taxa data/taxonomy.qza \
    --i-predictions data/trac-predictions.qza \
    --o-visualization data/trac-taxa.qzv

## Classification

In [None]:
import pandas as pd

In [None]:
def accuracy(tp, fp, tn, fn):
    """Return the fraction of correct predictions: (tp + tn) over all four counts."""
    correct = tp + tn
    total = correct + fp + fn
    return correct / total

In [None]:
def recall(tp, fn):
    """Return tp / (tp + fn), the share of actual positives that were found."""
    actual_positives = tp + fn
    return tp / actual_positives

In [None]:
def precision(tp, fp):
    """Return tp / (tp + fp), the share of predicted positives that are correct."""
    predicted_positives = tp + fp
    return tp / predicted_positives

In [None]:
def f1_score(tp, fp, fn):
    """Return the F1 score computed directly from counts: 2*tp / (2*tp + fp + fn)."""
    doubled_tp = 2*tp
    return doubled_tp / (doubled_tp + fp + fn)

In [None]:
# Build the classification manifest: the regression manifest plus the 'vegetation' column
# of the metadata table, matched on 'sample-id' (inner join keeps only samples present in both).
cont_manifest = pd.read_csv("data/manifest_acm.tsv", sep='\t')
cat_manifest = pd.read_csv("data/acm_meta.tsv", sep='\t', index_col=0)

# NOTE(review): assumes the first column of acm_meta.tsv is named 'sample-id' so that the
# index of `cat_manifest` can serve as the merge key — confirm.
class_manifest = cont_manifest.merge(cat_manifest['vegetation'], on='sample-id', how='inner')
class_manifest.to_csv("data/manifest_acm_classification.tsv", sep='\t', index=False)

In [None]:
!qiime classo add-covariates \
    --i-features data/mclr_count_table.qza \
    --m-covariates-file data/manifest_acm_classification.tsv \
    --p-to-add ph \
    --o-new-features data/class_xcovariates \
    --o-new-c data/class_ccovariates \
    --o-new-w data/class_wcovariates

In [None]:
# Split the covariate-augmented feature table (output of add-covariates) so that the
# training features match the C matrix and weights passed to `qiime classo classify`.
!qiime sample-classifier split-table \
	--i-table data/class_xcovariates.qza \
	--m-metadata-file data/manifest_acm_classification.tsv \
	--m-metadata-column vegetation  \
	--p-test-size 0.2 \
	--p-random-state 42 \
	--p-stratify False \
	--o-training-table data/class-xtraining \
	--o-test-table data/class-xtest

In [None]:
!qiime classo classify  \
    --i-features data/class-xtraining.qza \
    --i-c data/class_ccovariates.qza \
    --i-weights data/class_wcovariates.qza \
    --m-y-file data/manifest_acm_classification.tsv \
    --m-y-column vegetation  \
    --p-huber False \
    --p-stabsel \
    --p-cv \
    --p-path \
    --p-lamfixed \
    --p-stabsel-threshold 0.7 \
    --p-cv-seed 1 \
    --p-no-cv-one-se \
    --o-result data/classtaxa

In [None]:
!qiime classo predict \
    --i-features data/class-xtest.qza \
    --i-problem data/classtaxa.qza \
    --o-predictions data/class-predictions.qza

In [None]:
!qiime classo summarize \
  --i-problem data/classtaxa.qza \
  --i-predictions data/class-predictions.qza \
  --o-visualization data/classtaxa.qzv

In [None]:
# Hand-entered counts used to compute the classification metrics below.
# NOTE(review): for `tp = pos - fp` and `tn = neg - fn` to hold, `pos` and `neg` must be the
# numbers of PREDICTED positives and negatives (not the true class sizes) — confirm against
# the classifier summary these values were read from.
pos = 4
neg = 6
fp = 0
fn = 1

tp = pos - fp
tn = neg - fn

In [None]:
# Report the four metrics computed from the confusion counts defined above.
print("Results for log-contrast model with all covariates:")
print(" Accuracy: {0}".format(accuracy(tp=tp, fp=fp, tn=tn, fn=fn)))
print(" Recall: {0}".format(recall(tp=tp, fn=fn)))
print(" Precision: {0}".format(precision(tp=tp, fp=fp)))
print(" F1-score: {0}".format(f1_score(tp=tp, fp=fp, fn=fn)))

### trac

In [None]:
!qiime classo add-covariates \
    --i-features data/xtaxa.qza \
    --i-weights data/wtaxa.qza \
    --m-covariates-file data/manifest_acm_classification.tsv \
    --p-to-add ph \
    --o-new-features data/class_trac_xcovariates \
    --o-new-c data/class_trac_ccovariates \
    --o-new-w data/class_trac_wcovariates

In [None]:
!qiime sample-classifier split-table \
	--i-table data/class_trac_xcovariates.qza \
	--m-metadata-file data/manifest_acm_classification.tsv \
	--m-metadata-column vegetation  \
	--p-test-size 0.2 \
	--p-random-state 42 \
	--p-stratify False \
	--o-training-table data/class_trac-xtraining \
	--o-test-table data/class_trac-xtest

In [None]:
!qiime classo classify  \
    --i-features data/class_trac-xtraining.qza \
    --i-c data/class_trac_ccovariates.qza \
    --i-weights data/class_trac_wcovariates.qza \
    --m-y-file data/manifest_acm_classification.tsv \
    --m-y-column vegetation  \
    --p-huber False \
    --p-stabsel \
    --p-cv \
    --p-path \
    --p-lamfixed \
    --p-stabsel-threshold 0.7 \
    --p-cv-seed 1 \
    --p-no-cv-one-se \
    --o-result data/class_trac_taxa

In [None]:
!qiime classo predict \
    --i-features data/class_trac-xtest.qza \
    --i-problem data/class_trac_taxa.qza \
    --o-predictions data/class_trac-predictions.qza

In [None]:
!qiime classo summarize \
    --i-problem data/class_trac_taxa.qza \
    --i-taxa data/taxonomy.qza \
    --i-predictions data/class_trac-predictions.qza \
    --o-visualization data/class_trac-taxa.qzv