In [1]:
import glob
import pandas as pd
import qiime2 as q2
import mofax as mfx

from biom import Table, load_table
from qiime2.plugins.diversity.actions import beta, pcoa
from gemelli.rpca import rpca, joint_rpca, rpca_table_processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from skbio.stats.composition import clr, closure
from skbio import DistanceMatrix, OrdinationResults
from scipy.spatial import distance

In [2]:
# Load and match the data the same way as in
# ../../simulations-benchmarking/3.0-ihmp-benchmarks.ipynb.
# Every table is kept with all of its own samples so that each one can also
# be explored non-jointly, based on the joint samples.
excluded_tags = ['_' + str(d_) for d_ in [11, 9, 7, 5, 3]]
tables = {}
for biom_path in glob.glob('../../data/simulations/ihmp/*.biom'):
    # skip any file whose path contains one of the numeric suffix tags
    if any(tag_ in biom_path for tag_ in excluded_tags):
        continue
    # key = file name up to its first '.'
    table_name = biom_path.split('/')[-1].split('.')[0]
    tables[table_name] = rpca_table_processing(load_table(biom_path),
                                               min_sample_count=0,
                                               min_feature_count=0,
                                               min_feature_frequency=0)

metadata = pd.read_csv('../../data/simulations/ihmp/sample-metadata.txt',
                       sep='\t', index_col=0)

# Use pathways instead of ECs (easier to interpret); the result is stored
# under the existing 'meta_t_ecs' key.
pathways_df = pd.read_csv('../../data/simulations/ihmp/additional-data/pathabundances_3.tsv',
                          sep='\t', index_col=0)
# drop the UNINTEGRATED / UNMAPPED rows
keep_row = [not (('UNINTEGRATED' in x) or ('UNMAPPED' in x))
            for x in pathways_df.index]
pathways_df = pathways_df[keep_row]
# strip the suffix from the column labels
pathways_df.columns = [c.replace('_pathabundance_cpm','')
                       for c in pathways_df.columns]
# remove rows and columns that sum to zero (both masks come from the same frame)
nonzero_rows = pathways_df.sum(1) > 0
nonzero_cols = pathways_df.sum(0) > 0
pathways_df = pathways_df.loc[nonzero_rows, nonzero_cols]
tbl_ = Table(pathways_df.values, pathways_df.index, pathways_df.columns)
tables['meta_t_ecs'] = tbl_.copy()

In [4]:
# Make a [biom table, DataFrame, metadata] triple for each dataset, keeping
# every sample that the table shares with the metadata (i.e. not only the
# samples shared across all omics tables).
tables_metdata_unshared = {}
for omics_, table_ in tables.items():
    print(omics_)
    metadata_omic = metadata.copy()
    table_omic = table_.copy()
    # Sort the shared IDs: reindexing with a raw set gives the metadata an
    # arbitrary row order that can change from one kernel session to the next.
    shared_samps = sorted(set(table_omic.ids()) & set(metadata.index))
    table_omic = table_omic.filter(shared_samps)
    table_omic = rpca_table_processing(table_omic, min_sample_count=0,
                                       min_feature_count=0,
                                       min_feature_frequency=0)
    metadata_omic = metadata_omic.reindex(shared_samps)
    # observations as rows, samples as columns
    table_omic_df = pd.DataFrame(table_omic.matrix_data.toarray(),
                                 table_omic.ids('observation'),
                                 table_omic.ids())
    # these omics output in % abundance so have to use relative counts.
    # possibly try to re-scale back to counts?
    # NOTE(review): the shared-sample cell of this notebook re-closes
    # 'meta_g_taxonomic_profiles' rather than
    # 'shared_meta_g_taxonomic_profiles' -- confirm which key is intended.
    if omics_ in ['shared_meta_g_taxonomic_profiles','meta_t_ecs']:
        # DataFrame.apply works column-wise here, so each sample is closed
        table_omic_df = table_omic_df.apply(closure)
    tables_metdata_unshared[omics_] = [table_omic, table_omic_df, metadata_omic]
tables_metdata_unshared.keys()


virome_virmap_analysis
meta_t_ecs
meta_g_taxonomic_profiles
shared_meta_g_taxonomic_profiles
HMP2_proteomics_ecs
HMP2_metabolomics


dict_keys(['virome_virmap_analysis', 'meta_t_ecs', 'meta_g_taxonomic_profiles', 'shared_meta_g_taxonomic_profiles', 'HMP2_proteomics_ecs', 'HMP2_metabolomics'])

In [5]:
"""
#only needs to be run once
shared_samps = set.intersection(*[set(t_.ids()) for t_ in tables.values()]) & set(metadata.index)
metadata_shared = metadata.reindex(shared_samps)
metadata_shared['diagnosis_binned'] = [x.replace('UC','IBD').replace('CD','IBD')
                                       for x in metadata_shared.diagnosis]
# re-close metaG/T data
metadata_shared = metadata_shared.reindex(shared_samps)
tables_shared = {t_k:t_.copy().filter(shared_samps) for t_k, t_ in tables.items()}
for t_ in ['shared_meta_g_taxonomic_profiles','meta_t_ecs']:
    tbl_tmp = tables_shared[t_].to_dataframe().copy()
    tbl_tmp = tbl_tmp.apply(closure)
    tables_shared[t_] = Table(tbl_tmp.values, tbl_tmp.index, tbl_tmp.columns)

train_, test_ = train_test_split(metadata_shared, shuffle=True,
                                 stratify=metadata_shared['diagnosis'],
                                 test_size=0.25)
metadata_shared['train_test'] = 'train'
metadata_shared.loc[test_.index, 'train_test'] = 'test'
metadata_shared.to_csv('../../data/simulations/ihmp/sample-metadata-plus-train-tests-case-study.csv')
"""
# The quoted block above is inert (a string literal). It writes the CSV path
# that is read at the end of this cell. Its train_test_split call passes no
# random_state, so executing it again would write a different split.
# NOTE(review): the quoted block re-closes 'shared_meta_g_taxonomic_profiles'
# while the live code below re-closes 'meta_g_taxonomic_profiles' -- confirm
# which key is intended.

# samples present in every omics table and in the metadata
shared_samps = set.intersection(*[set(t_.ids()) for t_ in tables.values()]) & set(metadata.index)
tables_shared = {t_k:t_.copy().filter(shared_samps) for t_k, t_ in tables.items()}
# re-close metaG/T data (apply works column-wise, i.e. per sample)
for t_ in ['meta_g_taxonomic_profiles','meta_t_ecs']:
    tbl_tmp = pd.DataFrame(tables_shared[t_].matrix_data.toarray(),
                           tables_shared[t_].ids('observation'),
                           tables_shared[t_].ids())
    tbl_tmp = tbl_tmp.apply(closure)
    tables_shared[t_] = Table(tbl_tmp.values, tbl_tmp.index, tbl_tmp.columns)
# The shared metadata always comes from the stored file. The previous
# in-memory construction (reindex + 'diagnosis_binned') was overwritten by
# this read before being used, so it has been dropped.
metadata_shared = pd.read_csv('../../data/simulations/ihmp/sample-metadata-plus-train-tests-case-study.csv', index_col=0)

In [11]:
# NOTE: this mapping is defined but not consulted below; clr is applied to
# every table in tables_shared directly.
transformations = {'HMP2_metabolomics':clr,
                   'HMP2_proteomics_ecs':clr,
                   'shared_meta_g_taxonomic_profiles':clr,
                   'meta_t_ecs':clr,
                   'virome_virmap_analysis':clr}

# Build one long-format (sample, feature, value, view) table per train/test
# subset, holding the CLR-transformed values of every shared omics table.
tt_col = 'train_test'
train_ = metadata_shared[metadata_shared[tt_col] == 'train'].index
test_ = metadata_shared[metadata_shared[tt_col] == 'test'].index
for use_sub_, lbl_out_ in zip([train_, test_], ['train','test']):
    stacked_clr_tables = {}
    for t_, tbl_ in tables_shared.items():
        # observations (features) as rows, samples of this subset as columns
        df_ = pd.DataFrame(tbl_.matrix_data.toarray(),
                           tbl_.ids('observation'),
                           tbl_.ids()).loc[:, use_sub_]
        # pseudocount of 1 so that the log is defined for zero entries
        data_ = df_.values + 1
        # skbio's clr treats every ROW as one composition, whereas df_ is
        # features x samples. Transpose so each sample is centred over its
        # features, then transpose back to the features x samples layout.
        data_ = clr(data_.T).T
        stacked_clr_tables[t_] = pd.DataFrame(data_, df_.index, df_.columns).stack().reset_index()
    stacked_clr_tables = pd.concat(stacked_clr_tables)
    stacked_clr_tables.columns = ["feature","sample","value"]
    # the outer concat key (table name) becomes the 'view' column
    stacked_clr_tables = stacked_clr_tables.reset_index().drop(['level_1'], axis=1).rename({'level_0':'view'}, axis=1)
    stacked_clr_tables = stacked_clr_tables[["sample","feature","value","view"]]
    stacked_clr_tables['feature'] = stacked_clr_tables['view'] + '_' + stacked_clr_tables['feature']
    # need to encode the feature names because MOFA does not like them.
    # Enumerate the names in sorted order so that the integer codes are
    # reproducible; enumerating a raw set gives arbitrary codes that need not
    # agree between the train and test passes or between kernel sessions.
    feat_map = {v:i for i, v in enumerate(sorted(set(stacked_clr_tables.feature)))}
    stacked_clr_tables['feature'] = stacked_clr_tables['feature'].map(feat_map)
    # the export is currently disabled, so only the last subset's table
    # remains in memory after the loop
    #stacked_clr_tables.to_csv('../../data/case-studies/ihmp/mofa-tables/subset-%s.tsv.gz' % (lbl_out_), sep='\t', compression='gzip')
    

In [None]:
# RCLR INSTEAD OF CLR -- no train/test split
# for testing MOFA+ with RCLR-transformed data
# Everything between the triple quotes below is a string literal, so none of
# it is executed when this cell runs.
# Inside the quoted code, `transformations` is assigned but the loop calls
# rclr_transformation directly on each table.
# The quoted to_csv call writes under '../../results/case-studies/...',
# whereas the ordination cells of this notebook write under
# '../../data/case-studies/...'.
'''
from gemelli.preprocessing import rclr_transformation

#scaler = StandardScaler(with_mean=False) # scale unit variance
transformations = {'HMP2_metabolomics':rclr_transformation,
                   'HMP2_proteomics_ecs':rclr_transformation,
                   'meta_g_taxonomic_profiles':rclr_transformation,
                   'meta_t_ecs':rclr_transformation,
                   'virome_virmap_analysis':rclr_transformation}

stacked_rclr_tables = {}

for t_, tbl_ in tables_shared.items():
    tbl_rclr = rclr_transformation(tbl_)
    df_ = pd.DataFrame(tbl_rclr.matrix_data.toarray(),
                        tbl_rclr.ids('observation'), 
                        tbl_rclr.ids())
    stacked_rclr_tables[t_] = df_.stack(dropna=False).reset_index()

stacked_rclr_tables = pd.concat(stacked_rclr_tables)
stacked_rclr_tables.columns = ["feature","sample","value"]
stacked_rclr_tables = stacked_rclr_tables.reset_index().drop(['level_1'], axis=1).rename({'level_0':'view'}, axis=1)
stacked_rclr_tables = stacked_rclr_tables[["sample","feature","value","view"]]
stacked_rclr_tables['feature'] = stacked_rclr_tables['view'] + '_' + stacked_rclr_tables['feature']

# need to encode the feature names because MOFA does not like them
feat_map = {v:i for i, v in enumerate(set(stacked_rclr_tables.feature))}
stacked_rclr_tables['feature'] = [feat_map[v] for v in stacked_rclr_tables['feature']]
stacked_rclr_tables.to_csv('../../results/case-studies/ihmp/mofa-tables/rclr-all-data.tsv.gz', sep='\t', compression='gzip')
#print(stacked_rclr_tables.shape)
'''

# The MOFA runs themselves happen outside this notebook, in the R scripts
# named below.
### run 1.01-ihmp-mofa-clr.R to get mofa results on full data
### run 1.02-ihmp-mofa-rclr.R to test mofa with rclr-transformed data

### All others

In [6]:
# For every shared table, write an ordination and a distance matrix for
# Bray-Curtis, Aitchison and RPCA, computed on all shared samples.
for tbl_id, tbl_ in tables_shared.items():

    # The QIIME 2 artifact does not depend on the metric, so import it once
    # per table instead of once per metric.
    tbl_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', tbl_.copy())
    for metric in ['braycurtis','aitchison']:
        # distance matrix for this metric, then PCoA on those distances
        q2dist_tmp = beta(tbl_q2, metric=metric).distance_matrix
        dist_tmp = q2dist_tmp.view(DistanceMatrix)
        ord_tmp = pcoa(q2dist_tmp).pcoa.view(OrdinationResults)
        ord_tmp.write('../../data/case-studies/ihmp/single-omics/%s-%s-ord.txt' % (tbl_id, metric))
        dist_tmp.write('../../data/case-studies/ihmp/single-omics/%s-%s-dist.txt' % (tbl_id, metric))

    metric = 'RPCA'
    # Hand rpca a copy, as the Joint-RPCA call in this notebook does, so that
    # the entry in tables_shared is left untouched should rpca filter its
    # input in place.
    ord_tmp, dist_tmp = rpca(tbl_.copy())
    ord_tmp.write('../../data/case-studies/ihmp/single-omics/%s-%s-ord.txt' % (tbl_id, metric))
    dist_tmp.write('../../data/case-studies/ihmp/single-omics/%s-%s-dist.txt' % (tbl_id, metric))


In [None]:
# Joint-RPCA over all shared tables at once, on the full data (no splits).
# (note: multiple folds did not result in much difference in ML
# classification in the benchmarks)
joint_inputs = [shared_tbl.copy() for shared_tbl in tables_shared.values()]
joint_options = dict(min_feature_frequency=0,
                     min_sample_count=0,
                     min_feature_count=0,
                     n_test_samples=0,
                     max_iterations=3)
ord_, dist_, cv_plt = joint_rpca(joint_inputs, **joint_options)

# persist the three outputs next to each other
ord_.write('../../data/case-studies/ihmp/joint-rpca/ord.txt')
dist_.write('../../data/case-studies/ihmp/joint-rpca/dist.txt')
cv_plt.to_csv('../../data/case-studies/ihmp/joint-rpca/cv.txt')