In [3]:
import os
import glob
import numpy as np
import pandas as pd
import qiime2 as q2

from biom import Table
from skbio import DistanceMatrix, OrdinationResults
from qiime2.plugins.diversity.actions import beta, pcoa
from skbio.stats.composition import closure
from biom import load_table
from gemelli.rpca import rpca, joint_rpca, rpca_table_processing
from sklearn.model_selection import train_test_split
np.seterr(all="ignore")

In [4]:
tables = {omics_.split('/')[-1].split('.')[0]:rpca_table_processing(load_table(omics_),
                                                                    min_sample_count=0,
                                                                    min_feature_count=0,
                                                                    min_feature_frequency=0)
          for omics_ in glob.glob('../../data/case-studies/uc-severity-multiomics/Cohort_two/*.biom')}

metadata = pd.read_csv('../../data/case-studies/uc-severity-multiomics/Cohort_two/metadata_cohort_two_revised.txt', sep='\t', index_col=0)
tables

{'metaproteomics_cohort_two_matched': 108080 x 174 <class 'biom.table.Table'> with 5091405 nonzero entries (27% dense),
 'metagenomics_cohort_two_matched': 3499 x 174 <class 'biom.table.Table'> with 163422 nonzero entries (26% dense),
 'metabolomics_cohort_two_matched': 1928 x 174 <class 'biom.table.Table'> with 58524 nonzero entries (17% dense)}

In [5]:
# make table/metadata pair for each dataset of all samples in the data
tables_metdata_unshared = {}
for omics_, table_ in tables.items():
    print(omics_)
    metadata_omic = metadata.copy()
    table_omic = table_.copy()
    shared_samps = set(table_omic.ids()) & set(metadata.index)
    table_omic = table_omic.filter(shared_samps)
    table_omic = rpca_table_processing(table_omic, min_sample_count=0,
                                       min_feature_count=0,
                                       min_feature_frequency=0)
    metadata_omic = metadata_omic.reindex(shared_samps)
    table_omic_df = pd.DataFrame(table_omic.matrix_data.toarray(),
                                 table_omic.ids('observation'),
                                 table_omic.ids())
    # these omics output in % abundance so have to use relative counts.
    tables_metdata_unshared[omics_] = [table_omic, table_omic_df, metadata_omic]
    
tables_metdata_unshared.keys()


metaproteomics_cohort_two_matched
metagenomics_cohort_two_matched
metabolomics_cohort_two_matched


dict_keys(['metaproteomics_cohort_two_matched', 'metagenomics_cohort_two_matched', 'metabolomics_cohort_two_matched'])

In [6]:
shared_samps = list(set.intersection(*[set(t_.ids()) for t_ in tables.values()]) & set(metadata.index))
metadata_shared = metadata.reindex(shared_samps)
metadata_shared['Diagnosis'] = [x.replace('UC','IBD').replace('CD','IBD')
                                       for x in metadata_shared.Diagnosis]
# re-close metaG/T data
metadata_shared = metadata_shared.reindex(shared_samps)
tables_shared = {t_k:t_.copy().filter(shared_samps) for t_k, t_ in tables.items()}

"""
for t_ in ['meta_g_taxonomic_profiles','meta_t_ecs']:
    tbl_tmp = tables_shared[t_].to_dataframe().copy()
    tbl_tmp = tbl_tmp.apply(closure)
    tables_shared[t_] = Table(tbl_tmp.values, tbl_tmp.index, tbl_tmp.columns)
train_, test_ = train_test_split(metadata_shared, shuffle=True,
                                 stratify=metadata_shared['Diagnosis'],
                                 test_size=0.25)
metadata_shared['train_test'] = 'train'
metadata_shared.loc[test_.index, 'train_test'] = 'test'
metadata_shared.to_csv('../../data/case-studies/uc-severity-multiomics/Cohort_two/sample-metadata-plus-train-tests-case-study.csv')
"""

metadata_shared = pd.read_csv('../../data/case-studies/uc-severity-multiomics/Cohort_two/sample-metadata-plus-train-tests-case-study.csv', index_col=0)
metadata_shared.Diagnosis.value_counts()


Diagnosis
IBD                157
Healthy_control     16
Name: count, dtype: int64

In [7]:
from skbio.stats.composition import clr
from sklearn.preprocessing import StandardScaler

rerun = False
if rerun:
    scaler = StandardScaler(with_mean=False) # scale unit variance
    transformations = {'metaproteomics_cohort_two_matched':clr,
                       'metagenomics_cohort_two_matched':clr,
                       'metabolomics_cohort_two_matched':clr}

    tt_col = 'train_test'
    train_ = metadata_shared[metadata_shared[tt_col] == 'train'].index
    test_ = metadata_shared[metadata_shared[tt_col] == 'test'].index
    for use_sub_, lbl_out_ in zip([train_, test_], ['train','test']):
        stacked_clr_tables = {}
        for t_, tbl_ in tables_shared.items():
            df_ = pd.DataFrame(tbl_.matrix_data.toarray(),
                               tbl_.ids('observation'), 
                               tbl_.ids()).loc[:, use_sub_]
            data_ = df_.values
            data_ = data_ + 1
            data_ = clr(data_)
            #data_ = scaler.fit_transform(clr(data_))
            stacked_clr_tables[t_] = pd.DataFrame(data_, df_.index, df_.columns).stack().reset_index()
        stacked_clr_tables = pd.concat(stacked_clr_tables)
        stacked_clr_tables.columns = ["feature","sample","value"]
        stacked_clr_tables = stacked_clr_tables.reset_index().drop(['level_1'], axis=1).rename({'level_0':'view'}, axis=1)
        stacked_clr_tables = stacked_clr_tables[["sample","feature","value","view"]]
        stacked_clr_tables['feature'] = stacked_clr_tables['view'] + '_' + stacked_clr_tables['feature']
        # need to encode the feature names because MOFA does not like them
        feat_map = {v:i for i, v in enumerate(set(stacked_clr_tables.feature))}
        stacked_clr_tables['feature'] = [feat_map[v] for v in stacked_clr_tables['feature']]

        #IMPORTANT: commenting-out due to space constraints
        #stacked_clr_tables.to_csv('../../data/case-studies/uc-severity-multiomics/Cohort_two/mofa_tables/subset-%s.tsv.gz' % (lbl_out_), sep='\t', compression='gzip')


# RPCAs

In [8]:
if rerun:
    for tbl_id, tbl_ in tables_shared.items():

        for metric in ['braycurtis','aitchison']:
            # produce train/test PCoA results on distances
            tbl_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', tbl_.copy())
            q2dist_tmp = beta(tbl_q2, metric=metric).distance_matrix
            dist_tmp = q2dist_tmp.view(DistanceMatrix)
            ord_tmp = pcoa(q2dist_tmp).pcoa.view(OrdinationResults)
            #IMPORTANT: commenting-out due to space constraints
            #ord_tmp.write('../../data/case-studies/uc-severity-multiomics/Cohort_two/single-omics/%s-%s-ord.txt' % (tbl_id, metric))
            #dist_tmp.write('../../data/case-studies/uc-severity-multiomics/Cohort_two/single-omics/%s-%s-dist.txt' % (tbl_id, metric))

        metric = 'RPCA'
        ord_tmp, dist_tmp = rpca(tbl_)
        #ord_tmp.write('../../data/case-studies/uc-severity-multiomics/Cohort_two/single-omics/%s-%s-ord.txt' % (tbl_id, metric))
        #dist_tmp.write('../../data/case-studies/uc-severity-multiomics/Cohort_two/single-omics/%s-%s-dist.txt' % (tbl_id, metric))


In [9]:
# re-run with just the first train-test
# (note: multiple folds did not results in much difference in ML classfiication in benchmarks)
ord_, dist_, cv_plt = joint_rpca([t.copy() for t in tables_shared.values()],
                                 sample_metadata=metadata_shared,
                                 train_test_column='train_test',
                                 min_feature_frequency=10,
                                 min_sample_count=0,
                                 min_feature_count=0,
                                 max_iterations=3)
ord_.write('../../data/case-studies/uc-severity-multiomics/Cohort_two/case-study-joint-rpca-ord.txt')
#dist_.write('../../data/case-studies/uc-severity-multiomics/Cohort_two/case-study-joint-rpca-dist.txt')
#cv_plt.to_csv('../../data/case-studies/uc-severity-multiomics/Cohort_two/case-study-joint-rpca-cv.txt')

