In [1]:
import os
import glob
import numpy as np
import pandas as pd
import mofax as mfx

from biom import load_table, Table
from gemelli.rpca import rpca_table_processing
from tqdm.notebook import tqdm

from skbio import DistanceMatrix, OrdinationResults
from skbio.stats.composition import closure, clr
from skbio.stats.distance import permanova

from scipy.spatial import distance
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (average_precision_score, auc, roc_auc_score,
                             balanced_accuracy_score, f1_score)
from sklearn import metrics 


## Import all metric results from file 

In [2]:
max_iterations = 5
metadata = pd.read_csv('../../data/simulations/ihmp/sample-metadata-plus-train-tests-last.csv', index_col=0)
display(metadata.head(5))

Unnamed: 0_level_0,site_sub_coll,data_type,week_num,date_of_receipt,interval_days,visit_num,Research Project,PDO Number,GSSR IDs,Product,...,train_test_1,train_test_2,train_test_3,train_test_4,train_test_5,train_test_6,train_test_7,train_test_8,train_test_9,train_test_10
External ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CSM67UA2,C3001C15,metabolomics,28.0,2014-09-30,13.0,20,ibdmdb,,,,...,train,train,train,train,train,train,train,train,train,train
CSM67UBN,C3002C17,metabolomics,32.0,2014-12-09,14.0,22,ibdmdb,,,,...,train,train,test,train,train,train,train,train,train,test
CSM67UBZ,C3003C14,metabolomics,26.0,2014-11-06,15.0,19,ibdmdb,,,,...,train,train,train,test,test,train,train,test,test,train
CSM79HLA,C3004C19,metabolomics,36.0,2015-01-20,14.0,25,ibdmdb,,,,...,train,test,train,train,test,train,train,train,train,train
CSM67UEI,C3005C19,metabolomics,36.0,2015-01-29,16.0,25,ibdmdb,,,,...,train,train,train,train,train,test,train,train,train,test


In [5]:
ord_res_mofa = {}

for k in tqdm(glob.glob('../../results/simulations/ihmp/mofa/*factors.model.*')):
    # save ordination
    df_ = pd.read_csv(k, index_col=0)
    df_ = pd.pivot_table(columns='factor', values='value', index='sample', data=df_)
    # add to metadata for ML
    df_ = df_[['Factor1','Factor2','Factor3']]
    df_.columns = ['PC1','PC2','PC3']
    # save to dict
    f = int(k.split('/')[-1].split('.')[0])
    if k.split('/')[-1].split('.')[3] == '':
        k2 = 'meta_g_taxonomic_profiles'
    else:
        k2 = 'meta_g_taxonomic_profiles_' + k.split('/')[-1].split('.')[3]
    if k2 == 'meta_g_taxonomic_profiles':
        model = mfx.mofa_model("../../results/simulations/ihmp/mofa/%i.model..hdf5TRUE" % (f))
    else:
        model = mfx.mofa_model("../../results/simulations/ihmp/mofa/%i.model.%s.hdf5TRUE" % (f, k2.split('_')[-1]))
    # project new data
    test_data_project = pd.read_csv("../../data/simulations/ihmp/mofa-reformatted/fold-%i-subset-test-%s-test.tsv.gz" % (f, k2), sep='\t', compression='gzip')
    # MOFAx projects on a single view, they vary greatly so we project on each one to be fair.
    for view_use in model.views:  
        test_data_project_view = test_data_project[test_data_project.view == view_use]
        test_data_project_view_X = pd.pivot_table(columns='sample',
                                                  index='feature',
                                                  values='value', data=test_data_project_view)
        # ensure projection data is ordered the same as the input data
        test_data_project_view_X = test_data_project_view_X.loc[[int(x) for x in model.features[view_use]], :]
        # project
        new_values = np.stack([model.project_data(test_data_project_view_X.values.T, view=view_use, factors=i) for i in range(3)]).T
        projected_factors = pd.DataFrame(new_values, test_data_project_view_X.columns, [['PC1','PC2','PC3']])
        projected_factors.columns = ['PC1','PC2','PC3']
        df_projected_ = pd.concat([df_, projected_factors], axis=0)
        #df_projected_.to_csv("../../results/simulations/ihmp/mofa/fold-%i-%s-projected-results-all.csv" % (f, view_use))
        # get distance from ordination
        Udist = distance.cdist(df_projected_, df_projected_)
        U_dist_res = DistanceMatrix(Udist, ids=df_projected_.index)
        # add metdata and save
        df_projected_ = pd.concat([df_projected_, metadata], axis=1)
        ord_res_mofa[(f, k2, view_use)] = df_projected_
    # close model now that we are done
    model.close()


  0%|          | 0/60 [00:00<?, ?it/s]

In [6]:
ord_res_mixomics = {}
dist_res_mixomics = {}

for k in glob.glob('../../results/simulations/ihmp/mixomics/*factors.model.*'):
    # save ordination
    df_ = pd.read_csv(k, index_col=0)
    k = k.replace('_','')
    if '..' in k.split('/')[-1]:
        k2 = 'meta_g_taxonomic_profiles'
    else:
        k2 = 'meta_g_taxonomic_profiles_' + k.split('/')[-1].split('.')[3]
    relabel = {'metabolomics':'HMP2_metabolomics',
              'metatranscriptomics':'meta_t_ecs',
              'viromes':'virome_virmap_analysis',
              'metagenomics':k2}
    f = int(k.split('/')[-1].split('.')[0])
    for view_use, view_df in df_.groupby('.id'):
        view_df = view_df.set_index('subject')[['comp1','comp2','comp3']]
        view_df.columns = ['PC1','PC2','PC3']
        # get distance from ordination
        Udist = distance.cdist(view_df, view_df)
        U_dist_res = DistanceMatrix(Udist, ids=view_df.index)
        view_df = pd.concat([view_df, metadata], axis=1)
        ord_res_mixomics[(f, relabel[view_use])] = view_df
        dist_res_mixomics[(f, relabel[view_use])] = U_dist_res


In [7]:
ord_res_icluster = {}
dist_res_icluster = {}

for k in glob.glob('../../results/simulations/ihmp/icluster/*factors.model.*'):
    # save ordination
    view_df = pd.read_csv(k, index_col=0)
    k = k.replace('_','')
    if '..' in k.split('/')[-1]:
        k2 = 'meta_g_taxonomic_profiles'
    else:
        k2 = 'meta_g_taxonomic_profiles_' + k.split('/')[-1].split('.')[3]
    f = int(k.split('/')[-1].split('.')[0])
    view_df.columns = ['PC1','PC2','PC3']
    # get distance from ordination
    Udist = distance.cdist(view_df, view_df)
    U_dist_res = DistanceMatrix(Udist, ids=view_df.index)
    view_df = pd.concat([view_df, metadata], axis=1)
    ord_res_icluster[(f, k2)] = view_df
    dist_res_icluster[(f, k2)] = U_dist_res


In [10]:
ord_res_intNMF = {}
dist_res_intNMF = {}

for k in glob.glob('../../results/simulations/ihmp/intNMF/*factors.model.*'):
    # save ordination
    view_df = pd.read_csv(k, index_col=0)
    k = k.replace('_','')
    if '..' in k.split('/')[-1]:
        k2 = 'meta_g_taxonomic_profiles'
    else:
        k2 = 'meta_g_taxonomic_profiles_' + k.split('/')[-1].split('.')[3]
    f = int(k.split('/')[-1].split('.')[0])
    view_df.columns = ['PC1','PC2','PC3']
    # get distance from ordination
    Udist = distance.cdist(view_df, view_df)
    U_dist_res = DistanceMatrix(Udist, ids=view_df.index)
    view_df = pd.concat([view_df, metadata], axis=1)
    ord_res_intNMF[(f, k2)] = view_df
    dist_res_intNMF[(f, k2)] = U_dist_res

In [11]:
max_iterations = 5

ord_all_res_joint = {}
dist_all_res_joint = {}
for k in tqdm(glob.glob('../../results/simulations/ihmp/joint-rpca-results/*')):
    #print('dist-test-%iiter.txt' % max_iterations)
    if 'dist-test-%iiter-last.txt' % max_iterations in k:
        dist_all_res_joint[(int(k.split('/')[-1].split('-')[0]),
                            k.split('/')[-1].split('-')[1])] = DistanceMatrix.read(k)
    if 'ord-test-%iiter-last.txt' % max_iterations in k:
        df_ = OrdinationResults.read(k).samples
        df_.columns = ['PC1','PC2','PC3']
        df_ = pd.concat([df_, metadata], axis=1)
        ord_all_res_joint[(int(k.split('/')[-1].split('-')[0]),
                           k.split('/')[-1].split('-')[1])] = df_
            

  0%|          | 0/140 [00:00<?, ?it/s]

In [12]:
ord_res = {}
dist_res = {}
for k in glob.glob('../../results/simulations/ihmp/rpca/*'):
    if 'dist' in k:
        dist_res[(int(k.split('/')[-1].split('-')[0]),
                           k.split('/')[-1].split('-')[1])] = DistanceMatrix.read(k)
    if 'ord' in k:
        ord_res[(int(k.split('/')[-1].split('-')[0]),
                           k.split('/')[-1].split('-')[1])] = pd.read_csv(k, index_col=0)

        

In [13]:
ord_res_others = {}
dist_res_others = {}
for k in glob.glob('../../results/simulations/ihmp/all-others/*'):
    if 'dist' in k:
        dist_res_others[(int(k.split('/')[-1].split('-')[0]),
                           k.split('/')[-1].split('-')[1],
                           k.split('/')[-1].split('-')[2])] = DistanceMatrix.read(k)
    if 'ord' in k:
        ord_res_others[(int(k.split('/')[-1].split('-')[0]),
                           k.split('/')[-1].split('-')[1],
                           k.split('/')[-1].split('-')[2])] = pd.read_csv(k, index_col=0)



## Run ML on ordinations

In [32]:
# merge all ords. (for MOFA+ we chose the HMP2_metabolomics since it had the best seperation)
ord_res_all = {**{(k[0], k[1], 'Joint-RPCA'):v for k, v in ord_all_res_joint.items()},
               **{(k[0], k[1], 'iCluster'):v for k, v in ord_res_icluster.items()},
               **{(k[0], k[1], 'intNMF'):v for k, v in ord_res_intNMF.items()},
               **{(int(k[0]), k[1], 'mixOmics'):v for k, v in ord_res_mixomics.items()},
               **{(int(k[0]), k[1], k[2], 'MOFA'):v for k, v in ord_res_mofa.items()},
               **{(k[0], k[1], 'RPCA'):v for k, v in ord_res.items()},
              }
accuracy_score = {}
f1_score = {}
permanova_res = {}
apr_scores = {}
auc_roc_scores = {}

cols_learn = ['PC1','PC2','PC3']
covert_map = {'IBD': 1, 'nonIBD': 0}
classifier = RandomForestClassifier(n_estimators=500, 
                                    random_state=1010)

for fold in tqdm(range(10)):
    fold += 1 # (saved in the dataframe as 1-10)
    #
    # for all sparsity levels
    for dt_name in ['meta_g_taxonomic_profiles_11',
                    'meta_g_taxonomic_profiles_9',
                    'meta_g_taxonomic_profiles_7',
                    'meta_g_taxonomic_profiles_5',
                    'meta_g_taxonomic_profiles_3',
                    'meta_g_taxonomic_profiles']:
        
        # all metrics
        for metric_ in ['iCluster','intNMF',
                        'Joint-RPCA','MOFA','mixOmics']:
            # make a list to append for the dicts
            if (dt_name, metric_) not in accuracy_score.keys():
                accuracy_score[(dt_name, metric_)] = []
                f1_score[(dt_name, metric_)] = []
                permanova_res[(dt_name, metric_)] = []
                apr_scores[(dt_name, metric_)] = []
                auc_roc_scores[(dt_name, metric_)] = []
            # MOFA projects on multiple views so we will test them all
            if metric_ == 'MOFA':
                for other_tbl_ in  [dt_name, 'virome_virmap_analysis','meta_t_ecs','HMP2_metabolomics']:
                    # get the ordination data
                    tbl = ord_res_all[(fold, dt_name, other_tbl_, metric_)].copy()   
                    tbl = tbl[cols_learn].dropna(subset=cols_learn)
                    # get labels
                    metadata_train = metadata[metadata['train_test_%i' % (fold)] == 'train']
                    metadata_test = metadata[metadata['train_test_%i' % (fold)] == 'test']
                    metadata_train = metadata_train.loc[list(set(tbl.index) & set(metadata_train.index)), :]
                    metadata_test = metadata_test.loc[list(set(tbl.index) & set(metadata_test.index)), :]
                    y_train = list(metadata_train['diagnosis'].values)
                    y_train = [covert_map[i] for i in y_train]
                    y_test = list(metadata_test['diagnosis'].values)
                    y_test = [covert_map[i] for i in y_test]
                    X_train = tbl.loc[metadata_train.index, :].values
                    X_test = tbl.loc[metadata_test.index, :].values
                    # permanova on test data (projection)
                    dist_tmp = DistanceMatrix(distance.cdist(tbl, tbl), tbl.index)
                    permanova_res[(dt_name, metric_)].append(permanova(dist_tmp, metadata.loc[dist_tmp.ids, ['diagnosis']].iloc[:, 0], permutations=1)['test statistic'])
                    # ML
                    scaler = StandardScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)
                    classifier.fit(X_train, y_train)
                    y_score = classifier.predict_proba(X_test)[:, 1]
                    y_pred = classifier.predict(X_test)
                    accuracy_score[(dt_name, metric_)].append(balanced_accuracy_score(y_test, y_pred))
                    f1_score[(dt_name, metric_)].append(metrics.f1_score(y_test, y_pred, average='macro'))
                    apr_scores[(dt_name, metric_)].append(average_precision_score(y_test, y_score))
                    auc_roc_scores[(dt_name, metric_)].append(roc_auc_score(y_test, y_score))

            # all others project on a single view
            else:
                # get the ordination data
                tbl = ord_res_all[(fold, dt_name, metric_)].copy()   
                tbl = tbl[cols_learn].dropna(subset=cols_learn)
                # get labels
                metadata_train = metadata[metadata['train_test_%i' % (fold)] == 'train']
                metadata_test = metadata[metadata['train_test_%i' % (fold)] == 'test']
                metadata_train = metadata_train.loc[list(set(tbl.index) & set(metadata_train.index)), :]
                metadata_test = metadata_test.loc[list(set(tbl.index) & set(metadata_test.index)), :]
                y_train = list(metadata_train['diagnosis'].values)
                y_train = [covert_map[i] for i in y_train]
                y_test = list(metadata_test['diagnosis'].values)
                y_test = [covert_map[i] for i in y_test]
                X_train = tbl.loc[metadata_train.index, :].values
                X_test = tbl.loc[metadata_test.index, :].values
                # permanova on test data (projection)
                dist_tmp = DistanceMatrix(distance.cdist(tbl, tbl), tbl.index)
                permanova_res[(dt_name, metric_)].append(permanova(dist_tmp, metadata.loc[dist_tmp.ids, ['diagnosis']].iloc[:, 0], 
                                                                    permutations=1)['test statistic'])
                # ML
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)
                classifier.fit(X_train, y_train)
                y_score = classifier.predict_proba(X_test)[:, 1]
                y_pred = classifier.predict(X_test)
                accuracy_score[(dt_name, metric_)].append(balanced_accuracy_score(y_test, y_pred))
                f1_score[(dt_name, metric_)].append(metrics.f1_score(y_test, y_pred, average='macro'))
                apr_scores[(dt_name, metric_)].append(average_precision_score(y_test, y_score))
                auc_roc_scores[(dt_name, metric_)].append(roc_auc_score(y_test, y_score)) 
                

  0%|          | 0/10 [00:00<?, ?it/s]

In [35]:
permanova_all_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in permanova_res.items() ])).T.stack().reset_index().dropna(axis=1)
permanova_all_df.columns = ['dataset','method','fold','f_stat']
permanova_all_df['method'] = permanova_all_df.method.replace('aitchison','Aitchison').replace('braycurtis','Bray-Curtis')
permanova_all_df.to_csv('../../data/simulations/benchmarks/permanova-test-%iiter-last.csv' % max_iterations)
permanova_all_df.head(5)

Unnamed: 0,dataset,method,fold,f_stat
0,meta_g_taxonomic_profiles_11,iCluster,0,1.4826
1,meta_g_taxonomic_profiles_11,iCluster,1,2.47649
2,meta_g_taxonomic_profiles_11,iCluster,2,1.928308
3,meta_g_taxonomic_profiles_11,iCluster,3,0.25862
4,meta_g_taxonomic_profiles_11,iCluster,4,1.387822


In [36]:
apr_all_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in apr_scores.items() ])).T.stack().reset_index().dropna(axis=1)
apr_all_df.columns = ['dataset','method','fold','apr']
apr_all_df['method'] = apr_all_df.method.replace('aitchison','Aitchison').replace('braycurtis','Bray-Curtis')
apr_all_df.to_csv('../../data/simulations/benchmarks/apr-test-%iiter-last.csv' % max_iterations)
apr_all_df.head(5)

Unnamed: 0,dataset,method,fold,apr
0,meta_g_taxonomic_profiles_11,iCluster,0,0.7969
1,meta_g_taxonomic_profiles_11,iCluster,1,0.789058
2,meta_g_taxonomic_profiles_11,iCluster,2,0.840364
3,meta_g_taxonomic_profiles_11,iCluster,3,0.662699
4,meta_g_taxonomic_profiles_11,iCluster,4,0.836612


In [37]:
roc_all_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in auc_roc_scores.items() ])).T.stack().reset_index().dropna(axis=1)
roc_all_df.columns = ['dataset','method','fold','roc_auc']
roc_all_df['method'] = roc_all_df.method.replace('aitchison','Aitchison').replace('braycurtis','Bray-Curtis')
roc_all_df.to_csv('../../data/simulations/benchmarks/roc-auc-test-%iiter-last.csv' % max_iterations)
roc_all_df.head(5)

Unnamed: 0,dataset,method,fold,roc_auc
0,meta_g_taxonomic_profiles_11,iCluster,0,0.533333
1,meta_g_taxonomic_profiles_11,iCluster,1,0.725
2,meta_g_taxonomic_profiles_11,iCluster,2,0.6
3,meta_g_taxonomic_profiles_11,iCluster,3,0.45
4,meta_g_taxonomic_profiles_11,iCluster,4,0.633333
