In [1]:
import glob
import os

import pandas as pd
from biom import load_table
from gemelli.rpca import joint_rpca
from scipy.stats import kruskal
from skbio import OrdinationResults

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the sample metadata and every omics table found in the split-matched-data folder.
data_dir = '../../data/case-studies/decomposer/split-matched-data'
metadata = pd.read_csv(os.path.join(data_dir, 'metadata.tsv'), index_col=0, sep='\t')

# Each table is keyed by its file name up to the first '.'; the metadata file is skipped.
# - sorted() makes the dict order reproducible (glob returns paths in arbitrary order).
# - os.path.basename ties both the key and the 'metadata' filter to the file name
#   itself instead of the full path / a hard-coded '/' separator.
tables = {
    os.path.basename(path).split('.')[0]: load_table(path)
    for path in sorted(glob.glob(os.path.join(data_dir, '*')))
    if 'metadata' not in os.path.basename(path)
}
# NOTE(review): the recorded output lists both 'metabolite' and 'metabolomics' with the
# same shape and nonzero count -- presumably the same table stored twice; confirm.
tables

{'metabolite': 2333 x 374 <class 'biom.table.Table'> with 104825 nonzero entries (12% dense),
 '16S': 14237 x 374 <class 'biom.table.Table'> with 318875 nonzero entries (5% dense),
 'mag': 257 x 374 <class 'biom.table.Table'> with 58498 nonzero entries (60% dense),
 '18S': 5473 x 374 <class 'biom.table.Table'> with 114755 nonzero entries (5% dense),
 'gene_module': 377 x 374 <class 'biom.table.Table'> with 117093 nonzero entries (83% dense),
 'gene': 2457 x 374 <class 'biom.table.Table'> with 901890 nonzero entries (98% dense),
 'metabolomics': 2333 x 374 <class 'biom.table.Table'> with 104825 nonzero entries (12% dense)}

In [3]:
# Sanity check of the metadata: (rows, columns) first, then the first five records.
print(metadata.shape)
display(metadata.head(n=5))

(374, 16)


Unnamed: 0_level_0,season,subjects,add_0c,add_0c_group,facility,timepoint,traintest_0,traintest_1,traintest_2,traintest_3,traintest_4,traintest_5,traintest_6,traintest_7,traintest_8,traintest_9
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
soil.hip.CMU.17.10.2017.08.11.day3,summer,17-10,49.75,early,FIRS,3,train,train,train,train,train,train,train,train,train,train
soil.hip.CMU.17.10.2017.08.22.day14,summer,17-10,322.8,advanced,FIRS,14,test,test,test,test,test,test,test,test,test,test
soil.hip.CMU.17.10.2017.08.28.day20,summer,17-10,477.8,advanced,FIRS,20,test,test,test,test,test,test,test,test,test,test
soil.hip.CMU.17.10.2017.08.09.day1,summer,17-10,0.0,initial,FIRS,1,train,train,train,train,train,train,train,train,train,train
soil.hip.CMU.17.10.2017.08.23.day15,summer,17-10,348.05,advanced,FIRS,15,train,train,train,train,train,train,train,train,train,train


## independent RPCA

In [4]:
# # ONLY RUN ON FIRST ITERATION
# # check independent analysis
# n_PCS = 4
# fold = 0
# rpca_independent = {tblid:joint_rpca([tbl], n_components=n_PCS, max_iterations=15,
#                                      min_feature_count=10, sample_metadata=metadata,
#                                      train_test_column='traintest_%i' % fold)
#                     for tblid, tbl in tables.items()}

# cvs_ind = {(fold, datatype_):cv_tmp_dt for datatype_, (_, _, cv_tmp_dt) in rpca_independent.items()}
# cvs_ind_all = pd.concat(cvs_ind).reset_index().rename({'level_0':'fold', 'level_1':'modality'}, axis=1)
# sns.pointplot(x='iteration', y='mean_CV', hue='modality', data=cvs_ind_all)
# plt.show()

In [5]:
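# # save results
# # NOTE(review): this writes under '../../results/...', while the reload cell below reads
# # '../../data/case-studies/decomposer/joint-rpca/{}_ord.txt' -- confirm the intended location.
# for tblid, (ord_, _, _) in rpca_independent.items():    
#    ord_.write('../../results/case-studies/decomposer/joint-rpca/{}_ord.txt'.format(tblid))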

In [6]:
# Reload the single-omic ordinations written by a prior run.
omic_ids = ['mag', '18S', 'gene_module', 'gene', 'metabolomics']
n_PCS = 3

rpca_independent = {}
for omic in omic_ids:
    ord_path = '../../data/case-studies/decomposer/joint-rpca/{}_ord.txt'.format(omic)
    # Only the ordination is used below; the other two tuple slots stay None
    # (the distance matrix is not needed for this analysis).
    rpca_independent[omic] = (OrdinationResults.read(ord_path), None, None)

### Kruskal Wallis

In [7]:
# Kruskal-Wallis test of every single-omic RPCA axis against each metadata category.
cats_of_interest = ['facility', 'season', 'add_0c_group']
pc_names = ["PC%i" % (i + 1) for i in range(n_PCS)]

# The ordination/metadata join depends only on the omic table, so build it once
# per table rather than once per (category, table) pair.
ord_with_metadata = {}
for tblid, (ord_, _, _) in rpca_independent.items():
    ord_samples = ord_.samples.rename(columns={0: "PC1", 1: "PC2",
                                               2: "PC3", 3: "PC4"})
    ord_plt = pd.concat([ord_samples, metadata], axis=1, sort=True)
    # drop rows with nan values
    # NOTE(review): no subset is given, so a missing value in *any* column removes
    # the sample from every test -- confirm this is intended.
    ord_with_metadata[tblid] = ord_plt.dropna()

# Collect one result frame per category and concatenate once at the end
# (no DataFrame is grown inside the loop, and no empty frame is concatenated).
kw_per_category = []
for cat_of_interest in cats_of_interest:
    kw_stats = []
    kw_index = []
    for tblid, ord_plt in ord_with_metadata.items():
        for pc in pc_names:
            groups = [group for _, group in ord_plt.groupby(cat_of_interest)[pc]]
            # Perform the Kruskal-Wallis test
            h, p = kruskal(*groups)
            kw_stats.append((h, p))
            kw_index.append("{}_{}".format(tblid, pc))
    kw_per_category.append(pd.DataFrame(kw_stats,
                                        columns=['H-statistic', 'p-value'],
                                        index=kw_index))

# Columns become a two-level index: (category, statistic).
kw_df_all = pd.concat(kw_per_category, axis=1, keys=cats_of_interest)

In [8]:
# Save the single-omic Kruskal-Wallis table and display it.
# (The previous `.format(n_PCS)` was a no-op: the path has no replacement field.)
kw_df_all.to_csv('../../results/supp-table-7-rpca-kw.csv')
kw_df_all

Unnamed: 0_level_0,facility,facility,season,season,add_0c_group,add_0c_group
Unnamed: 0_level_1,H-statistic,p-value,H-statistic,p-value,H-statistic,p-value
mag_PC1,219.41848,2.2588239999999997e-48,30.852523,9.130475e-07,41.911506,4.189415e-09
mag_PC2,274.409683,2.5863930000000003e-60,44.087142,1.446207e-09,17.860424,0.0004699862
mag_PC3,126.523778,3.3551410000000003e-28,21.859008,6.978906e-05,64.774044,5.606498e-14
18S_PC1,91.983107,1.0619939999999999e-20,138.466259,8.093738e-30,0.371468,0.9460715
18S_PC2,29.171047,4.630071e-07,200.545606,3.215668e-43,31.494116,6.689292e-07
18S_PC3,71.247557,3.37901e-16,60.482818,4.635422e-13,2.617841,0.4543704
gene_module_PC1,26.666131,1.620031e-06,4.905711,0.1788331,0.33392,0.9535277
gene_module_PC2,30.951118,1.901298e-07,87.573699,7.271375999999999e-19,97.900436,4.394332e-21
gene_module_PC3,186.50134,3.175035e-41,19.322564,0.000234464,7.86332,0.04892257
gene_PC1,262.262143,1.123309e-57,24.510005,1.954678e-05,20.595775,0.0001277153


## joint RPCA

In [9]:
# # ONLY RUN ON FIRST ITERATION
# cv_all_joint = {}
# fold = 0
# ord_jnt, dist_jnt, cv_jnt = joint_rpca([v for k, v in tables.items()
#                                         if k in ['mag', '18S', 'gene_module', 'gene', 'metabolomics']],
#                                         n_components=n_PCS,
#                                         max_iterations=15,
#                                         min_feature_count=10,
#                                         sample_metadata=metadata,
#                                         train_test_column='traintest_%i' % fold)
# cv_all_joint[0] = cv_jnt
# joint_rpca_results = {'joint': (ord_jnt, None, None)}

# # check CV error
# cv_all_joint_df = pd.concat(cv_all_joint).reset_index().rename({'level_0':'fold'}, axis=1)
# for f, df_ in cv_all_joint_df.groupby('fold'):
#     plt.errorbar(df_.iteration, df_['mean_CV'], yerr=df_['std_CV'], label='fold %i' % (f + 1))
# plt.legend()
# plt.show()

In [10]:
# #save results
# ord_jnt.write('../../data/case-studies/decomposer/joint-rpca/ord.txt')

In [11]:
# load results from prior run
joint_ord_path = '../../data/case-studies/decomposer/joint-rpca/ord.txt'
# Same (ordination, None, None) tuple layout as the per-omic results above.
joint_rpca_results = {'joint': (OrdinationResults.read(joint_ord_path), None, None)}

### Kruskal Wallis

In [12]:
# Attach the sample metadata to the joint-RPCA sample loadings.
ord_jnt = joint_rpca_results['joint'][0]
# Name the index 'sample_name' and move it into a column so it can be used as
# the merge key; rename_axis returns a new frame, leaving ord_jnt.samples untouched.
ord_joint_samps = ord_jnt.samples.rename_axis('sample_name').reset_index()
merged_table = pd.merge(ord_joint_samps, metadata, on='sample_name')
#merged_table.head()

In [13]:
# Kruskal-Wallis test of each joint-RPCA axis against each metadata category.
cats_of_interest = ['facility', 'season', 'add_0c_group']
pc_labels = [f'PC{i+1}' for i in range(n_PCS)]

# One (H-statistic, p-value) frame per category, concatenated once at the end.
kw_per_category = []
for cat_of_interest in cats_of_interest:
    kw_stats = []
    for i in range(n_PCS):
        # The ordination columns in merged_table are selected by their integer label i.
        # Group the numerical data based on the categorical data
        groups = [group for _, group in merged_table.groupby(cat_of_interest)[i]]

        # Perform the Kruskal-Wallis test
        h, p = kruskal(*groups)
        kw_stats.append((h, p))

    kw_per_category.append(pd.DataFrame(kw_stats,
                                        columns=['H-statistic', 'p-value'],
                                        index=pc_labels))

# Columns become a two-level index: (category, statistic).
kw_df_joint = pd.concat(kw_per_category, axis=1, keys=cats_of_interest)

In [14]:
# Save the joint-RPCA Kruskal-Wallis table and display it.
# (The previous `.format(n_PCS)` was a no-op: the path has no replacement field.)
kw_df_joint.to_csv('../../results/supp-table-7-joint-rpca-kw.csv')
kw_df_joint

Unnamed: 0_level_0,facility,facility,season,season,add_0c_group,add_0c_group
Unnamed: 0_level_1,H-statistic,p-value,H-statistic,p-value,H-statistic,p-value
PC1,14.146955,0.0008472816,218.48692,4.263785e-47,8.577836,0.03546368
PC2,63.879984,1.344738e-14,83.797975,4.70057e-18,111.845888,4.395985e-24
PC3,280.164284,1.45579e-61,6.056139,0.1089107,13.292691,0.004044559
