In [1]:
import pandas as pd
import numpy as np
from gemelli.preprocessing import rclr_transformation
from scipy.spatial import distance
from scipy.linalg import svd
from biom import Table, load_table
from scipy.sparse.linalg import svds
from skbio import OrdinationResults, DistanceMatrix
import qiime2 as q2
from qiime2.plugins.emperor.actions import biplot
from biom.util import biom_open
import glob
from skbio.stats.distance import permanova
from gemelli.rpca import joint_rpca, feature_correlation_table, rpca

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# import data
metadata = pd.read_csv('../../data/case-studies/mammalian/matched-data/metadata.tsv', index_col=0, sep='\t')
tables = {f.split('/')[-1].split('.')[0]:load_table(f) for f in glob.glob('../../data/case-studies/mammalian/matched-data/*.biom')}
metadata.head(5)

Unnamed: 0_level_0,diet,digestive_strategy,HostSpecies,HostSubSpecies,KT_lineage,SpeciesCategories,Taxonomy,diet_pri,Collected_by,Day,...,traintest_0,traintest_1,traintest_2,traintest_3,traintest_4,traintest_5,traintest_6,traintest_7,traintest_8,traintest_9
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Chimpanzee_4,Omnivores,Simplegut,Chimp,Chimp,Primates,Chimps,Pan_troglodytes,Primates,Ishay,17.0,...,train,test,train,test,train,train,test,train,train,train
Af_elephant_1,Herbivores,Hindgut,Elephant,African_elephant,Afrotheria,Elephants,Loxodonta_africana,Herbivores,Stav+Ruthie,20.0,...,train,test,train,train,train,train,test,train,train,train
Sheep_5,Herbivores,Foregut,Sheep,Sheep,Cetartiodactyla,Ruminants,Ovis_aries,Herbivores,Ruthie,11.0,...,train,train,train,train,test,train,train,train,train,train
Chimpanzee_1,Omnivores,Simplegut,Chimp,Chimp,Primates,Chimps,Pan_troglodytes,Primates,Ishay,27.0,...,train,test,train,train,train,train,train,train,train,train
As_elephant_3,Herbivores,Hindgut,Elephant,Asian_elephant,Afrotheria,Elephants,Elephas_maximus,Herbivores,Stav,23.0,...,train,train,train,train,train,train,train,train,train,train


In [4]:
tables

{'microbiome': 8153 x 101 <class 'biom.table.Table'> with 23551 nonzero entries (2% dense),
 'gcms': 411 x 101 <class 'biom.table.Table'> with 37297 nonzero entries (89% dense),
 'lipid_polar': 19855 x 101 <class 'biom.table.Table'> with 259284 nonzero entries (12% dense),
 'lipid_neg': 8794 x 101 <class 'biom.table.Table'> with 279928 nonzero entries (31% dense),
 'lipid_pos': 8036 x 101 <class 'biom.table.Table'> with 241363 nonzero entries (29% dense)}

In [3]:
tables.keys()

dict_keys(['microbiome', 'gcms', 'lipid_polar', 'lipid_neg', 'lipid_pos'])

In [None]:
cv_all_joint = {}
for fold in range(10):
    # run joint-RPCA
    ord_tmp, dist_tmp, cv_tmp = joint_rpca(list([v.copy() for dt, v in tables.items()]),
                                                     #n_components=4,
                                                     #max_iterations=20,
                                                     min_sample_count=0,
                                                     min_feature_count=0,
                                                     min_feature_frequency=0,
                                                     sample_metadata=metadata,
                                                     train_test_column='traintest_%i' % fold)
    ord_tmp.write('../../results/case-studies/mammalian/joint-rpca/%i-ordination.txt' % fold)
    dist_tmp.write('../../results/case-studies/mammalian/joint-rpca/%i-distance.txt'% fold)
    cv_tmp.to_csv('../../results/case-studies/mammalian/joint-rpca/%i-cross_validation_error.csv'% fold)
    cv_all_joint[fold] = cv_tmp


In [None]:
# run CV-RPCA on each dataset independently
cvs_ind = {}

for fold in range(10):
    for datatype_, table_tmp in tables.items():
        ord_tmp_dt, dist_tmp_dt, cv_tmp_dt = joint_rpca([table_tmp],
                                                        n_components=3,
                                                        max_iterations=3,
                                                        min_sample_count=0,
                                                        min_feature_count=0,
                                                        min_feature_frequency=0,
                                                        sample_metadata=metadata,
                                                        train_test_column='traintest_%i' % fold)
        ord_tmp_dt.write('../../results/case-studies/mammalian/rpca-independent/%s-%i-ordination.txt' % (datatype_, fold))
        dist_tmp_dt.write('../../results/case-studies/mammalian/rpca-independent/%s-%i-distance.txt' % (datatype_, fold))
        cv_tmp_dt.to_csv('../../results/case-studies/mammalian/rpca-independent/%s-%i-crossv_validation_error.csv' % (datatype_, fold))
        cvs_ind[(fold, datatype_)] = cv_tmp_dt
