In [1]:
import pandas as pd
from gemelli.preprocessing import rclr_transformation
from scipy.spatial import distance
from scipy.linalg import svd
from biom import Table
from scipy.sparse.linalg import svds
from skbio import OrdinationResults, DistanceMatrix
import qiime2 as q2
from qiime2.plugins.emperor.actions import biplot
from biom.util import biom_open
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
metadata = pd.read_csv('../../data/case-studies/mammalian/metadata.txt', index_col=0, sep='\t')
metadata_noqc = metadata[~metadata.SpeciesCategories.isin(['QC','Blank'])]
microbiome = pd.read_csv('../../data/case-studies/mammalian/16S-ASV-table.csv', index_col=0).T
lipids_neg = pd.read_csv('../../data/case-studies/mammalian/2018-12-18-lipids-neg_quant.txt', index_col=0, sep='\t').drop(['row m/z','row retention time'], axis=1)
lipids_pos = pd.read_csv('../../data/case-studies/mammalian/2018-12-18-lipids-pos_quant.txt', index_col=0, sep='\t').drop(['row m/z','row retention time'], axis=1)
lipids_polar = pd.read_csv('../../data/case-studies/mammalian/2019-07-04-polar-filter-singletons_quant.txt', index_col=0, sep='\t').drop(['row m/z','row retention time'], axis=1)
# gcms metabolomics filter
gcms_metabolomics = pd.read_csv('../../data/case-studies/mammalian/gcms-quantification.csv', index_col=0).drop(['row m/z','row retention time'], axis=1)
gcms_bscore = pd.read_csv('../../data/case-studies/mammalian/gcms-clustersummary.tsv', index_col=0, sep='\t')
gcms_bscore_filtered = gcms_bscore.loc[gcms_bscore['Balance Score'].astype(float) >= 50, :]
gcms_metabolomics = gcms_metabolomics.loc[gcms_bscore_filtered.index, :]


def inverse_mapping(f):
    return f.__class__(map(reversed, f.items()))

gcms_map = inverse_mapping(metadata['File_GCMS'].to_dict())
lippos_map = inverse_mapping(metadata['File_lipids_pos'].to_dict())
lipneg_map = inverse_mapping(metadata['File_lipids_neg'].to_dict())
polar_map = inverse_mapping(metadata['File_polar.1'].to_dict())

gcms_metabolomics.columns = [gcms_map[c.split(' ')[0]] for c in gcms_metabolomics.columns]
lipids_pos.columns = [lippos_map[c.split(' ')[0]] for c in lipids_pos.columns]
lipids_neg.columns = [lipneg_map[c.split(' ')[0]] for c in lipids_neg.columns]
lipids_polar.columns = [polar_map[c.split(' ')[0].split('.')[0]] for c in lipids_polar.columns]

all_sample_lists = [gcms_metabolomics.columns, lipids_pos.columns, lipids_neg.columns, lipids_polar.columns, microbiome.columns]
shared_samples_all_modalities = list(set.intersection(*map(set, all_sample_lists)))
print('number shared samples: %i' % len(shared_samples_all_modalities))
microbiome = microbiome[shared_samples_all_modalities]
lipids_neg = lipids_neg[shared_samples_all_modalities]
lipids_pos = lipids_pos[shared_samples_all_modalities]
lipids_polar = lipids_polar[shared_samples_all_modalities]
gcms_metabolomics = gcms_metabolomics[shared_samples_all_modalities]
metadata_noqc_matched = metadata_noqc.loc[shared_samples_all_modalities, :]
metadata_noqc_matched['lineage_and_digestive_strategy'] = metadata_noqc_matched['KT_lineage'] + '-' + metadata_noqc_matched['digestive_strategy']

metadata_noqc_matched.head(5)


number shared samples: 101


Unnamed: 0_level_0,diet,digestive_strategy,HostSpecies,HostSubSpecies,KT_lineage,SpeciesCategories,Taxonomy,diet_pri,Collected_by,Day,...,File_lipids_pos,Lipids_order,File_VFA,VFA_order,File_GCMS,GCMS_order,File_polar.1,Sex,collection_time,lineage_and_digestive_strategy
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Chimpanzee_11,Omnivores,Simplegut,Chimp,Chimp,Primates,Chimps,Pan_troglodytes,Primates,Ruthie,18.0,...,lipids-230718-pos-69.mzXML,103.0,12_26_2018-inj-101,101.0,2019-01-21-inj-142.mzML,142.0,Sample-X-76,F,0-2hr,Primates-Simplegut
Rhino_6,Herbivores,Hindgut,Rhino,Rhino,Perissodactyla,Rhinos,Ceratotherium_simum,Herbivores,Stav,6.0,...,lipids-230718-pos-77.mzXML,39.0,12_26_2018-inj-38,38.0,2019-01-28-inj-210.mzML,210.0,Sample-2A6,M,0-2hr,Perissodactyla-Hindgut
Gorilla_5,Herbivores,Simplegut,Gorilla,Gorilla,Primates,Gorillas,Gorilla_gorilla,Primates,Ruthie,25.0,...,lipids-230718-pos-52.mzXML,119.0,12_27_2018-inj-110,110.0,2019-01-16-inj-85.mzML,85.0,Sample-2B1,F,0-2hr,Primates-Simplegut
Sheep_1,Herbivores,Foregut,Sheep,Sheep,Cetartiodactyla,Ruminants,Ovis_aries,Herbivores,Stav+Elie,11.0,...,lipids-230718-pos-1.mzXML,7.0,12_25_2018-inj-6,6.0,2019-01-13-inj-6.mzML,6.0,Sample-1A4,,0-2hr,Cetartiodactyla-Foregut
Rhino_10,Herbivores,Hindgut,Rhino,Rhino,Perissodactyla,Rhinos,Ceratotherium_simum,Herbivores,Stav+Ishay,13.0,...,lipids-230718-pos-81.mzXML,14.0,12_25_2018-inj-14,14.0,2019-01-13-inj-14.mzML,14.0,Sample-1E3,F,0-2hr,Perissodactyla-Hindgut


In [3]:
metadata_noqc_matched.diet.value_counts()

Herbivores    50
Omnivores     38
Carnivores    13
Name: diet, dtype: int64

In [6]:
metadata_noqc_matched.KT_lineage.value_counts()

Primates           30
Perissodactyla     30
Carnivora          27
Afrotheria          8
Cetartiodactyla     6
Name: KT_lineage, dtype: int64

In [5]:
metadata_noqc_matched.HostSpecies.value_counts()

Zebra             16
Rhino             13
Chimp             10
Mandrill           8
Elephant           8
Coati              7
Bear               7
Gorilla            6
Sheep              5
Lion               3
CarnivorousCat     3
Gibbon             3
Wolf               2
Tiger              2
Leopard            2
Lemur              2
Goat               1
Donkey             1
Hyena              1
Capuchin           1
Name: HostSpecies, dtype: int64

In [4]:
metadata_noqc_matched.lineage_and_digestive_strategy.value_counts()

Primates-Simplegut         30
Perissodactyla-Hindgut     30
Carnivora-Simplegut        27
Afrotheria-Hindgut          8
Cetartiodactyla-Foregut     6
Name: lineage_and_digestive_strategy, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
np.random.seed(42)

for i_ in range(10):
    train, test = train_test_split(metadata_noqc_matched, test_size=0.25, shuffle=True,
                                   stratify=metadata_noqc_matched[['lineage_and_digestive_strategy']])
    metadata_noqc_matched.loc[:, 'traintest_%i' % i_] = 'train'
    metadata_noqc_matched.loc[test.index, 'traintest_%i' % i_] = 'test'
metadata_noqc_matched.to_csv('../../data/case-studies/mammalian/matched-data/metadata.tsv', sep='\t')

file_save_map = {'gcms':gcms_metabolomics,
                 'lipid_pos':lipids_pos,
                 'lipid_neg':lipids_neg,
                 'lipid_polar':lipids_polar,
                 'microbiome':microbiome}
for name_, table_out in file_save_map.items():
    # save tables
    table_out.index = name_ + '_feature_' + table_out.index.astype(str)
    table_out = Table(table_out.values, table_out.index, table_out.columns)
    with biom_open('../data/matched-data/%s.biom' % (name_), 'w') as f:
        table_out.to_hdf5(f, "filtered-table-cm")
