In [1]:
#standard library
import glob
import itertools
import json
import os
import time
import warnings

#turn off tensorflow warnings
#NOTE: this has to be set before TensorFlow is first imported to have any
#effect; the mmvec plugin import below presumably loads TensorFlow
#(TODO confirm), so the variable is set ahead of the third-party imports
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#turn off warnings (installed early so import-time warnings are silenced too)
warnings.filterwarnings("ignore")

#third-party
import pandas as pd
import qiime2 as q2
from biom import load_table
from qiime2.plugins.mmvec.actions import paired_omics

In [2]:
##calculate # of features needed to match percentages
def get_n_features(p_features, total_features, subtotal_features):
    """Turn target percentages into absolute feature counts.

    Parameters
    ----------
    p_features : iterable of numbers
        Percentages (on a 0-100 scale) of ``total_features``.
    total_features : int
        Feature count the percentages refer to.
    subtotal_features : int
        Count subtracted from every total to give the remainder.

    Returns
    -------
    tuple of (list of int, list of int)
        First list: ``int(0.01 * p * total_features)`` for each percentage
        (truncated by ``int``). Second list: each of those counts minus
        ``subtotal_features``.
    """
    total_fts = []
    prot_fts = []
    for pct in p_features:
        n_total = int(0.01 * pct * total_features)
        total_fts.append(n_total)
        #whatever is left after removing the subtotal
        prot_fts.append(n_total - subtotal_features)
    return total_fts, prot_fts

In [3]:
#load the data
data_path = '../../data/simulations/ihmp/'

#one key per UC .biom file: the file name up to its first '.'
uc_omic_keys = []
for biom_path in glob.glob('{}UC/*.biom'.format(data_path)):
    file_name = biom_path.split('/')[-1]
    uc_omic_keys.append(file_name.split('.')[0])

#sample metadata (first CSV column is used as the index)
uc_metadata_shared = pd.read_csv('../../data/case-studies/time-ihmp-uc/uc-metadata-plus-train-tests.csv', index_col=0)

#omics key -> biom table loaded from the matching file
uc_tables_shared = {}
for omic_key in uc_omic_keys:
    uc_tables_shared[omic_key] = load_table('{}UC/{}.biom'.format(data_path, omic_key))
uc_tables_shared

{'metagenomics': 3568 x 173 <class 'biom.table.Table'> with 162126 nonzero entries (26% dense),
 'metabolomics': 1928 x 173 <class 'biom.table.Table'> with 58132 nonzero entries (17% dense),
 'metaproteomics': 108080 x 173 <class 'biom.table.Table'> with 5065165 nonzero entries (27% dense)}

In [4]:
#add number of features (first shape entry) across tables
uc_n_feat = sum(table.shape[0] for table in uc_tables_shared.values())

#features left once the metaproteomics table is excluded
n_prot_feat = uc_tables_shared['metaproteomics'].shape[0]
uc_subtotal = uc_n_feat - n_prot_feat
print(uc_n_feat, uc_subtotal)

113576 5496


In [5]:
#load mmvec (iHMP) results to get % features
mmvec_ihmp = pd.read_csv('../../data/case-studies/time-ihmp-uc/ihmp-runtime-all.csv', index_col=0)
#keep only the rows recorded for the MMvec method
is_mmvec = mmvec_ihmp['method'] == 'MMvec'
mmvec_ihmp = mmvec_ihmp.loc[is_mmvec]
perct = mmvec_ihmp['% features'].values
perct

array([10.46, 11.35, 13.14, 14.94, 19.41, 23.89])

In [6]:
#first line printed: total # of features after adding the subset of proteomics
#second line printed: # of proteomic features needed to match the percentages
all_factors = list(perct)
uc_total_n, uc_prot_n = get_n_features(p_features=all_factors,
                                       total_features=uc_n_feat,
                                       subtotal_features=uc_subtotal)
print(uc_total_n, uc_prot_n, sep='\n')

[11880, 12890, 14923, 16968, 22045, 27133]
[6384, 7394, 9427, 11472, 16549, 21637]


In [7]:
##formatting for running MMvec
#change index name to 'sample id'
uc_metadata_shared.index.name = 'sample id'

#new column: 'train' becomes 'Train', every other value becomes 'Test'
uc_metadata_shared['train_test_mmvec'] = uc_metadata_shared['train_test'].map(
    lambda split: 'Train' if split == 'train' else 'Test'
)

In [8]:
#create list with all possible pairs of omics
omic_keys = list(uc_tables_shared)
#each omics paired with itself (fixed order) ...
self_pairs = [(omic, omic) for omic in ('metagenomics', 'metabolomics', 'metaproteomics')]
#... followed by every ordered pair of two different loaded omics
cross_pairs = list(itertools.permutations(omic_keys, 2))
pairs = self_pairs + cross_pairs
pairs

[('metagenomics', 'metagenomics'),
 ('metabolomics', 'metabolomics'),
 ('metaproteomics', 'metaproteomics'),
 ('metagenomics', 'metabolomics'),
 ('metagenomics', 'metaproteomics'),
 ('metabolomics', 'metagenomics'),
 ('metabolomics', 'metaproteomics'),
 ('metaproteomics', 'metagenomics'),
 ('metaproteomics', 'metabolomics')]

In [9]:
#split the pairs into those without and those with proteomics
pairs_no_prot, pairs_prot = [], []
for omics_pair in pairs:
    if 'metaproteomics' in omics_pair:
        pairs_prot.append(omics_pair)
    else:
        pairs_no_prot.append(omics_pair)

In [10]:
#generate subsets of the metaproteomics data
proteomics_all = uc_tables_shared['metaproteomics'].copy()

#fraction of proteomic features -> table holding only that many features
proteomics_sub = {}
for factor in (0.01, 0.03, 0.04, 0.05, 0.1):
    #work on a fresh copy of the full metaproteomics table
    #(presumably because filter() modifies the table it is called on -- TODO confirm)
    prot = uc_tables_shared['metaproteomics'].copy()
    #keep the first n_features observation ids, in table order
    n_features = int(prot.shape[0]*factor)
    feature_ids = list(prot.ids(axis='observation'))[:n_features]
    subset = prot.filter(feature_ids, axis='observation')
    proteomics_sub[factor] = subset
    #sanity check
    print('Proteomics: {}% of features: {}'.format(factor*100, subset.shape[0]))

Proteomics: 1.0% of features: 1080
Proteomics: 3.0% of features: 3242
Proteomics: 4.0% of features: 4323
Proteomics: 5.0% of features: 5404
Proteomics: 10.0% of features: 10808


In [11]:
#metadata reduced to the single train/test column passed to MMvec as training_column
uc_metadata_shared_short = uc_metadata_shared[['train_test_mmvec']].copy()

In [20]:
# runtime = {}
# runtime_total = {}

In [None]:
# #run mmvec for each pair of omics without proteomics
# factor = 0
# time_factor = 0
# for pair in pairs_no_prot:  
#     print(pair)
#     #get tables and set to correct format
#     t1 = uc_tables_shared[pair[0]]
#     t2 = uc_tables_shared[pair[1]]
#     t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
#     t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

#     #run and time mmvec
#     time_start = time.perf_counter()
#     mmvec_res = paired_omics(t1_q2, t2_q2, 
#                             summary_interval=1,
#                             metadata=q2.Metadata(uc_metadata_shared_short),
#                             training_column='train_test_mmvec',
#                             min_feature_count=10)
#     time_elapsed = (time.perf_counter() - time_start)
#     time_factor += time_elapsed
#     print('Time elapsed: ', round(time_elapsed,4))
#     print()
#     #save output
#     runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
#     with open("../results/mmvec-runtime-uc.json", "w") as outfile: 
#         json.dump(runtime, outfile)

# runtime_total[factor] = time_factor
# print('Total time elapsed: ', round(time_factor,4))

# #save mmvec results as json/pickle
# with open("../results/mmvec-runtime-total-uc.json", "w") as outfile: 
#    json.dump(runtime_total, outfile)

In [None]:
# #sanity check
# print(runtime)
# print(runtime_total)

In [12]:
#load mmvec results
#NOTE: read from the directory the final results are saved to and re-loaded
#from further down in this notebook, instead of the older '../results/' path
with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-total-uc.json", "r") as infile:
    #total runtime per proteomics fraction (JSON object, so keys are strings)
    runtime_total = json.load(infile)

with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-uc.json", "r") as infile:
    #runtime of each individual omics pair
    runtime = json.load(infile)

In [13]:
#display the per-pair runtimes held in `runtime`
runtime

{'metagenomics-metagenomics': 1563.6010937170067,
 'metabolomics-metabolomics': 179.47512698400533,
 'metagenomics-metabolomics': 957.9216301410052,
 'metabolomics-metagenomics': 334.7416218419967,
 'metaproteomics0.01-metaproteomics0.01': 232.43496004698682,
 'metagenomics-metaproteomics0.01': 499.39272144700226,
 'metabolomics-metaproteomics0.01': 105.30647485500958,
 'metaproteomics0.01-metagenomics': 582.3444733399956,
 'metaproteomics0.01-metabolomics': 372.5092970100086,
 'metaproteomics0.03-metaproteomics0.03': 3680.409118486001,
 'metagenomics-metaproteomics0.03': 1343.1144764290075,
 'metabolomics-metaproteomics0.03': 252.66649123700336,
 'metaproteomics0.03-metagenomics': 3071.323532092996,
 'metaproteomics0.03-metabolomics': 1140.475918815995,
 'metagenomics-metaproteomics0.05': 5627.837036309997,
 'metabolomics-metaproteomics0.05': 457.6624763409782,
 'metaproteomics0.05-metabolomics': 3228.7686570439837,
 'metaproteomics0.05-metaproteomics0.05': 360000,
 'metagenomics-meta

In [None]:
#run and time MMvec for every pair that involves proteomics, once per
#proteomics subset; results are checkpointed to JSON after every pair
for factor in [0.01, 0.03]: #0.04 onwards is too slow
    print('\n### Proteomic Features: {}% ###'.format(factor*100))
    #shallow copy of the table dict with the proteomics entry swapped for the subset
    prot_table = proteomics_sub[factor]
    table_to_use = uc_tables_shared.copy()
    table_to_use['metaproteomics'] = prot_table
    print('Proteomics n features: {}'.format(table_to_use['metaproteomics'].shape[0]))

    #accumulated runtime (seconds) over all pairs for this subset
    time_factor = 0
    for pair in pairs_prot:
        print(pair)
        
        #get tables and set to correct format
        t1 = table_to_use[pair[0]]
        t2 = table_to_use[pair[1]]
        t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
        t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

        #run and time mmvec
        time_start = time.perf_counter()
        mmvec_res = paired_omics(t1_q2, t2_q2, 
                                summary_interval=1,
                                metadata=q2.Metadata(uc_metadata_shared_short),
                                training_column='train_test_mmvec',
                                min_feature_count=10)
        time_elapsed = (time.perf_counter() - time_start)
        time_factor += time_elapsed
        print('Time elapsed: ', round(time_elapsed,2))
        print()

        #tag the proteomics side(s) of the pair with the subset fraction,
        #e.g. 'metaproteomics0.01', without rebinding the loop variable
        labels = ['metaproteomics{}'.format(factor) if omic == 'metaproteomics' else omic
                  for omic in pair]
        #save output
        runtime["{}-{}".format(labels[0], labels[1])] = time_elapsed
        #write next to the total-runtime file: this is the directory the
        #per-pair runtimes are re-loaded from later in the notebook
        with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-uc.json", "w") as outfile: 
            json.dump(runtime, outfile)
    
    #runtime_total is loaded from JSON, so its keys are strings (later cells
    #index it as runtime_total['0.04']); a float key would sit next to the
    #string key and be written out as a duplicate JSON key
    runtime_total[str(factor)] = time_factor
    print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
                                                         round(time_factor/60, 2)))
    
    #save the total runtimes as json
    with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-total-uc.json", "w") as outfile: 
        json.dump(runtime_total, outfile)
    

In [14]:
#proteomics pairs to run for this subset (proteomics-vs-proteomics is left out)
pairs_prot_sub = [#('metaproteomics', 'metaproteomics'),
                    ('metagenomics', 'metaproteomics'),
                    ('metabolomics', 'metaproteomics'),
                    ('metaproteomics', 'metagenomics'),
                    ('metaproteomics', 'metabolomics')]

#run and time MMvec for each selected pair; results are checkpointed to JSON
#after every pair
for factor in [0.04]:
    print('\n### Proteomic Features: {}% ###'.format(factor*100))
    #shallow copy of the table dict with the proteomics entry swapped for the subset
    prot_table = proteomics_sub[factor]
    table_to_use = uc_tables_shared.copy()
    table_to_use['metaproteomics'] = prot_table
    print('Proteomics n features: {}'.format(table_to_use['metaproteomics'].shape[0]))

    #accumulated runtime (seconds) over all pairs for this subset
    time_factor = 0
    for pair in pairs_prot_sub:
        print(pair)
        
        #get tables and set to correct format
        t1 = table_to_use[pair[0]]
        t2 = table_to_use[pair[1]]
        t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
        t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

        #run and time mmvec
        time_start = time.perf_counter()
        mmvec_res = paired_omics(t1_q2, t2_q2, 
                                summary_interval=1,
                                metadata=q2.Metadata(uc_metadata_shared_short),
                                training_column='train_test_mmvec',
                                min_feature_count=10)
        time_elapsed = (time.perf_counter() - time_start)
        time_factor += time_elapsed
        print('Time elapsed: ', round(time_elapsed,2))
        print()

        #tag the proteomics side(s) of the pair with the subset fraction,
        #e.g. 'metaproteomics0.04', without rebinding the loop variable
        labels = ['metaproteomics{}'.format(factor) if omic == 'metaproteomics' else omic
                  for omic in pair]
        #save output
        runtime["{}-{}".format(labels[0], labels[1])] = time_elapsed
        with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-uc.json", "w") as outfile: 
            json.dump(runtime, outfile)
    
    #runtime_total is loaded from JSON, so its keys are strings (later cells
    #index it as runtime_total['0.04']); a float key would sit next to the
    #string key and be written out as a duplicate JSON key
    runtime_total[str(factor)] = time_factor
    print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
                                                         round(time_factor/60, 2)))
    
    #save the total runtimes as json
    with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-total-uc.json", "w") as outfile: 
        json.dump(runtime_total, outfile)
    

In [None]:
#proteomics pairs to run for this subset; the two commented-out pairs are skipped
pairs_prot_sub = [#('metaproteomics', 'metaproteomics'),
                    ('metagenomics', 'metaproteomics'),
                    ('metabolomics', 'metaproteomics'),
                    #('metaproteomics', 'metagenomics'),
                    ('metaproteomics', 'metabolomics')]

#run and time MMvec for each selected pair; results are checkpointed to JSON
#after every pair
for factor in [0.05]:
    print('\n### Proteomic Features: {}% ###'.format(factor*100))
    #shallow copy of the table dict with the proteomics entry swapped for the subset
    prot_table = proteomics_sub[factor]
    table_to_use = uc_tables_shared.copy()
    table_to_use['metaproteomics'] = prot_table
    print('Proteomics n features: {}'.format(table_to_use['metaproteomics'].shape[0]))

    #accumulated runtime (seconds) over all pairs for this subset
    time_factor = 0
    for pair in pairs_prot_sub:
        print(pair)
        
        #get tables and set to correct format
        t1 = table_to_use[pair[0]]
        t2 = table_to_use[pair[1]]
        t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
        t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

        #run and time mmvec
        time_start = time.perf_counter()
        mmvec_res = paired_omics(t1_q2, t2_q2, 
                                summary_interval=1,
                                metadata=q2.Metadata(uc_metadata_shared_short),
                                training_column='train_test_mmvec',
                                min_feature_count=10)
        time_elapsed = (time.perf_counter() - time_start)
        time_factor += time_elapsed
        print('Time elapsed: ', round(time_elapsed,2))
        print()

        #tag the proteomics side(s) of the pair with the subset fraction,
        #e.g. 'metaproteomics0.05', without rebinding the loop variable
        labels = ['metaproteomics{}'.format(factor) if omic == 'metaproteomics' else omic
                  for omic in pair]
        #save output
        runtime["{}-{}".format(labels[0], labels[1])] = time_elapsed
        with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-uc.json", "w") as outfile: 
            json.dump(runtime, outfile)
    
    #runtime_total is loaded from JSON, so its keys are strings (later cells
    #index it as runtime_total['0.05']); a float key would sit next to the
    #string key and be written out as a duplicate JSON key
    runtime_total[str(factor)] = time_factor
    print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
                                                         round(time_factor/60, 2)))
    
    #save the total runtimes as json
    with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-total-uc.json", "w") as outfile: 
        json.dump(runtime_total, outfile)
    

### Add runtimes

In [16]:
#load mmvec results
#total runtimes
with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-total-uc.json", "r") as total_file:
    runtime_total = json.load(total_file)

#runtime of each individual omics pair
with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-uc.json", "r") as pair_file:
    runtime = json.load(pair_file)

In [None]:
## add an estimated runtime for the ('metaproteomics', 'metaproteomics') pair with factor = 0.05
#100hrs = 100*60*60 = 360000
#runtime['metaproteomics0.05-metaproteomics0.05'] = 360000
#runtime

In [None]:
#now, update runtime_total to include a new pair
#runs whose key mentions the 0.04 proteomics subset
runtime_prot_004 = {k: v for k, v in runtime.items() if 'metaproteomics0.04' in k}
#total for 0.04 = sum of those runs + runtime_total['0']
#assigned (not `+=`) so that re-running this cell does not add
#runtime_total['0'] a second time
runtime_total['0.04'] = sum(runtime_prot_004.values()) + runtime_total['0']
runtime_total

In [26]:
#now, update runtime_total to include another new pair
#collect every recorded run whose key mentions the 0.05 proteomics subset
runtime_prot = {}
for run_name, seconds in runtime.items():
    if 'metaproteomics0.05' in run_name:
        runtime_prot[run_name] = seconds
#add to total runtime
runtime_total['0.05'] = sum(runtime_prot.values())

In [None]:
#need to add runtime_total[0] to each individual runtime 
#runtime_total[0] is the time that all pairs excluding metaproteomics took to run
#NOTE: every run of this cell adds the baseline again
baseline = runtime_total['0']
print('runtime_total[0]: ', baseline)
print()
for key in ['0.05']:
    previous = runtime_total[key]
    print('runtime_total[{}]: '.format(key), previous)
    runtime_total[key] = previous + baseline
    print('Updated runtime_total[{}]: '.format(key), runtime_total[key])

In [None]:
#need to add runtime_total[0] to each individual runtime 
#runtime_total[0] is the time that all pairs excluding metaproteomics took to run
#NOTE: every run of this cell adds the baseline again
baseline = runtime_total['0']
print('runtime_total[0]: ', baseline)
print()
for key in ('0.01', '0.03'): #ignore 0.05 output
    previous = runtime_total[key]
    print('runtime_total[{}]: '.format(key), previous)
    runtime_total[key] = previous + baseline
    print('Updated runtime_total[{}]: '.format(key), runtime_total[key])

In [19]:
# save final set of mmvec results
#total runtimes
with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-total-uc.json", "w") as total_file:
    json.dump(runtime_total, total_file)

#runtime of each individual omics pair
with open("../../data/case-studies/time-ihmp-uc/mmvec-runtime-uc.json", "w") as pair_file:
    json.dump(runtime, pair_file)

In [None]:
#print time in minutes (every total divided by 60)
runtime_total_minutes = {}
for key, seconds in runtime_total.items():
    runtime_total_minutes[key] = seconds/60
runtime_total_minutes
#runtime_minutes = {k: v/60 for k, v in runtime.items()}