In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_selection import VarianceThreshold
import tensorflow as tf
# Report the TensorFlow build in use and whether it can see a GPU.
# `sep="\n"` keeps each heading and its value on separate lines.
print("TensorFlow version:", tf.__version__, sep="\n")
print("Check if the GPU is recognized by TensorFlow:", tf.test.is_gpu_available(), sep="\n")

TensorFlow version:
1.15.0
Check if the GPU is recognized by TensorFlow:
False


# Metabolomics and microbiome datasets

## Metabolomics dataset

The dataset is on ratio scale. Any missing values will be replaced with one.

In [2]:
# Load the tab-separated metabolite table, keyed by its FAMILY_ID column.
filename_metabolites = 'metabolite_data_untargeted_families_baseline_samples.txt'
data_metabolites = pd.read_csv(filename_metabolites, index_col='FAMILY_ID', delimiter='\t')
# Missing values are replaced with one (see the note above this cell).
data_metabolites = data_metabolites.fillna(1)
data_metabolites.index.name = '#OTU ID'   # Use the standard MMvec index name
data_metabolites.shape

(1225, 56)

In [3]:
# Preview the first rows of the metabolite table (rows: metabolite IDs, columns: samples).
data_metabolites.head()

Unnamed: 0_level_0,EMR_04_1_JG,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-22_RD,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
X940001,1.917066,1.097793,1.102515,1.492751,0.949165,0.658015,1.039717,1.248009,1.311771,1.077183,...,1.14581,1.086979,1.554093,1.347086,1.506364,1.155302,1.206615,0.743653,1.174041,1.349065
X940002,0.937738,1.105094,0.721868,1.301356,0.808781,1.422914,1.331156,1.388068,1.215894,1.232049,...,0.995496,1.04075,0.993713,1.084026,0.535335,0.620074,1.172014,1.050518,1.183928,1.12406
X940005,0.632835,1.223279,0.54794,0.616814,0.525264,0.959493,2.002433,1.084656,0.933708,1.104386,...,0.572647,1.197824,1.390256,1.071936,1.238504,1.008577,0.941165,0.873972,1.113174,1.055592
X940007,1.563082,0.679953,1.444796,1.953196,1.700636,1.860522,1.980057,2.443831,1.066134,2.12019,...,1.775266,0.911157,1.236485,0.800783,1.061433,1.500792,1.064874,2.132671,1.168662,1.533733
X940010,0.688643,0.492065,6.018816,1.734043,3.75945,1.384322,0.378216,1.061845,4.077086,0.702574,...,0.370022,0.827221,0.711317,0.966257,1.264027,2.173998,0.815656,0.941501,0.300463,0.992282


Scale the dataset so that the sum of all features in each sample is 1. This is needed because MMvec expects compositional data for both microbes and metabolites:

In [4]:
# Closure: divide every sample (column) by its own total so each column sums to 1.
sample_totals = data_metabolites.sum(axis=0)
data_metabolites = data_metabolites.div(sample_totals, axis='columns')
# Show the first few column sums as a quick check of the result.
data_metabolites.sum(axis=0).head()

EMR_04_1_JG     1.0
EMR_04_10_MD    1.0
EMR_04_11_AT    1.0
EMR_04_13_KB    1.0
EMR_04_15_DB    1.0
dtype: float64

## Microbiome dataset

In [5]:
# Load the tab-separated microbiome table, keyed by its Major_Taxa column.
filename_microbiome = 'data_propi_staph_wide.tsv'
data_microbiome = pd.read_csv(filename_microbiome, index_col='Major_Taxa', delimiter='\t')
data_microbiome.index.name = '#OTU ID'   # Use the standard MMvec index name
data_microbiome.shape

(5, 55)

In [6]:
# Preview the microbiome table (rows: taxa, columns: samples).
data_microbiome.head()

Unnamed: 0_level_0,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_1_JG,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-13_LB,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Propionibacteriaceae,0.332027,0.666855,0.561543,0.891106,0.953581,0.934267,0.744176,0.857723,0.952992,0.565385,...,0.203518,0.206553,0.874048,0.472923,0.765998,0.898414,0.520952,0.946872,0.874343,0.996344
Staphylococcus caprae or capitis,0.5288,0.332987,0.219763,0.103,0.045315,0.000717,0.254573,0.070421,0.046224,0.000611,...,0.465449,0.786049,0.119447,0.436955,0.23342,0.099802,0.380629,0.052941,0.060375,0.000445
Staphylococcus epidermidis,0.138998,0.000158,0.217896,0.005556,0.001055,0.061597,0.001103,0.070647,0.000734,0.005383,...,0.330953,0.005747,0.001297,0.087936,0.000482,0.001783,0.098053,0.000175,0.043466,0.0
Staphylococcus hominis,0.000174,0.0,0.000789,0.000102,4.9e-05,0.002969,0.000148,0.001209,2.2e-05,0.427676,...,0.0,0.001491,0.003537,0.002185,4.5e-05,0.0,0.000134,3e-06,0.002729,0.003211
Other Staphylococci,0.0,0.0,9e-06,0.000235,0.0,0.000449,0.0,0.0,2.8e-05,0.000946,...,8.1e-05,0.000161,0.00167,0.0,5.5e-05,0.0,0.000232,1e-05,0.019086,0.0


Check if the microbiome dataset is scaled properly:

In [7]:
# Per-sample (column) totals; only the first five samples are displayed here.
data_microbiome.sum(axis=0).head()

EMR_04_10_MD    1.0
EMR_04_11_AT    1.0
EMR_04_13_KB    1.0
EMR_04_15_DB    1.0
EMR_04_17_BS    1.0
dtype: float64

## Data consistency

Check whether both datasets have the same set of columns (samples):

In [8]:
# Samples (columns) that appear in exactly one of the two tables.
set(data_microbiome.columns).symmetric_difference(data_metabolites.columns)

{'EMR_04_N-22_RD'}

Sample 'EMR_04_N-22_RD' is missing from the microbiome dataset, so remove it from the metabolites dataset as well:

In [9]:
# Remove every metabolite sample (column) that has no counterpart in the
# microbiome table. The list is derived from the data rather than hard-coded:
# dropping a fixed column name raises a KeyError when this cell is re-run
# after the column is already gone, whereas an empty difference is a no-op.
samples_without_microbiome = data_metabolites.columns.difference(data_microbiome.columns)
data_metabolites = data_metabolites.drop(columns=samples_without_microbiome)
data_metabolites.shape

(1225, 55)

In [10]:
# Repeat the consistency check: an empty set means both tables share the same samples.
set(data_microbiome.columns).symmetric_difference(data_metabolites.columns)

set()

## Biom format

In [11]:
# Write both tables as tab-separated text. The output name is the input name
# with its last four characters (the extension) replaced by '_prepared.txt'.
for table, source_name in ((data_metabolites, filename_metabolites),
                           (data_microbiome, filename_microbiome)):
    table.to_csv(source_name[:-4]+'_prepared.txt', sep = '\t')

# Echo the generated file names; the `biom convert` cell uses them as input.
for source_name in (filename_metabolites, filename_microbiome):
    print(source_name[:-4]+'_prepared.txt')

metabolite_data_untargeted_families_baseline_samples_prepared.txt
data_propi_staph_wide_prepared.txt


In [12]:
# Convert the prepared metabolite text table to an HDF5 .biom file.
!biom convert -i metabolite_data_untargeted_families_baseline_samples_prepared.txt \
    -o metabolite_data_untargeted_families_baseline_samples_prepared.biom \
    --to-hdf5 --table-type='Metabolite table'

# Convert the prepared microbiome text table to an HDF5 .biom file.
!biom convert -i data_propi_staph_wide_prepared.txt \
    -o data_propi_staph_wide_prepared.biom \
    --to-hdf5 --table-type='OTU table'

## MMvec

Check progress in TensorBoard: open a new shell, change to the MMvec summary directory (`summary_dir`), run `tensorboard --logdir .`, and open http://localhost:6006/ in a browser.

Guidelines for optimizing model parameters:

* https://github.com/biocore/mmvec#faqs
* https://saturncloud.io/blog/understanding-tensorflow-batch-size-in-input-placeholder/

In [13]:
import os
import time
import click
import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from biom import load_table, Table
from biom.util import biom_open
from skbio import OrdinationResults
from skbio.stats.composition import clr, centralize, closure
from skbio.stats.composition import clr_inv as softmax
from scipy.stats import entropy, spearmanr
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
import tensorflow as tf
from tensorflow.contrib.distributions import Multinomial, Normal
from mmvec.multimodal import MMvec
from mmvec.util import split_tables, format_params
import matplotlib.pyplot as plt

  import pandas.util.testing as pdt


In [14]:
# --- Input tables (the .biom files written by the `biom convert` step) ---
microbe_file = 'data_propi_staph_wide_prepared.biom'
metabolite_file = 'metabolite_data_untargeted_families_baseline_samples_prepared.biom'

# --- Train/test split (forwarded to split_tables) ---
metadata_file = None          # optional sample-metadata table
training_column = None        # metadata column used for the split, if any
num_testing_examples = 7      # passed as `num_test`
min_feature_count = 1         # passed as `min_samples`

# --- Model and optimiser settings (forwarded to MMvec) ---
latent_dim = 3
input_prior = 0.1             # passed as `u_scale`
output_prior = 0.1            # passed as `v_scale`
learning_rate = 0.001
beta1 = 0.9
beta2 = 0.95
clipnorm = 10.0
batch_size = 4096
epochs = 30000
arm_the_gpu = False           # True selects '/device:GPU:0' instead of the CPU

# --- Logging and output locations ---
summary_interval = 1
checkpoint_interval = 1000
summary_dir = 'mmvec_propi_staph'
embeddings_file = None        # None -> derived from the run name later
ranks_file = None             # None -> derived from the run name later
ordination_file = None        # None -> derived from the run name later
equalize_biplot = False

In [15]:
# Load the microbe table from the .biom file named in the configuration cell.
microbes = load_table(microbe_file)
microbes

5 x 55 <class 'biom.table.Table'> with 243 nonzero entries (88% dense)

In [16]:
# Load the metabolite table from the .biom file named in the configuration cell.
metabolites = load_table(metabolite_file)
metabolites

1225 x 55 <class 'biom.table.Table'> with 67375 nonzero entries (100% dense)

In [17]:
# Optional sample metadata; it is only read when a metadata file is configured.
if metadata_file is not None:
    metadata = pd.read_table(metadata_file, index_col=0)
else:
    metadata = None

# Without metadata/training column the hold-out samples are not fixed by the
# configuration (the run shown in this notebook produced 6 test samples for
# num_testing_examples = 7).
# NOTE(review): mmvec's split_tables appears to draw the hold-out samples with
# NumPy's global RNG — confirm against the installed mmvec version. Seeding it
# here makes the train/test split reproducible under Restart & Run All.
split_seed = 0
np.random.seed(split_seed)

res = split_tables(
    microbes, metabolites,
    metadata=metadata, training_column=training_column,
    num_test=num_testing_examples,
    min_samples=min_feature_count)

# Unpack in the order returned: microbes (train, test), then metabolites (train, test).
(train_microbes_df, test_microbes_df,
 train_metabolites_df, test_metabolites_df) = res

In [18]:
# Preview the training microbes (rows: samples, columns: taxa).
train_microbes_df.head()

Unnamed: 0,Propionibacteriaceae,Staphylococcus caprae or capitis,Staphylococcus epidermidis,Staphylococcus hominis,Other Staphylococci
EMR_04_13_KB,0.561543,0.219763,0.217896,0.000789,9e-06
EMR_04_15_DB,0.891106,0.103,0.005556,0.000102,0.000235
EMR_04_17_BS,0.953581,0.045315,0.001055,4.9e-05,0.0
EMR_04_19_BK,0.934267,0.000717,0.061597,0.002969,0.000449
EMR_04_1_JG,0.744176,0.254573,0.001103,0.000148,0.0


In [19]:
# Show the full held-out microbe table (rows: samples, columns: taxa).
test_microbes_df

Unnamed: 0,Propionibacteriaceae,Staphylococcus caprae or capitis,Staphylococcus epidermidis,Staphylococcus hominis,Other Staphylococci
EMR_04_10_MD,0.332027,0.5288,0.138998,0.000174,0.0
EMR_04_11_AT,0.666855,0.332987,0.000158,0.0,0.0
EMR_04_44_DTA,0.684542,0.314588,0.000655,0.000216,0.0
EMR_04_48_LD,0.263309,0.723342,0.011894,0.000441,0.001014
EMR_04_68_VG,0.510034,0.015315,0.452309,0.022058,0.000283
EMR_04_N-19_RC,0.472923,0.436955,0.087936,0.002185,0.0


In [20]:
# Preview the training metabolites (rows: samples, columns: metabolite IDs).
train_metabolites_df.head()

Unnamed: 0,X940001,X940002,X940005,X940007,X940010,X940013,X940014,X940015,X940019,X940021,...,X970731,X970732,X970737,X970745,X970784,X970791,X970813,X970860,X970932,X970960
EMR_04_13_KB,0.001207,0.001052,0.000499,0.001579,0.001402,0.001208,8e-06,0.000993,0.00029,0.002349,...,0.000681,0.000911,0.000508,0.000191,0.000576,0.000268,0.000616,0.00065,0.000475,0.000623
EMR_04_15_DB,0.000609,0.000519,0.000337,0.00109,0.00241,0.000648,0.000713,0.000954,0.000217,0.000216,...,0.000761,0.000494,0.001558,0.000121,0.000682,0.000541,0.000529,0.000675,0.001299,0.000731
EMR_04_17_BS,0.000483,0.001044,0.000704,0.001365,0.001015,0.000772,5.7e-05,0.000941,0.000708,4.3e-05,...,0.000571,0.000589,0.000571,6.9e-05,0.000297,0.000375,0.000893,0.000822,0.000858,0.000818
EMR_04_19_BK,0.000788,0.001009,0.001518,0.001501,0.000287,0.001373,4.6e-05,0.000743,0.001283,0.001121,...,0.000759,0.000904,0.000367,0.000608,0.00066,8.5e-05,0.000543,0.001012,0.000451,0.000364
EMR_04_1_JG,0.001053,0.000515,0.000348,0.000859,0.000378,0.000809,9.7e-05,0.000592,0.000452,0.011874,...,0.000436,0.001521,0.000439,7.5e-05,0.001096,0.000208,0.00044,0.000679,0.000701,9.7e-05


In [21]:
# Show the full held-out metabolite table (rows: samples, columns: metabolite IDs).
test_metabolites_df

Unnamed: 0,X940001,X940002,X940005,X940007,X940010,X940013,X940014,X940015,X940019,X940021,...,X970731,X970732,X970737,X970745,X970784,X970791,X970813,X970860,X970932,X970960
EMR_04_10_MD,0.000557,0.000561,0.000621,0.000345,0.00025,0.000675,6.9e-05,0.000929,0.00119,0.000441,...,0.000745,0.000557,0.001039,6.5e-05,0.000935,6e-06,0.001302,0.001337,0.001439,0.000112
EMR_04_11_AT,0.000589,0.000386,0.000293,0.000772,0.003218,0.000635,4.8e-05,0.000659,0.000239,5.2e-05,...,0.000611,0.000364,0.000174,6.6e-05,0.001296,0.000104,0.001231,0.001205,0.001336,0.0103
EMR_04_44_DTA,0.0011,0.000779,0.00068,0.00082,0.001336,0.001403,0.000277,0.000806,0.000249,0.001969,...,0.000794,0.000895,0.000581,0.000227,0.000836,0.000476,0.000885,0.000792,0.000674,0.000287
EMR_04_48_LD,0.000917,0.000611,0.000723,0.002124,0.000357,0.001125,0.000101,0.00092,0.00049,0.001,...,0.000302,0.001139,0.000757,0.000169,0.000983,0.000455,0.000452,0.00052,0.000381,0.000467
EMR_04_68_VG,0.000683,0.000551,0.000627,0.000562,0.001528,0.000675,5.7e-05,0.001161,0.000438,0.0001,...,0.000882,0.000512,0.000415,7.9e-05,0.000387,0.000902,0.001187,0.000427,0.000531,0.000296
EMR_04_N-19_RC,0.001183,0.000756,0.001058,0.000941,0.000541,0.000654,6.1e-05,0.000422,0.000409,0.002472,...,0.000504,0.000884,0.000248,0.00025,0.000381,0.001506,0.000904,0.000326,0.000389,0.000665


In [22]:
params = []

# Encode the key hyperparameters in the run name; the resulting path under
# `summary_dir` is later handed to MMvec as its save_path.
sname = os.path.join(
    summary_dir,
    'latent_dim_%s_input_prior_%.2f_output_prior_%.2f_beta1_%.2f_beta2_%.2f'
    % (latent_dim, input_prior, output_prior, beta1, beta2))

# Any output path left as None in the configuration is derived from the run name.
embeddings_file = sname + "_embedding.txt" if embeddings_file is None else embeddings_file
ranks_file = sname + "_ranks.txt" if ranks_file is None else ranks_file
ordination_file = sname + "_ordination.txt" if ordination_file is None else ordination_file

In [23]:
# NOTE(review): the shape shown below is (5, 55) and the microbe table has
# 5 taxa and 55 samples, so `n` receives the taxa count and `d1` the sample
# count here — confirm that is intended before relying on either name.
n, d1 = microbes.shape
microbes.shape

(5, 55)

In [24]:
# NOTE(review): this rebinds `n`; with the (1225, 55) shape shown below, `n`
# becomes the metabolite count and `d2` the sample count — confirm that is
# intended before relying on either name.
n, d2 = metabolites.shape
metabolites.shape

(1225, 55)

In [25]:
# Training microbe values as a SciPy COO sparse matrix (passed to the model in the training cell).
train_microbes_coo = coo_matrix(train_microbes_df.values)
train_microbes_coo

<49x5 sparse matrix of type '<class 'numpy.float64'>'
	with 218 stored elements in COOrdinate format>

In [26]:
# Held-out microbe values as a SciPy COO sparse matrix (passed to the model in the training cell).
test_microbes_coo = coo_matrix(test_microbes_df.values)
test_microbes_coo

<6x5 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in COOrdinate format>

In [27]:
# Use the first GPU when `arm_the_gpu` is set, otherwise stay on the CPU.
device_name = '/device:GPU:0' if arm_the_gpu else '/cpu:0'

# Default TensorFlow session configuration.
config = tf.ConfigProto()

In [28]:
# Build and train the MMvec model inside a fresh TF graph/session, then write
# three outputs: the embeddings table, the ranks table and a biplot
# ordination file.
with tf.Graph().as_default(), tf.Session(config=config) as session:
    model = MMvec(
        latent_dim=latent_dim,
        u_scale=input_prior, v_scale=output_prior,
        learning_rate = learning_rate,
        beta_1=beta1, beta_2=beta2,
        device_name=device_name,
        batch_size=batch_size,
        clipnorm=clipnorm, save_path=sname)

    # Hand the session plus the train/test data to the model: microbes as
    # COO sparse matrices, metabolites as the DataFrames' `.values` arrays.
    model(session,
          train_microbes_coo, train_metabolites_df.values,
          test_microbes_coo, test_metabolites_df.values)

    # Train the model. `loss` and `cv` are presumably the training loss and
    # the cross-validation error — TODO confirm against mmvec.
    loss, cv = model.fit(epoch=epochs, summary_interval=summary_interval,
                         checkpoint_interval=checkpoint_interval)

    pc_ids = list(range(latent_dim))
    vdim = model.V.shape[0]
    # Prepend a zero column to V (then transpose) and a zero to Vbias, so both
    # gain one leading entry. Presumably this is the reference metabolite
    # whose parameters are fixed at 0 — TODO confirm.
    V = np.hstack((np.zeros((vdim, 1)), model.V))
    V = V.T
    Vbias = np.hstack((np.zeros(1), model.Vbias.ravel()))

    # Save to an embeddings file: microbe (U) and metabolite (V) parameters
    # followed by both bias vectors, stacked row-wise into a single table.
    Uparam = format_params(model.U, pc_ids, list(train_microbes_df.columns), 'microbe')
    Vparam = format_params(V, pc_ids, list(train_metabolites_df.columns), 'metabolite')
    df = pd.concat(
        (
            Uparam, Vparam,
            format_params(model.Ubias, ['bias'], train_microbes_df.columns, 'microbe'),
            format_params(Vbias, ['bias'], train_metabolites_df.columns, 'metabolite')
        ), axis=0)

    df.to_csv(embeddings_file, sep='\t')

    # Save to a ranks file
    # model.ranks() is labelled here as microbes (rows) x metabolites (columns).
    ranks = pd.DataFrame(model.ranks(), index=train_microbes_df.columns,
                         columns=train_metabolites_df.columns)

    # Truncated SVD (k = latent_dim) of the ranks after subtracting each
    # metabolite column's mean. This must run before `ranks` is transposed.
    u, s, v = svds(ranks - ranks.mean(axis=0), k=latent_dim)
    # The file stores the transpose: one row per metabolite.
    ranks = ranks.T
    ranks.index.name = 'featureid'
    ranks.to_csv(ranks_file, sep='\t')
    # Save to an ordination file
    # Reverse the component order of s, u and v consistently — presumably so
    # the largest singular value comes first; svds' ordering should be
    # confirmed for the installed SciPy version.
    s = s[::-1]
    u = u[:, ::-1]
    v = v[::-1, :]
    if equalize_biplot:
        # Split the singular values evenly (square root) between both embeddings.
        microbe_embed = u @ np.sqrt(np.diag(s))
        metabolite_embed = v.T @ np.sqrt(np.diag(s))
    else:
        # Put the full singular-value scaling on the microbe side only.
        microbe_embed = u @ np.diag(s)
        metabolite_embed = v.T
    pc_ids = ['PC%d' % i for i in range(microbe_embed.shape[1])]
    # Naming caveat: `features` holds the microbe coordinates and `samples`
    # the metabolite coordinates; they are passed to OrdinationResults as such.
    features = pd.DataFrame(
        microbe_embed, columns=pc_ids,
        index=train_microbes_df.columns)
    samples = pd.DataFrame(
        metabolite_embed, columns=pc_ids,
        index=train_metabolites_df.columns)
    short_method_name = 'mmvec biplot'
    long_method_name = 'Multiomics mmvec biplot'
    # The singular values are stored as "eigvals"; the proportion explained is
    # each squared singular value over the sum of squares.
    eigvals = pd.Series(s, index=pc_ids)
    proportion_explained = pd.Series(s**2 / np.sum(s**2), index=pc_ids)
    biplot = OrdinationResults(
        short_method_name, long_method_name, eigvals,
        samples=samples, features=features,
        proportion_explained=proportion_explained)
    biplot.write(ordination_file)


Instructions for updating:
Use `tf.random.categorical` instead.

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




100%|██████████| 1596/1596 [01:56<00:00, 13.72it/s]
