In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_selection import VarianceThreshold
import tensorflow as tf
# Report the TensorFlow build in use and whether it can see a GPU.
# sep="\n" keeps every label and its value on separate lines.
print("TensorFlow version:", tf.__version__, sep="\n")
print("Check if the GPU is recognized by TensorFlow:", tf.test.is_gpu_available(), sep="\n")

TensorFlow version:
1.15.0
Check if the GPU is recognized by TensorFlow:
False


# Metabolomics and microbiome datasets

## Metabolomics dataset

The dataset is on a ratio scale. Any missing values are replaced with 1.

In [2]:
# Load the tab-separated metabolite table, indexed by its FAMILY_ID column.
filename_metabolites = 'metabolite_data_untargeted_families_baseline_samples.txt'
data_metabolites = (
    pd.read_csv(filename_metabolites, sep="\t", index_col='FAMILY_ID')
    .fillna(1)                # missing values are replaced with 1
    .rename_axis('#OTU ID')   # Use the standard MMvec index name
)
data_metabolites.shape

(1225, 56)

In [3]:
# Preview the first five rows of the metabolite table.
data_metabolites.head(n=5)

Unnamed: 0_level_0,EMR_04_1_JG,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-22_RD,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
X940001,1.917066,1.097793,1.102515,1.492751,0.949165,0.658015,1.039717,1.248009,1.311771,1.077183,...,1.14581,1.086979,1.554093,1.347086,1.506364,1.155302,1.206615,0.743653,1.174041,1.349065
X940002,0.937738,1.105094,0.721868,1.301356,0.808781,1.422914,1.331156,1.388068,1.215894,1.232049,...,0.995496,1.04075,0.993713,1.084026,0.535335,0.620074,1.172014,1.050518,1.183928,1.12406
X940005,0.632835,1.223279,0.54794,0.616814,0.525264,0.959493,2.002433,1.084656,0.933708,1.104386,...,0.572647,1.197824,1.390256,1.071936,1.238504,1.008577,0.941165,0.873972,1.113174,1.055592
X940007,1.563082,0.679953,1.444796,1.953196,1.700636,1.860522,1.980057,2.443831,1.066134,2.12019,...,1.775266,0.911157,1.236485,0.800783,1.061433,1.500792,1.064874,2.132671,1.168662,1.533733
X940010,0.688643,0.492065,6.018816,1.734043,3.75945,1.384322,0.378216,1.061845,4.077086,0.702574,...,0.370022,0.827221,0.711317,0.966257,1.264027,2.173998,0.815656,0.941501,0.300463,0.992282


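Scale the dataset so that the features of every sample sum to the same total (number of features × 1000, i.e. 1,225,000 here). Only the relative abundances within a sample matter, so this is equivalent to scaling each sample to 1. This is needed because MMvec expects compositional data for both microbes and metabolites: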

In [4]:
# Rescale every sample (column) to the same total of n_features * 1000.
# The operation order (data * n_features) * 1000 / totals is kept so the
# floating-point result is unchanged.
n_metabolite_features = data_metabolites.shape[0]
metabolite_sample_totals = data_metabolites.sum(axis=0)
data_metabolites = (
    data_metabolites
    .mul(n_metabolite_features)
    .mul(1000)
    .div(metabolite_sample_totals, axis='columns')
)
# Sanity check: every sample should now report the same total.
data_metabolites.sum(axis=0).head()

EMR_04_1_JG     1225000.0
EMR_04_10_MD    1225000.0
EMR_04_11_AT    1225000.0
EMR_04_13_KB    1225000.0
EMR_04_15_DB    1225000.0
dtype: float64

## Microbiome dataset

In [5]:
# Load the tab-separated microbiome table, indexed by its Major_Taxa column.
filename_microbiome = 'data_major_taxa_wide.tsv'
data_microbiome = (
    pd.read_csv(filename_microbiome, sep="\t", index_col='Major_Taxa')
    .rename_axis('#OTU ID')   # Use the standard MMvec index name
)
data_microbiome.shape

(12, 55)

In [6]:
# Preview the first five rows of the microbiome table as loaded.
data_microbiome.head(n=5)

Unnamed: 0_level_0,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_1_JG,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-13_LB,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Propionibacteriaceae,0.331976,0.65702,0.541306,0.888762,0.953391,0.9308,0.71458,0.856592,0.942964,0.561774,...,0.202921,0.197956,0.86996,0.447994,0.765815,0.896541,0.515592,0.945658,0.738912,0.996317
Staphylococcus caprae or capitis,0.528719,0.328076,0.211843,0.10273,0.045306,0.000715,0.244449,0.070328,0.045738,0.000607,...,0.464085,0.753332,0.118889,0.413922,0.233364,0.099594,0.376712,0.052873,0.051024,0.000445
Staphylococcus epidermidis,0.138977,0.000156,0.210044,0.005542,0.001055,0.061369,0.001059,0.070554,0.000726,0.005348,...,0.329984,0.005508,0.001291,0.083301,0.000482,0.00178,0.097044,0.000175,0.036734,0.0
Staphylococcus hominis,0.000174,0.0,0.000761,0.000102,4.9e-05,0.002958,0.000142,0.001208,2.2e-05,0.424945,...,0.0,0.001429,0.003521,0.00207,4.5e-05,0.0,0.000132,3e-06,0.002306,0.003211
Other Staphylococci,0.0,0.0,8e-06,0.000234,0.0,0.000448,0.0,0.0,2.8e-05,0.000939,...,8.1e-05,0.000155,0.001662,0.0,5.5e-05,0.0,0.00023,1e-05,0.01613,0.0


Scale the microbiome dataset the same way (each sample sums to number of taxa × 1000, i.e. 12,000 here) and check the result:

In [7]:
# Rescale every sample (column) to the same total of n_taxa * 1000.
# The operation order (data * n_taxa) * 1000 / totals is kept so the
# floating-point result is unchanged.
n_microbiome_taxa = data_microbiome.shape[0]
microbiome_sample_totals = data_microbiome.sum(axis=0)
data_microbiome = (
    data_microbiome
    .mul(n_microbiome_taxa)
    .mul(1000)
    .div(microbiome_sample_totals, axis='columns')
)
# Sanity check: every sample should now report the same total.
data_microbiome.sum(axis=0).head()

EMR_04_10_MD    12000.0
EMR_04_11_AT    12000.0
EMR_04_13_KB    12000.0
EMR_04_15_DB    12000.0
EMR_04_17_BS    12000.0
dtype: float64

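Scramble microbes: show the scaled table, then shuffle the sample columns and the taxon rows while putting the original labels back in their original order, so that the values are decoupled from their sample and taxon labels.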

In [8]:
# Preview the scaled microbiome table before it is scrambled.
data_microbiome.head(n=5)

Unnamed: 0_level_0,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_1_JG,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-13_LB,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Propionibacteriaceae,3983.712,7884.245592,6495.667746,10665.144,11440.695492,11169.604957,8574.963261,10279.107262,11315.564461,6741.286423,...,2435.056558,2375.46671,10439.5236,5375.92308,9189.779079,10758.4968,6187.098042,11347.89474,8866.9416,11955.8028
Staphylococcus caprae or capitis,6344.628,3936.909996,2542.114545,1232.754,543.670908,8.5752,2933.387091,843.937369,548.851139,7.29,...,5569.024242,9039.987921,1426.6632,4967.060196,2800.373721,1195.1316,4520.546105,634.48086,612.2832,5.3436
Staphylococcus epidermidis,1667.724,1.8696,2520.525345,66.4992,12.6552,736.423055,12.7116,846.646969,8.7132,64.180824,...,3959.804,66.090024,15.4932,999.607404,5.778,21.354,1164.528229,2.0952,440.8044,0.0
Staphylococcus hominis,2.0868,0.0,9.1296,1.2216,0.5928,35.499588,1.7064,14.4936,0.2664,5099.334329,...,0.0,17.1432,42.2496,24.8412,0.5364,0.0,1.5864,0.03,27.6756,38.5308
Other Staphylococci,0.0,0.0,0.1008,2.8116,0.0,5.3736,0.0,0.0,0.3348,11.274,...,0.966,1.8552,19.9452,0.0,0.6648,0.0,2.7576,0.114,193.5564,0.0


In [9]:
# Shuffle the sample columns, then put the original sample labels back in their
# original order, which decouples the values from their sample labels.
# A fixed seed makes the scramble reproducible under Restart Kernel -> Run All.
# Re-running only this cell shuffles the already-shuffled table again.
COLUMN_SCRAMBLE_SEED = 2  # arbitrary, fixed for reproducibility
columns_microbiome = data_microbiome.columns
data_microbiome = data_microbiome.sample(
    frac=1, axis=1, replace=False, random_state=COLUMN_SCRAMBLE_SEED
)
data_microbiome.columns = columns_microbiome
data_microbiome.head()

Unnamed: 0_level_0,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_1_JG,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-13_LB,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Propionibacteriaceae,6741.286423,2435.056558,11868.639612,11079.371819,10694.115468,269.547552,6187.098042,5375.92308,11387.931649,840.2028,...,10665.144,8574.963261,11169.604957,11799.164436,3983.712,8808.2328,11686.07886,7080.943908,3156.155685,6495.667746
Staphylococcus caprae or capitis,7.29,5569.024242,10.1352,66.712812,1277.102532,11583.968472,4520.546105,4967.060196,27.5352,2151.558,...,1232.754,2933.387091,8.5752,55.765188,6344.628,226.2732,21.8304,4759.78608,8670.327927,2542.114545
Staphylococcus epidermidis,64.180824,3959.804,60.903588,30.711612,15.4116,124.911576,1164.528229,999.607404,487.503552,0.4332,...,66.4992,12.7116,736.423055,41.738388,1667.724,2742.0672,41.024388,157.192812,142.565988,2520.525345
Staphylococcus hominis,5099.334329,0.0,1.6476,5.5068,1.7664,10.7364,1.5864,24.8412,0.4932,0.1632,...,1.2216,1.7064,35.499588,6.87,2.0868,25.5612,0.0,0.0756,5.2872,9.1296
Other Staphylococci,11.274,0.966,0.4092,16.8204,0.0,0.0,2.7576,0.0,9.2544,104.994,...,2.8116,0.0,5.3736,0.2004,0.0,5.8968,0.5292,0.0,12.15,0.1008


In [10]:
# Shuffle the taxon rows, then put the original taxon labels back in their
# original order, which decouples the values from their taxon labels.
# A fixed seed makes the scramble reproducible under Restart Kernel -> Run All.
# Re-running only this cell shuffles the already-shuffled table again.
ROW_SCRAMBLE_SEED = 2  # arbitrary, fixed for reproducibility
index_microbiome = data_microbiome.index
data_microbiome = data_microbiome.sample(
    frac=1, replace=False, random_state=ROW_SCRAMBLE_SEED
)
data_microbiome.index = index_microbiome
data_microbiome

Unnamed: 0_level_0,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_1_JG,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-13_LB,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Propionibacteriaceae,0.0,0.0,0.0,0.0,1.3872,0.0,1.224,509.191296,3.8112,8847.3408,...,0.0,0.0,0.0,0.0,0.0,2.9844,0.0,0.0,0.0,38.892
Staphylococcus caprae or capitis,0.0,28.044,0.0,288.48366,8.3112,0.0,64.993212,2.9208,0.0,0.0,...,0.0,11.3376,3.0168,0.0,0.0,35.7192,0.0,0.0,0.0,380.947164
Staphylococcus epidermidis,60.616824,3.51,54.234,16.6656,0.3924,0.4596,1.626,32.962812,10.8264,39.7524,...,20.8344,457.885248,1.644,0.0,0.0,0.0,12.2916,0.9492,1.0632,0.6672
Staphylococcus hominis,64.180824,3959.804,60.903588,30.711612,15.4116,124.911576,1164.528229,999.607404,487.503552,0.4332,...,66.4992,12.7116,736.423055,41.738388,1667.724,2742.0672,41.024388,157.192812,142.565988,2520.525345
Other Staphylococci,2.8128,0.93,3.4308,71.154012,0.9384,7.3848,55.484412,87.493212,4.8144,12.9168,...,7.5996,4.7316,36.7032,9.996,1.5228,70.8672,15.7872,0.8676,8.0508,11.7156
Polyomavirus HPyV6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Polyomavirus HPyV7,5099.334329,0.0,1.6476,5.5068,1.7664,10.7364,1.5864,24.8412,0.4932,0.1632,...,1.2216,1.7064,35.499588,6.87,2.0868,25.5612,0.0,0.0756,5.2872,9.1296
Merkel Cell Polyomavirus,12.4032,2.6652,0.6,2.4024,0.0,2.9916,0.0,0.0,9.8988,2.5788,...,3.1356,0.7656,1.5396,63.437988,0.0,1.3968,220.858752,0.1848,0.1932,0.24
Malasseziaceae,6741.286423,2435.056558,11868.639612,11079.371819,10694.115468,269.547552,6187.098042,5375.92308,11387.931649,840.2028,...,10665.144,8574.963261,11169.604957,11799.164436,3983.712,8808.2328,11686.07886,7080.943908,3156.155685,6495.667746
Corynebacteriaceae,11.274,0.966,0.4092,16.8204,0.0,0.0,2.7576,0.0,9.2544,104.994,...,2.8116,0.0,5.3736,0.2004,0.0,5.8968,0.5292,0.0,12.15,0.1008


## Data consistency

Check whether both datasets have the same set of columns (samples):

In [11]:
# Samples that appear in exactly one of the two tables (empty set means the tables agree).
set(data_microbiome.columns).symmetric_difference(data_metabolites.columns)

{'EMR_04_N-22_RD'}

Sample 'EMR_04_N-22_RD' is missing from the microbiome dataset. Remove it from the metabolites dataset as well:

In [12]:
# Drop every metabolite sample that has no counterpart in the microbiome table
# (on this data that is only 'EMR_04_N-22_RD').
# Deriving the list instead of hard-coding it keeps the cell re-runnable:
# a second run finds nothing left to drop instead of raising KeyError.
samples_without_microbiome = data_metabolites.columns.difference(data_microbiome.columns)
data_metabolites = data_metabolites.drop(columns=samples_without_microbiome)
data_metabolites.shape

(1225, 55)

In [13]:
# Re-check after the drop: an empty set means both tables now share the same samples.
set(data_microbiome.columns).symmetric_difference(data_metabolites.columns)

set()

## Biom format

In [14]:
# Build each output name once (input name minus its 4-character extension, plus a
# suffix), then write both tables as tab-separated text and echo the names.
prepared_metabolites_path = filename_metabolites[:-4] + '_prepared.txt'
prepared_microbiome_path = filename_microbiome[:-4] + '_scrambled_2_prepared.txt'

data_metabolites.to_csv(prepared_metabolites_path, sep='\t')
data_microbiome.to_csv(prepared_microbiome_path, sep='\t')

for prepared_path in (prepared_metabolites_path, prepared_microbiome_path):
    print(prepared_path)

metabolite_data_untargeted_families_baseline_samples_prepared.txt
data_major_taxa_wide_scrambled_2_prepared.txt


In [15]:
# Convert the prepared tab-separated metabolite table to an HDF5 BIOM file,
# tagged as a 'Metabolite table'.
!biom convert -i metabolite_data_untargeted_families_baseline_samples_prepared.txt \
    -o metabolite_data_untargeted_families_baseline_samples_prepared.biom \
    --to-hdf5 --table-type='Metabolite table'

# Convert the prepared (scrambled) microbiome table to an HDF5 BIOM file,
# tagged as an 'OTU table'.
!biom convert -i data_major_taxa_wide_scrambled_2_prepared.txt \
    -o data_major_taxa_wide_scrambled_2_prepared.biom \
    --to-hdf5 --table-type='OTU table'

## MMvec

Check progress in TensorBoard: open a new shell, change to the MMvec summary directory (the `--summary-dir` passed below), run `tensorboard --logdir .`, and open http://localhost:6006/ in a browser.

Guidelines for optimizing model parameters:

* https://github.com/biocore/mmvec#faqs
* https://saturncloud.io/blog/understanding-tensorflow-batch-size-in-input-placeholder/

In [18]:
# Run MMvec on the scrambled microbiome BIOM table and the prepared metabolite
# BIOM table; the summary directory is mmvec_major_taxa_scrambled_2.
# NOTE(review): --epochs 102400 with --summary-interval 1 looks expensive on a
# machine with no GPU and a single CPU core (see the log below) - confirm these
# values are intended.
!mmvec paired-omics \
        --microbe-file data_major_taxa_wide_scrambled_2_prepared.biom \
        --metabolite-file metabolite_data_untargeted_families_baseline_samples_prepared.biom \
        --min-feature-count 1 --num-testing-examples 15\
        --summary-interval 1 --learning-rate 0.001 --latent-dim 3 --epochs 102400 --batch-size 4096 \
        --input-prior 100 --output-prior 100 \
        --summary-dir mmvec_major_taxa_scrambled_2

  import pandas.util.testing as pdt
OMP: Info #155: KMP_AFFINITY: Initial OS proc set respected: 0
OMP: Info #217: KMP_AFFINITY: decoding x2APIC ids.
OMP: Info #217: KMP_AFFINITY: cpuid leaf 11 not supported.
OMP: Info #217: KMP_AFFINITY: decoding legacy APIC ids.
OMP: Info #157: KMP_AFFINITY: 1 available OS procs
OMP: Info #158: KMP_AFFINITY: Uniform topology
OMP: Info #288: KMP_AFFINITY: topology layer "LL cache" is equivalent to "socket".
OMP: Info #192: KMP_AFFINITY: 1 socket x 1 core/socket x 1 thread/core (1 total cores)
OMP: Info #219: KMP_AFFINITY: OS proc to physical thread map:
OMP: Info #172: KMP_AFFINITY: OS proc 0 maps to socket 0 core 0 thread 0 
OMP: Info #255: KMP_AFFINITY: pid 995 tid 995 thread 0 bound to OS proc set 0


2023-08-13 16:19:00.499574: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable 