In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_selection import VarianceThreshold
import tensorflow as tf
# Show which TensorFlow build is active in this kernel.
print("TensorFlow version:", tf.__version__, sep="\n")
# Emit the label first so anything logged while probing devices follows it.
print("Check if the GPU is recognized by TensorFlow:")
gpu_visible = tf.test.is_gpu_available()
print(gpu_visible)

TensorFlow version:
1.15.0
Check if the GPU is recognized by TensorFlow:
False


# Metabolomics and microbiome datasets

## Metabolomics dataset

The dataset is on a ratio scale. Any missing values are replaced with 1.

In [2]:
# Read the tab-separated metabolite table (rows indexed by FAMILY_ID, one
# column per sample), fill missing values with 1, and give the index the
# standard MMvec name.
filename_metabolites = 'metabolite_data_untargeted_families_baseline_samples.txt'
data_metabolites = (
    pd.read_csv(filename_metabolites, sep="\t", index_col='FAMILY_ID')
    .replace(np.nan, 1)
    .rename_axis('#OTU ID')
)
data_metabolites.shape

(1225, 56)

In [3]:
# Preview the first five metabolite rows.
data_metabolites.head(n=5)

Unnamed: 0_level_0,EMR_04_1_JG,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-22_RD,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
X940001,1.917066,1.097793,1.102515,1.492751,0.949165,0.658015,1.039717,1.248009,1.311771,1.077183,...,1.14581,1.086979,1.554093,1.347086,1.506364,1.155302,1.206615,0.743653,1.174041,1.349065
X940002,0.937738,1.105094,0.721868,1.301356,0.808781,1.422914,1.331156,1.388068,1.215894,1.232049,...,0.995496,1.04075,0.993713,1.084026,0.535335,0.620074,1.172014,1.050518,1.183928,1.12406
X940005,0.632835,1.223279,0.54794,0.616814,0.525264,0.959493,2.002433,1.084656,0.933708,1.104386,...,0.572647,1.197824,1.390256,1.071936,1.238504,1.008577,0.941165,0.873972,1.113174,1.055592
X940007,1.563082,0.679953,1.444796,1.953196,1.700636,1.860522,1.980057,2.443831,1.066134,2.12019,...,1.775266,0.911157,1.236485,0.800783,1.061433,1.500792,1.064874,2.132671,1.168662,1.533733
X940010,0.688643,0.492065,6.018816,1.734043,3.75945,1.384322,0.378216,1.061845,4.077086,0.702574,...,0.370022,0.827221,0.711317,0.966257,1.264027,2.173998,0.815656,0.941501,0.300463,0.992282


Scale the dataset so that the features of every sample sum to the same constant (number of features × 1000, i.e. 1,225,000 here). This is needed because MMvec expects compositional data for both microbes and metabolites:

In [4]:
# Rescale every sample (column) so that its features add up to the same
# total, n_features * 1000. The factors are applied in the same order as
# before so the floating-point result is unchanged.
n_metabolite_features = data_metabolites.shape[0]
metabolite_sample_totals = data_metabolites.sum(axis=0)
data_metabolites = (
    data_metabolites
    .mul(n_metabolite_features)
    .mul(1000)
    .div(metabolite_sample_totals, axis='columns')
)
# Alternative (not used): scale the whole table by one global factor,
#   n_features * n_samples * 1000 / grand total of the table.
data_metabolites.sum(axis=0).head()

EMR_04_1_JG     1225000.0
EMR_04_10_MD    1225000.0
EMR_04_11_AT    1225000.0
EMR_04_13_KB    1225000.0
EMR_04_15_DB    1225000.0
dtype: float64

## Microbiome dataset

In [5]:
# Read the tab-separated taxa table (rows indexed by Major_Taxa, one column
# per sample) and give the index the standard MMvec name.
filename_microbiome = 'data_major_taxa_wide.tsv'
data_microbiome = (
    pd.read_csv(filename_microbiome, sep="\t", index_col='Major_Taxa')
    .rename_axis('#OTU ID')
)
data_microbiome.shape

(12, 55)

In [6]:
# Preview the first five taxa rows.
data_microbiome.head(n=5)

Unnamed: 0_level_0,EMR_04_10_MD,EMR_04_11_AT,EMR_04_13_KB,EMR_04_15_DB,EMR_04_17_BS,EMR_04_19_BK,EMR_04_1_JG,EMR_04_20_JP,EMR_04_21_DM,EMR_04_22_RS,...,EMR_04_N-13_LB,EMR_04_N-16_PP,EMR_04_N-18_JG,EMR_04_N-19_RC,EMR_04_N-21_AG,EMR_04_N-29_BP,EMR_04_N-31_JG,EMR_04_N-4_EV,EMR_04_N-5_CA,EMR_04_N-8_PC
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Propionibacteriaceae,0.331976,0.65702,0.541306,0.888762,0.953391,0.9308,0.71458,0.856592,0.942964,0.561774,...,0.202921,0.197956,0.86996,0.447994,0.765815,0.896541,0.515592,0.945658,0.738912,0.996317
Staphylococcus caprae or capitis,0.528719,0.328076,0.211843,0.10273,0.045306,0.000715,0.244449,0.070328,0.045738,0.000607,...,0.464085,0.753332,0.118889,0.413922,0.233364,0.099594,0.376712,0.052873,0.051024,0.000445
Staphylococcus epidermidis,0.138977,0.000156,0.210044,0.005542,0.001055,0.061369,0.001059,0.070554,0.000726,0.005348,...,0.329984,0.005508,0.001291,0.083301,0.000482,0.00178,0.097044,0.000175,0.036734,0.0
Staphylococcus hominis,0.000174,0.0,0.000761,0.000102,4.9e-05,0.002958,0.000142,0.001208,2.2e-05,0.424945,...,0.0,0.001429,0.003521,0.00207,4.5e-05,0.0,0.000132,3e-06,0.002306,0.003211
Other Staphylococci,0.0,0.0,8e-06,0.000234,0.0,0.000448,0.0,0.0,2.8e-05,0.000939,...,8.1e-05,0.000155,0.001662,0.0,5.5e-05,0.0,0.00023,1e-05,0.01613,0.0


Rescale the microbiome dataset the same way (every sample sums to number of taxa × 1000, i.e. 12,000 here) and check the resulting column sums:

In [7]:
# Rescale every sample (column) so that its taxa add up to n_taxa * 1000,
# applying the factors in the same order as before.
n_taxa = data_microbiome.shape[0]
microbiome_sample_totals = data_microbiome.sum(axis=0)
data_microbiome = (
    data_microbiome
    .mul(n_taxa)
    .mul(1000)
    .div(microbiome_sample_totals, axis='columns')
)
data_microbiome.sum(axis=0).head()

EMR_04_10_MD    12000.0
EMR_04_11_AT    12000.0
EMR_04_13_KB    12000.0
EMR_04_15_DB    12000.0
EMR_04_17_BS    12000.0
dtype: float64

## Data consistency

Check whether both datasets have the same set of columns (samples):

In [8]:
# Samples that appear in exactly one of the two tables.
set(data_microbiome.columns).symmetric_difference(data_metabolites.columns)

{'EMR_04_N-22_RD'}

Sample 'EMR_04_N-22_RD' is missing from the microbiome dataset. Remove it from the metabolites dataset as well:

In [9]:
# Remove the sample that has no counterpart in the microbiome table.
# errors='ignore' keeps this cell re-runnable: because the result is bound
# back to `data_metabolites`, a second execution would otherwise raise a
# KeyError for the already-removed column.
samples_missing_in_microbiome = ['EMR_04_N-22_RD']
data_metabolites = data_metabolites.drop(
    columns=samples_missing_in_microbiome, errors='ignore'
)
data_metabolites.shape

(1225, 55)

In [10]:
# Re-check that both tables now cover exactly the same samples. The assertion
# stops a "Run All" before the export step if any sample is still unmatched;
# the (expected empty) set is still displayed as the cell result.
unmatched_samples = set(data_microbiome.columns) ^ set(data_metabolites.columns)
assert not unmatched_samples, (
    "Sample sets differ between microbiome and metabolite tables: %s"
    % sorted(unmatched_samples)
)
unmatched_samples

set()

## Biom format

In [11]:
# Build each output name once: strip the 4-character extension from the
# input filename and append '_prepared.txt'.
metabolites_prepared_path = filename_metabolites[:-4] + '_prepared.txt'
microbiome_prepared_path = filename_microbiome[:-4] + '_prepared.txt'

# Write both tables as tab-separated text, then echo the file names.
data_metabolites.to_csv(metabolites_prepared_path, sep='\t')
data_microbiome.to_csv(microbiome_prepared_path, sep='\t')
for prepared_path in (metabolites_prepared_path, microbiome_prepared_path):
    print(prepared_path)

metabolite_data_untargeted_families_baseline_samples_prepared.txt
data_major_taxa_wide_prepared.txt


In [12]:
# Convert the tab-separated metabolite table written by the export cell into
# an HDF5 BIOM file. The input name is hard-coded and must match the file
# name printed by that cell.
!biom convert -i metabolite_data_untargeted_families_baseline_samples_prepared.txt \
    -o metabolite_data_untargeted_families_baseline_samples_prepared.biom \
    --to-hdf5 --table-type='Metabolite table'

# Same conversion for the microbiome (taxa) table, tagged as an OTU table.
!biom convert -i data_major_taxa_wide_prepared.txt \
    -o data_major_taxa_wide_prepared.biom \
    --to-hdf5 --table-type='OTU table'

## MMvec

Check progress in TensorBoard: open a new shell, change to the MMvec summary directory (`mmvec_major_taxa`, see `--summary-dir` below), run `tensorboard --logdir .`, and open http://localhost:6006/ in a browser.

Guidelines for optimizing model parameters:

* https://github.com/biocore/mmvec#faqs
* https://saturncloud.io/blog/understanding-tensorflow-batch-size-in-input-placeholder/

In [13]:
# Fit MMvec on the two BIOM tables produced by the `biom convert` cell;
# summaries are written to ./mmvec_major_taxa (--summary-dir), which is the
# directory to point TensorBoard at.
# NOTE(review): the exact meaning of each flag is not visible in this
# notebook -- confirm against `mmvec paired-omics --help`. Presumably
# --num-testing-examples 15 holds out 15 of the 55 samples for evaluation and
# --summary-interval is measured in seconds; verify before tuning.
# This cell is long-running (102400 epochs on a CPU-only TensorFlow build,
# per the GPU check at the top of the notebook).
!mmvec paired-omics \
        --microbe-file data_major_taxa_wide_prepared.biom \
        --metabolite-file metabolite_data_untargeted_families_baseline_samples_prepared.biom \
        --min-feature-count 1 --num-testing-examples 15\
        --summary-interval 1 --learning-rate 0.001 --latent-dim 3 --epochs 102400 --batch-size 4096 \
        --input-prior 1 --output-prior 1 \
        --summary-dir mmvec_major_taxa

  import pandas.util.testing as pdt
OMP: Info #155: KMP_AFFINITY: Initial OS proc set respected: 0
OMP: Info #217: KMP_AFFINITY: decoding x2APIC ids.
OMP: Info #217: KMP_AFFINITY: cpuid leaf 11 not supported.
OMP: Info #217: KMP_AFFINITY: decoding legacy APIC ids.
OMP: Info #157: KMP_AFFINITY: 1 available OS procs
OMP: Info #158: KMP_AFFINITY: Uniform topology
OMP: Info #288: KMP_AFFINITY: topology layer "LL cache" is equivalent to "socket".
OMP: Info #192: KMP_AFFINITY: 1 socket x 1 core/socket x 1 thread/core (1 total cores)
OMP: Info #219: KMP_AFFINITY: OS proc to physical thread map:
OMP: Info #172: KMP_AFFINITY: OS proc 0 maps to socket 0 core 0 thread 0 
OMP: Info #255: KMP_AFFINITY: pid 3834 tid 3834 thread 0 bound to OS proc set 0


2023-08-13 18:49:53.827808: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enabl