This notebook contains the code used for the Figure 3 classifiers. The first several cells produce Figure 3a; the last cell produces Figure 3b.

In [None]:
# Normal utilities
import numpy as np
import pandas as pd

# For building trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import scipy.cluster.hierarchy as shc

def name_detection_id(d_id, chemical_info, condense=True):
    '''Determine the name of a detection according to hardcoded rules.

    Every row of ``chemical_info`` whose ``dname`` equals ``d_id`` contributes
    one name: ``<Compound>_<Peak>`` when ``Peak`` is not null, otherwise just
    ``<Compound>``. The names are sorted and joined with commas.

    Parameters
    ----------
    d_id : str
        The detection id.
    chemical_info : pd.DataFrame
        The chemical information dataframe. Must have the columns ``dname``,
        ``Compound`` and ``Peak``.
    condense : bool
        If True (default) then reports only unique compound id's in final
        string. Helpful when the library lists several co-eluting
        stereoisomers.

    Returns
    -------
    str
        Comma-separated, sorted names; the empty string when no row matches
        ``d_id``.
    '''
    matches = chemical_info.loc[chemical_info['dname'] == d_id,
                                ['Compound', 'Peak']]

    # Walk the two columns by label instead of indexing each row Series with
    # integers: integer keys on a string-labelled Series are a deprecated
    # positional lookup in pandas.
    names = [
        compound + '_' + peak if pd.notnull(peak) else compound
        for compound, peak in zip(matches['Compound'], matches['Peak'])
    ]

    if condense:
        # Collapse identical names (e.g. repeated library entries).
        names = set(names)

    return ','.join(sorted(names))

In [None]:
# Metabolomics data
# Combined sample database (sheet 'mf').
md_fp = '../supplemental_table_5.xlsx'
md = pd.read_excel(md_fp, index_col=0, sheet_name='mf')

# Chemical info. The same sheet is read twice: `ci` uses the sheet's first
# column as its index, while `chemical_info` keeps every column as a regular
# column, which is what name_detection_id needs (it filters on 'dname').
cpd_lib_fp = '../Supplementary_Table_1_mz-rt_library.xlsx'
ci = pd.read_excel(cpd_lib_fp, sheet_name='chemical_info', index_col=0)
chemical_info = pd.read_excel(cpd_lib_fp, sheet_name='chemical_info')

# Index labels of the library entries whose compound name contains 'IS_'
# (the ISTDs removed from the data further down). `na=False` treats a missing
# compound name as "not an ISTD" instead of raising.
is_istd = ci['Compound'].str.contains('IS_', regex=False, na=False)
istds = ci.index[is_istd.to_numpy()].values

# Detection id -> human readable name, for every id in the library.
name_translator = {dname: name_detection_id(dname, chemical_info)
                   for dname in chemical_info['dname'].unique()}

# Taxonomic info
sac_fp = '../supplemental_table_6.xlsx'
taxonomies = pd.read_excel(sac_fp, index_col=0, sheet_name='full_taxonomy')

# Aggregated metadata. Bound to `agg_md` only: the previous chained
# assignment (`agg_md = md = ...`) silently replaced the sample database
# loaded into `md` above.
agg_md = pd.read_excel('../supplemental_table_7.xlsx',
                       sheet_name='aggregated_md', index_col=0)

In [None]:
# Aggregated data
base_fp = '../supplemental_table_7.xlsx'

# Tunables for the Figure 3a experiment.
MAX_MISSING_SAMPLES = 300  # keep features with fewer NaN samples than this
N_FORESTS = 25             # independent train/test repeats per level
TEST_FRACTION = 0.33       # share of samples held out for testing
FIG3A_SEED = 0             # base seed; repeat k uses FIG3A_SEED + k

# metabolomics data
raw_data = pd.read_excel(io=base_fp,
                         sheet_name='foldchange.dmrvf.fa.ps',
                         index_col=0)

# setup
mega_media_samples = ((agg_md['sample_type'] == 'supernatant') &
                      (agg_md['media'] == 'mm'))

# Explicit copy: a column is added below, and writing into a bare slice of
# agg_md triggers pandas' SettingWithCopyWarning.
ss_md = agg_md.loc[mega_media_samples, :].copy()

# '<genus> <species>', falling back to '<genus> <strain>' when the species is
# missing.
ss_md['_species'] = [
    '%s %s' % (genus, strain if pd.isnull(species) else species)
    for genus, species, strain in zip(ss_md['genus'], ss_md['species'],
                                      ss_md['strain'])
]

# Metadata columns used as classification targets, one experiment per level.
levels = ['phylum', 'class', 'order', 'family', 'genus', '_species',
          'taxonomy']

# Take MM samples only. Remove ISTDs.
data = raw_data.loc[mega_media_samples, ~raw_data.columns.isin(istds)]
# We will remove data that is mostly nan - probably not good classification.
# Fill the rest with 0's. Also create a copy of data for plotting. Don't remove
# nan's here.
data_rf = data.loc[:, data.isnull().sum(0) < MAX_MISSING_SAMPLES].fillna(0)
data_plots = data.loc[:, data_rf.columns]

# One row per (level, repeat): [level, forest_num, n_correct, n_incorrect].
results = []
for level in levels:
    # The label we are classifying.
    labels = ss_md[level].copy()

    for forest_num in range(N_FORESTS):
        # A distinct but fixed seed per repeat keeps the repeats independent
        # while making the whole experiment reproducible on re-run.
        seed = FIG3A_SEED + forest_num

        train_data, test_data, train_labels, test_labels = \
            train_test_split(data_rf, labels, test_size=TEST_FRACTION,
                             random_state=seed)

        rf = RandomForestClassifier(n_estimators=50, max_depth=5,
                                    bootstrap=False, max_features=100,
                                    random_state=seed)
        rf.fit(train_data, train_labels)

        predictions = rf.predict(test_data)

        correct = (predictions == test_labels).sum()
        incorrect = len(predictions) - correct
        results.append([level, forest_num, correct, incorrect])
        print(level, forest_num)


This cell runs a single example classification of the kind shown in Figure 3b.

In [None]:
# Our target is phylum level classification as binary between Bacteroidetes and Other
FIG3B_SEED = 0             # fixed seed so the split and forest are reproducible
MAX_MISSING_SAMPLES = 300  # keep features with fewer NaN samples than this
N_TOP_FEATURES = 10        # number of most important features kept for hm_data

good_labels = ['Bacteroidetes']
# Every phylum that is not in good_labels is relabelled 'Other'.
labels = ss_md['phylum'].where(ss_md['phylum'].isin(good_labels), 'Other')

# Take MM samples only. Remove ISTDs.
data = raw_data.loc[mega_media_samples, ~raw_data.columns.isin(istds)]

# We will remove data that is mostly nan - probably not good classification.
# Fill the rest with 0's. Also create a copy of data for plotting. Don't remove
# nan's here.
data_rf = data.loc[:, data.isnull().sum(0) < MAX_MISSING_SAMPLES].fillna(0)
data_plots = data.loc[:, data_rf.columns]

train_data, test_data, train_labels, test_labels = \
    train_test_split(data_rf, labels, test_size=0.33,
                     random_state=FIG3B_SEED)

rf = RandomForestClassifier(n_estimators=1000, max_depth=5, bootstrap=False,
                            max_features=50, random_state=FIG3B_SEED)
rf.fit(train_data, train_labels)

predictions = rf.predict(test_data)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(test_labels, predictions, labels=rf.classes_)
cm = pd.DataFrame(cm, index=rf.classes_, columns=rf.classes_)

# These are the misses: metadata of the test samples that were misclassified.
is_miss = predictions != test_labels.to_numpy()
misses = ss_md.loc[test_data.index[is_miss]]

# Learn the feature importance scores
fi = pd.DataFrame(rf.feature_importances_, index=data_rf.columns,
                  columns=['importance']).sort_values('importance',
                                                      ascending=False)
fi['cpd'] = [name_detection_id(i, chemical_info) for i in fi.index]

# m1: mean over classes of (correct / true count), i.e. class-balanced accuracy.
# m2: overall fraction of test samples predicted correctly.
m1 = (np.diag(cm) / cm.sum(1)).mean()
m2 = np.diag(cm).sum() / cm.sum(1).sum()

# Un-filled values of the most important features, plus each sample's phylum.
# Explicit copy: a column is added next, and writing into a bare slice of
# data_plots triggers pandas' SettingWithCopyWarning.
tmp1 = data_plots.loc[:, fi.index[:N_TOP_FEATURES]].copy()
tmp1['phylum'] = agg_md.loc[tmp1.index, 'phylum']

# Per-phylum median of each selected feature.
# NOTE(review): data_plots still contains NaNs; if a phylum has no value at
# all for a feature its median is NaN, which linkage presumably rejects -
# confirm against the real data.
tmp2 = tmp1.groupby('phylum').median()

# Only the leaf order of each clustering is needed to arrange hm_data, so the
# dendrograms are computed without drawing (previously both were drawn on top
# of each other on the current axes).
z = shc.linkage(tmp2, method='ward')
d = shc.dendrogram(z, no_plot=True)
z2 = shc.linkage(tmp2.T, method='ward')
d2 = shc.dendrogram(z2, no_plot=True)
hm_data = tmp2.iloc[d['leaves'], d2['leaves']]