This notebook runs the code that was used to check for supernatants that appear to be unusual given their phylum.

In [None]:
# Normal utilities
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# For building trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 

# Metabolomics data. All abundance tables come from the same workbook and
# are loaded as log2(1 + value).
base_fp = 'supplemental_table_7.xlsx'


def _read_log2_sheet(sheet_name):
    """Return log2(1 + x) of one sheet of the metabolomics workbook."""
    raw_table = pd.read_excel(io=base_fp, sheet_name=sheet_name, index_col=0)
    return np.log2(1 + raw_table)


log2_istdtic = _read_log2_sheet('count.ps')

fc = _read_log2_sheet('foldchange')
# NOTE(review): the sheet name suggests these values are already log2
# transformed, yet log2(1 + x) is applied again -- confirm this is intended.
fc_fc = _read_log2_sheet('foldchange.fa.ps_log2')

# Seaborn tends to treat numeric columns as continuous rather than as
# discrete colour categories, so store the experiment id as a string label.
j_agg_md = pd.read_excel(io=base_fp, sheet_name='aggregated_md', index_col=0)
j_agg_md['str_exp'] = ['exp_' + str(int(exp_id))
                       for exp_id in j_agg_md['experiment']]

# Master sample database
master_sample_database_fp = 'supplemental_table_5.xlsx'
md = pd.read_excel(io=master_sample_database_fp, index_col=0, sheet_name='mf')

# Compound library: once indexed by its first column, once with the default
# integer index.
cpd_lib_fp = 'Supplementary_Table_1_mz-rt_library.xlsx'
ci = pd.read_excel(cpd_lib_fp, sheet_name='chemical_info', index_col=0)
chemical_info = pd.read_excel(cpd_lib_fp, sheet_name='chemical_info')

# Index labels of every library entry whose compound name contains 'IS_'.
istds = ci.index[np.array(['IS_' in compound for compound in ci['Compound']],
                          dtype=bool)].values

# setup: boolean mask over the aggregated metadata selecting supernatant
# samples whose media is 'mm'.
mega_media_samples = ((j_agg_md['sample_type'] == 'supernatant') &
                      (j_agg_md['media'] == 'mm'))

# Metadata restricted to the selected samples.
ss_md = j_agg_md.loc[mega_media_samples, :]


# Alter the datasets so that istds and non-mm samples are eliminated.
# np.isin is used because np.in1d is deprecated as of NumPy 2.0.
data_log2_istdtic = log2_istdtic.loc[mega_media_samples,
                                     ~np.isin(log2_istdtic.columns, istds)]
data_fc = fc.loc[mega_media_samples, ~np.isin(fc.columns, istds)]
data_fc_fc = fc_fc.loc[mega_media_samples, ~np.isin(fc_fc.columns, istds)]


# Sanity check: every filtered table must list exactly the same samples, in
# the same order, as the filtered metadata. Index.equals also copes with a
# length mismatch, which an element-wise == would turn into a ValueError.
assert data_log2_istdtic.index.equals(ss_md.index)
assert data_fc.index.equals(ss_md.index)
assert data_fc_fc.index.equals(ss_md.index)

### Ordering variables we don't want to change
PHYLUM_ORDER = ['Actinobacteria', 'Bacteroidetes', 'Firmicutes',
                'Fusobacteria', 'Proteobacteria', 'Other']

# One dataset per experiment: the three multi-class experiments each use a
# different table, followed by one group-vs-'Other' experiment per named
# phylum, all of which use the log2 table. Deriving the repeat count from
# PHYLUM_ORDER keeps DATASETS and DATASET_LABELS the same length.
DATASETS = ([data_log2_istdtic, data_fc, data_fc_fc] +
            [data_log2_istdtic] * (len(PHYLUM_ORDER) - 1))

# Make the dataset labels. The first 3 experiments use the phylum as is; the
# last 5 experiments are group vs 'Other'.
DATASET_LABELS = [ss_md['phylum'].copy(), ss_md['phylum'].copy(),
                  ss_md['phylum'].copy()]
for label in PHYLUM_ORDER[:-1]:
    tmp_labels = ss_md['phylum'].copy()
    # Direct comparison replaces np.in1d (deprecated as of NumPy 2.0);
    # missing phyla compare unequal and therefore also become 'Other'.
    tmp_labels[tmp_labels != label] = 'Other'
    DATASET_LABELS.append(tmp_labels)

# Sample id -> row position in ss_md.
SAMPLE_TO_ROW_INDEXER = {sample_idx: row_idx for row_idx, sample_idx in
                         enumerate(ss_md.index)}

# Metabolite name -> column position in the filtered log2 table.
METABOLITE_ORDER = data_log2_istdtic.columns.copy()
N_METABOLITES = METABOLITE_ORDER.shape[0]
METABOLITE_TO_COL_INDEXER = {metabolite: col_idx for col_idx, metabolite in
                             enumerate(METABOLITE_ORDER)}

# RF parameters
FORESTS = 50            # forests fitted per trial
TREES_PER_FOREST = 50   # n_estimators of each forest
MAX_DEPTH = 7
MAX_FEATURES = 50       # upper bound on features considered per split

# Experiment / trial bookkeeping. Trial names are zero-padded to two digits
# ('trial00' ... 'trial79').
N_EXPERIMENTS = 8
N_TRIALS = 80
TRIALS = ['trial%02d' % trial_number for trial_number in range(N_TRIALS)]


# Create the collated accuracies data: selected metadata columns, followed
# by one zero-filled column per trial and a final zero-filled 'accuracy'
# column.
_cols = ['str_exp', 'media', 'c18positive', 'c18negative', 'hilicpositive',
         'culture_source', 'taxonomy', 'phylum']

collated_accuracies = ss_md[_cols].copy()

for trial in TRIALS:
    collated_accuracies[trial] = 0.0

collated_accuracies['accuracy'] = 0.0

# Per trial accuracies, keyed by trial name.
full_trial_results = {}

# Trial parameters DF: one row per trial, filled in as the trials run.
trial_parameters = pd.DataFrame(0.0, index=TRIALS,
                                columns=['dataset', 'nfeatures'])

# Trial feature importances: one row per trial, one column per metabolite.
trial_feature_importances = pd.DataFrame(0.0, index=TRIALS,
                                         columns=METABOLITE_ORDER)

# Run every experiment (dataset + label set) at ten missing-value thresholds.
# Each (experiment, threshold) pair is one trial; each trial fits FORESTS
# random forests on independent train/test splits and records
#   * the mean feature importance per metabolite,
#   * per sample, the fraction of test-set predictions given to each class,
#   * per sample, the fraction of test-set predictions that were correct.

# Loop invariants: class name -> column of the vote matrix, and the row
# positions used to pick out each sample's true-class column.
col_indexer = {phylum: i for i, phylum in enumerate(PHYLUM_ORDER)}
row_positions = np.arange(ss_md.shape[0])

cur_trial_n = 0
for exp_counter, (dataset, labels) in enumerate(zip(DATASETS, DATASET_LABELS)):
    # Missing values per metabolite; computed once per experiment.
    nan_counts = dataset.isnull().sum(0)
    non_nan_thresholds = np.percentile(nan_counts, np.linspace(10, 100, 10))

    # Column of the vote matrix holding each sample's true class *for this
    # experiment*. In the group-vs-'Other' experiments the classifier can
    # only answer with the group or 'Other', so accuracy has to be scored
    # against those labels rather than against the raw phylum (which would
    # give every out-of-group sample an accuracy of 0).
    true_cols = [col_indexer[true_label] for true_label in labels]

    for iter_counter, th in enumerate(non_nan_thresholds):
        print('exp: %s\ntrial: %s\n' % (exp_counter, iter_counter))
        cur_trial = 'trial%s' % str(cur_trial_n).zfill(2)
        cur_trial_n += 1

        # Keep metabolites with fewer missing values than the threshold and
        # treat the remaining missing values as 0.
        data_rf = dataset.loc[:, nan_counts < round(th)].fillna(0)

        # Data for the trial parameters
        trial_parameters.loc[cur_trial] = [exp_counter, data_rf.shape[1]]

        # Data for feature importances aggregated across the forests in this
        # trial. Metabolites filtered out of this trial keep importance 0.
        _feature_importances = np.zeros((FORESTS, N_METABOLITES))
        # The retained columns are the same for every forest of the trial.
        feature_cols = [METABOLITE_TO_COL_INDEXER[i] for i in data_rf.columns]

        _trial_results = []
        for forest in range(FORESTS):
            print('forest: %s' % forest)
            train_data, test_data, train_labels, test_labels = \
                train_test_split(data_rf, labels, test_size=0.33)

            # Consider at most MAX_FEATURES features per split, capped at the
            # number of features available in this trial.
            _max_features = min(MAX_FEATURES, train_data.shape[1])
            rf = RandomForestClassifier(n_estimators=TREES_PER_FOREST,
                                        max_depth=MAX_DEPTH,
                                        bootstrap=False,
                                        max_features=_max_features)

            rf.fit(train_data, train_labels)

            predictions = rf.predict(test_data)

            # Record feature importance data.
            _feature_importances[forest, feature_cols] = \
                rf.feature_importances_

            _trial_results.append((test_data.index, predictions))

        # Calculate aggregate feature importance scores.
        trial_feature_importances.loc[cur_trial] = _feature_importances.mean(0)

        # Aggregate trial accuracies: count, per sample, how often each class
        # was predicted while the sample was in a test split.
        tmp = np.zeros((ss_md.shape[0], len(PHYLUM_ORDER)), dtype=np.float32)

        for idxs, guesses in _trial_results:
            for i, j in zip(idxs, guesses):
                tmp[SAMPLE_TO_ROW_INDEXER[i], col_indexer[j]] += 1

        # Normalise the counts to fractions. A sample that never landed in a
        # test split has a row sum of 0 and therefore ends up as NaN.
        trial_result = pd.DataFrame(tmp / tmp.sum(1, keepdims=True),
                                    index=ss_md.index,
                                    columns=PHYLUM_ORDER)
        full_trial_results[cur_trial] = trial_result

        # Accuracy = fraction of predictions that hit the sample's true class.
        tmp_accuracy = trial_result.values[row_positions,
                                           true_cols].astype(np.float64)

        collated_accuracies[cur_trial] = tmp_accuracy