In [1]:
# Run order - 2
# Needed input files: '_40_coefs.csv', 'data_discovery.csv'
# Generated output files: 'supplementary_table_1.csv'

In [2]:
# Load libraries
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import numpy as np
import pandas as pd
import warnings
from statsmodels.stats import multitest as multi
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
# Inputs for this notebook:
#   _40_coefs.csv       - the 40 metabolites identified by LASSO; the first CSV column is used as the index
#   data_discovery.csv  - discovery metabolomics dataframe with covariates included, indexed by public_client_id
LASSO_coef = pd.read_csv(filepath_or_buffer="_40_coefs.csv", index_col=0)
data_clean = pd.read_csv(filepath_or_buffer="data_discovery.csv", index_col='public_client_id')
# Show (rows, columns); data_clean has BMI, sex, age, and shannon columns added
data_clean.shape

(399, 665)

In [4]:
# Standardize the metabolite columns (zero mean, unit variance), then re-attach the
# unscaled covariates so regressions can use both from a single frame.
# NOTE(review): assumes the first 659 columns of data_clean are exactly the
# metabolite measurements - confirm against the CSV layout.
N_METABOLITE_COLUMNS = 659
metabolite_values = data_clean.iloc[:, 0:N_METABOLITE_COLUMNS]
Xcolumns = metabolite_values.columns
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
X = pd.DataFrame(
    data=scaler.fit_transform(metabolite_values),
    columns=Xcolumns,
    index=data_clean.index,
)
# Covariates and the outcome are copied over as-is (not scaled)
for covariate in ['BMI', 'Age', 'shannon', 'sex']:
    X[covariate] = data_clean[covariate]

In [5]:
#Assess relationship of each metabolite with shannon independently (no covariates)
# NOTE: mets_cov_df is a second name for X, not a copy, so the scratch 'response'
# column written inside the loop is also visible on X.
mets_cov_df = X
# The metabolites retained by LASSO (40) are the row labels of the coefficient table
mets = LASSO_coef.index.tolist()

# One simple OLS per metabolite: shannon ~ metabolite
univariate_rows = []
for met in mets:
    # The formula uses a fixed column name, so expose the current metabolite as 'response'
    mets_cov_df['response'] = mets_cov_df[met]
    fit = smf.ols('shannon ~ response', data=mets_cov_df).fit()
    univariate_rows.append({
        'p_value': fit.pvalues['response'],
        # R-squared is stored as a percentage
        'r_squared': fit.rsquared * 100,
    })

# Shortened labels: keep the text before the first ':', then the part after the
# last '.', and replace underscores with spaces
display_names = [
    met.split(":")[0].split(".")[-1].replace('_', ' ')
    for met in mets
]
mets_independent = pd.DataFrame(
    univariate_rows,
    index=display_names,
    columns=['p_value', 'r_squared'],
)
# Rank metabolites from most to least variance explained
mets_independent = mets_independent.sort_values(by='r_squared', ascending=False)

#confirm no. of metabolites is 40
print (len(mets_independent))
#check how many mets explain over 10% of variance in Shannon
mets_10rsq = int((mets_independent['r_squared'] > 10).sum())
print ('no. of mets with >10% variance explained:', mets_10rsq )

40
no. of mets with >10% variance explained: 9


In [6]:
#Assess relationship of each metabolite with Shannon adjusting for covariates
# For each of the 40 LASSO-retained metabolites, regress shannon on the scaled
# metabolite adjusting for BMI, age, and sex, and record the metabolite's p-value
# and beta coefficient.
#
# All statistics for one analyte are collected in the same loop iteration so every
# row of the output table is guaranteed to describe a single metabolite. The
# unadjusted R-squared (percent, from shannon ~ metabolite) is therefore fitted here
# rather than copied from mets_independent: that frame is sorted by R-squared and
# indexed by shortened names, so copying its column next to per-analyte lists in
# LASSO order pairs each analyte with another analyte's R-squared.
adjusted_rows = []
for analyte_name in mets:
    # Small per-metabolite frame; avoids writing a scratch 'response' column onto X
    model_df = X[['shannon', 'BMI', 'Age', 'sex']].assign(response=X[analyte_name])
    unadjusted_fit = smf.ols('shannon ~ response', data=model_df).fit()
    adjusted_fit = smf.ols('shannon ~ BMI+Age+sex+response', data=model_df).fit()
    adjusted_rows.append({
        'analyte': analyte_name,
        'r_squared': unadjusted_fit.rsquared * 100,
        # Select the metabolite term by name, not by position: its position in the
        # parameter vector shifts if 'sex' expands to more than one design column
        'pvalue': adjusted_fit.pvalues['response'],
        'covariate_adjusted_Beta_coeff': adjusted_fit.params['response'],
    })

results_ols = pd.DataFrame(
    adjusted_rows,
    columns=['analyte', 'r_squared', 'pvalue', 'covariate_adjusted_Beta_coeff'],
)
#multiple hypothesis correction (Benjamini-Hochberg FDR across the tested metabolites)
results_ols['corr_pval'] = multi.multipletests(
    results_ols['pvalue'], alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False
)[1]
#confirm no. of metabolites tested
print (len(results_ols))

# Index by analyte and fix the column order used in the exported table
results_ols = results_ols.set_index('analyte')[
    ['r_squared', 'pvalue', 'corr_pval', 'covariate_adjusted_Beta_coeff']
]
# LASSO summary columns are aligned on the analyte index
results_ols['LASSO_coefficient'] = LASSO_coef['mean']
results_ols['No_of_zero_coefficients'] = LASSO_coef['zeroes']
results_ols = results_ols.sort_values(by='LASSO_coefficient', ascending=False)

#no. of metabolites significantly associated with shannon after adjusting for covariates and multiple hypothesis testing
significant = results_ols[results_ols['corr_pval'] < 0.05].index.tolist()
print ('no. of mets significantly associated with shannon after adjusting for covariates (FDR<0.05)',len(significant))

40
no. of mets significantly associated with shannon after adjusting for covariates (FDR<0.05) 35


In [7]:
# Write the per-metabolite results table (index included) to the supplementary CSV
results_ols.to_csv(path_or_buf='supplementary_table_1.csv')