In [None]:
# Load libraries
from statsmodels.stats import multitest as multi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
%matplotlib inline
from sklearn.preprocessing import StandardScaler
import scipy.stats
from scipy import interp
from itertools import cycle
import statsmodels.api as sm

In [None]:
#read the participant table (enterotype assignments, alpha-diversity calculations
#and all additional metadata) and index it by participant id
df=pd.read_csv('statins_df.csv').set_index('public_client_id')

In [None]:
#Analysis for Figure 1C
#statin users whose therapy intensity (dosage) is missing are excluded from the
#dose analyses; q holds their participant ids
q=df.index[(df['statins_binary']==1) & df['dosage'].isnull()].tolist()
print(len(q))
#drop them in a single call on an independent copy, so later column assignments
#on plot_df can never write through to df
plot_df=df.drop(index=q).copy()
#every remaining participant without a dosage is labelled 'none'
plot_df['dosage']=plot_df['dosage'].fillna('none')
#check number of participants
plot_df.groupby(by='statins_binary').size()

In [None]:
#generate ordinal variable for dosage (0-3)
dose_codes={'none':0,'low':1,'moderate':2,'high':3}
#labels outside the four known levels are skipped (no fallback code), so an
#unexpected label makes the assignment below fail on a length mismatch
dose_num=[dose_codes[level] for level in plot_df['dosage'] if level in dose_codes]
plot_df['dose_num']=dose_num

In [None]:
#Association between statin dosage and HMG (Figure 1E)
#Gamma GLM with a log link: HMG on dose_num adjusted for BMI, age, age_sq and sex
formula='HMG~BMI+age+age_sq+sex+dose_num'
gamma_log=sm.families.Gamma(link=sm.families.links.log())
model=sm.GLM.from_formula(formula, data=plot_df, family=gamma_log).fit()
#the term at position 5 of the fitted parameters is the one reported
ci=model.conf_int()
print('Beta(95%CI)',model.params.iloc[5],"(",ci.iloc[5:6,0].tolist(),ci.iloc[5:6,1].tolist(),")")
print('P-value=',model.pvalues.iloc[5])

In [None]:
#Association between statin dosage and HMG (Figure 1E)
#adjusted HMG: Pearson residuals of the covariate-only Gamma GLM, shifted by the raw HMG mean
formula='HMG~BMI+age+age_sq+sex'
covariate_fit=sm.GLM.from_formula(formula, data=plot_df, family=sm.families.Gamma(link=sm.families.links.log())).fit()
plot_df['HMG_corr']=covariate_fit.resid_pearson+plot_df['HMG'].mean()
#Figure 1E: one box per intensity level plus a linear trend line over dose_num
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[8,10], dpi=100)
dose_order=['none','low','moderate','high']
my_pal = {"none": "lightskyblue", "low": "lightcoral", "moderate":"r","high":"darkred"}
ax=sns.boxplot(x=plot_df['dosage'],y=plot_df['HMG_corr'],palette=my_pal,order=dose_order)
ax = sns.regplot(x=plot_df['dose_num'],y=plot_df['HMG_corr'],scatter=False,color='darkred',line_kws={'linewidth':2,'alpha':0.75})
plt.show()
#number of samples with an adjusted HMG value in each dosage group
plot_df[plot_df['HMG_corr'].notnull()].groupby(by='dosage').size()

In [None]:
#Association between statin dosage and LDL
#NOTE(review): this header originally said Figure 1E while the next cell labels the LDL variable Fig.1D - confirm the panel
model=smf.ols('LDL~BMI+age+age_sq+sex+chem_vendor+dose_num',data=plot_df).fit()
#the term at position 6 of the fitted parameters is the one reported
ci=model.conf_int()
print('Beta(95%CI)',model.params.iloc[6],"(",ci.iloc[6:7,0].tolist(),ci.iloc[6:7,1].tolist(),")")
print('P-value=',model.pvalues.iloc[6])

In [None]:
#generate adj. LDL variable for Fig.1D
#OLS residuals of the covariate-only model, shifted by the raw LDL mean
ldl_fit=smf.ols('LDL~BMI+age+age_sq+sex+chem_vendor',data=plot_df).fit()
plot_df['LDL_corr']=ldl_fit.resid+plot_df['LDL'].mean()
#plot association: one box per intensity level plus a linear trend line over dose_num
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[8,10], dpi=100)
dose_order=['none','low','moderate','high']
my_pal = {"none": "lightskyblue", "low": "lightcoral", "moderate":"r","high":"darkred"}
ax=sns.boxplot(x=plot_df['dosage'],y=plot_df['LDL_corr'],palette=my_pal,order=dose_order)
ax = sns.regplot(x=plot_df['dose_num'],y=plot_df['LDL_corr'],scatter=False,color='darkred',line_kws={'linewidth':2,'alpha':0.75})
plt.show()
#number of samples with an adjusted LDL value per group
plot_df[plot_df['LDL_corr'].notnull()].groupby(by='dosage').size()

In [None]:
#Figure generation and statistics for Figure 2B
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[6,8], dpi=300)
#covariates shared by the association model and by the adjusted variable that is plotted
shannon_covariates='Shannon~vendor_microbiome+BMI+age+age_sq+sex'
reg=smf.ols(shannon_covariates+'+statins_binary',data=df).fit()
#the term at position 6 of the fitted parameters is the one reported
ci=reg.conf_int()
print('statin association with Shannon')
print('p-value',reg.pvalues.iloc[6])
print('Beta-Coef',reg.params.iloc[6])
print('95%CI',ci.iloc[6:7,0].tolist(),ci.iloc[6:7,1].tolist())
#generate adjusted Shannon variable from the same covariates as the model above.
#The previous version additionally adjusted for LDL here, which neither the reported
#model nor the Observed-ASV counterpart does, and which drops participants without LDL.
df['Shannon_corr']=smf.ols(shannon_covariates,data=df).fit().resid+df['Shannon'].mean()
my_pal = {0: "lightskyblue", 1:"r"}
#x/y are passed by keyword: recent seaborn releases reject them as positional arguments
ax=sns.boxplot(x=df['statins_binary'],y=df['Shannon_corr'],palette=my_pal,order=[0,1])
plt.show()

In [None]:
#Figure generation and statistics for Figure 2B continued...
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[6,8], dpi=300)
#statin association with the number of observed ASVs; the term at position 6 is reported
reg=smf.ols('Observed~vendor_microbiome+BMI+age+age_sq+sex+statins_binary',data=df).fit()
ci=reg.conf_int()
print(reg.pvalues.iloc[6])
print(reg.params.iloc[6])
print('95%CI',ci.iloc[6:7,0].tolist(),ci.iloc[6:7,1].tolist())
#adjusted Observed: residuals of the covariate-only model, shifted by the raw mean
observed_fit=smf.ols('Observed~vendor_microbiome+BMI+age+age_sq+sex',data=df).fit()
df['Observed_corr']=observed_fit.resid+df['Observed'].mean()
my_pal = {0: "lightskyblue", 1:"r"}
ax=sns.boxplot(x=df['statins_binary'],y=df['Observed_corr'],palette=my_pal,order=[0,1])
plt.show()

In [None]:
#Figure generation and statistics for Figure 2C
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[8,10], dpi=100)
#Shannon diversity by statin intensity, with 'none' as the reference level
reg_dose=smf.ols('Shannon~vendor_microbiome+BMI+age+sex+age_sq+C(dosage,Treatment(reference="none"))',data=plot_df).fit()
#the term at position 3 of the fitted parameters is the one reported
print(reg_dose.params.iloc[3])
print(reg_dose.conf_int().iloc[3:4])
print(reg_dose.pvalues.iloc[3])
#adjusted Shannon is fitted and centred on plot_df, the frame that is plotted and
#modelled in this cell. It was previously fitted on df, which still contains statin
#users with unknown dosage and so disagreed with the Observed-ASV version of this figure.
plot_df['Shannon_corr']=smf.ols('Shannon~vendor_microbiome+BMI+age+age_sq+sex',data=plot_df).fit().resid+plot_df['Shannon'].mean()
my_pal = {"none": "lightskyblue", "low": "lightcoral", "moderate":"r","high":"darkred"}
#x/y are passed by keyword: recent seaborn releases reject them as positional arguments
ax=sns.boxplot(x=plot_df['dosage'],y=plot_df['Shannon_corr'],palette=my_pal,order=['none','low','moderate','high'])
plt.show()

In [None]:
#Observed ASVs by statin intensity, with 'none' as the reference level
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[8,10], dpi=100)
reg_dose=smf.ols('Observed~vendor_microbiome+BMI+age+sex+age_sq+C(dosage,Treatment(reference="none"))',data=plot_df).fit()
#the term at position 3 of the fitted parameters is the one reported
print(reg_dose.params.iloc[3])
print(reg_dose.conf_int().iloc[3:4])
print(reg_dose.pvalues.iloc[3])
#adjusted Observed: residuals of the covariate-only model, shifted by the raw mean
observed_fit=smf.ols('Observed~vendor_microbiome+BMI+age+sex+age_sq',data=plot_df).fit()
plot_df['Observed_corr']=observed_fit.resid+plot_df['Observed'].mean()
dose_order=['none','low','moderate','high']
my_pal = {"none": "lightskyblue", "low": "lightcoral", "moderate":"r","high":"darkred"}
ax=sns.boxplot(x=plot_df['dosage'],y=plot_df['Observed_corr'],palette=my_pal,order=dose_order)
plt.show()
#number of participants with a Shannon value in each dosage group
plot_df[plot_df['Shannon'].notnull()].groupby(by='dosage').size()

In [None]:
#Shannon*statin interaction predicting HMG from Figure 2D
print('Shannon*statin interaction predicting HMG')
formula='HMG~vendor_microbiome+BMI+age+age_sq+sex+Shannon*statins_binary'
gamma_log=sm.families.Gamma(link=sm.families.links.log())
model=sm.GLM.from_formula(formula, data=df, family=gamma_log).fit()
#the term at position 8 of the fitted parameters is the one reported
print('Beta-Coef',model.params.iloc[8])
print('p-value',model.pvalues.iloc[8])
print(model.conf_int().iloc[8:9])

In [None]:
#Observed ASVs*statin interaction predicting HMG from Figure 2D
print('Observed*statin interaction predicting HMG')
formula='HMG~vendor_microbiome+BMI+age+age_sq+sex+Observed*statins_binary'
gamma_log=sm.families.Gamma(link=sm.families.links.log())
model=sm.GLM.from_formula(formula, data=df, family=gamma_log).fit()
#the term at position 8 of the fitted parameters is the one reported
print('Beta-Coef',model.params.iloc[8])
print('p-value',model.pvalues.iloc[8])
print(model.conf_int().iloc[8:9])

In [None]:
#plotting Shannon-statin interactions Figure 2D
#three equal-frequency bins (tertiles) of Shannon diversity
df['Shannon_quant']=pd.qcut(df['Shannon'],3)
#adjusted HMG: Pearson residuals of the covariate-only Gamma GLM, shifted by the raw HMG mean
formula='HMG~vendor_microbiome+BMI+age+age_sq+sex'
covariate_fit=sm.GLM.from_formula(formula, data=df, family=sm.families.Gamma(link=sm.families.links.log())).fit()
df['HMG_corr']=covariate_fit.resid_pearson+df['HMG'].mean()
#boxplots of adjusted HMG per tertile, split by statin use
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[7,10], dpi=300)
my_pal = {0: "lightskyblue", 1: "r"}
ax=sns.boxplot(x=df['Shannon_quant'],y=df['HMG_corr'],hue=df['statins_binary'],palette=my_pal)
ax.legend_.remove()
#check number of samples in each group
df.groupby(['Shannon_quant','statins_binary']).size()

In [None]:
#plotting Observed ASVs-statin interactions Figure 2D
#three equal-frequency bins (tertiles) of observed ASVs
df['Obs_quant']=pd.qcut(df['Observed'],3)
#adjusted HMG: Pearson residuals of the covariate-only Gamma GLM, shifted by the raw HMG mean
formula='HMG~vendor_microbiome+BMI+age+age_sq+sex'
covariate_fit=sm.GLM.from_formula(formula, data=df, family=sm.families.Gamma(link=sm.families.links.log())).fit()
df['HMG_corr']=covariate_fit.resid_pearson+df['HMG'].mean()
#boxplots of adjusted HMG per tertile, split by statin use
sns.set(font_scale=1.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[7,10], dpi=300)
my_pal = {0: "lightskyblue", 1: "r"}
ax=sns.boxplot(x=df['Obs_quant'],y=df['HMG_corr'],hue=df['statins_binary'],palette=my_pal)
ax.legend_.remove()
#check no. of individuals per group
df.groupby(['Obs_quant','statins_binary']).size()

In [None]:
#statistics and visualization for Figure 2E
#adjusted HMG and LDL in plot_df, both additionally adjusted for dose_num
formula='HMG~vendor_microbiome+BMI+age+age_sq+sex+dose_num'
hmg_fit=sm.GLM.from_formula(formula, data=plot_df, family=sm.families.Gamma(link=sm.families.links.log())).fit()
plot_df['HMG_corr']=hmg_fit.resid_pearson+plot_df['HMG'].mean()
ldl_fit=smf.ols('LDL~vendor_microbiome+chem_vendor+BMI+age+age_sq+sex+dose_num',data=plot_df).fit()
plot_df['LDL_corr']=ldl_fit.resid+plot_df['LDL'].mean()
sns.set(font_scale=1.,context='poster',font='Arial',style='white')
plt.figure(figsize=[8,8], dpi=200)
#plot scatter of HMG by Observed ASVs in statin users with known therapy intensity
known_users=plot_df['statins_binary']==1
sns.regplot(x=plot_df['Observed'][known_users],y=plot_df['HMG_corr'][known_users],scatter_kws={'s':125,'alpha': 0.75,'edgecolor':'k'},line_kws={'color':'k'},color='darkred')
#rows with both an Observed value and an adjusted HMG value
d=plot_df[plot_df['Observed'].notnull()]
d=d[d['HMG_corr'].notnull()]
is_user=d['statins_binary']==1
is_nonuser=d['statins_binary']==0
print(d[is_nonuser].shape)
#Spearman correlations of adjusted HMG with each alpha-diversity measure, per group
print('correlation statin users OBSERVED ASVs',scipy.stats.spearmanr(d['Observed'][is_user],d['HMG_corr'][is_user]))
print('correlation non-users OBSERVED ASVs',scipy.stats.spearmanr(d['Observed'][is_nonuser],d['HMG_corr'][is_nonuser]))
print('correlation statin users Shannon',scipy.stats.spearmanr(d['Shannon'][is_user],d['HMG_corr'][is_user]))
print('correlation non-users Shannon',scipy.stats.spearmanr(d['Shannon'][is_nonuser],d['HMG_corr'][is_nonuser]))

In [None]:
#scatter of adjusted HMG by Observed ASVs in non-users, plus the matching Gamma GLM
sns.set(font_scale=1.,context='poster',font='Arial',style='white')
plt.figure(figsize=[8,8], dpi=200)
non_users=plot_df['statins_binary']==0
#x/y are passed by keyword: recent seaborn releases reject them as positional arguments
sns.regplot(x=plot_df['Observed'][non_users],y=plot_df['HMG_corr'][non_users],scatter_kws={'s':125,'alpha': 0.75,'edgecolor':'k'},line_kws={'color':'k'},color='lightskyblue')
formula='HMG~vendor_microbiome+BMI+sex+age+age_sq+Observed'
#fit the model once and reuse it; it used to be refitted for each of the three prints
nonuser_fit=sm.GLM.from_formula(formula, data=plot_df[non_users], family=sm.families.Gamma(link=sm.families.links.log())).fit()
#the term at position 6 of the fitted parameters is the one reported
print('beta-coef',nonuser_fit.params.iloc[6])
print('95%CI',nonuser_fit.conf_int().iloc[6:7])
print('p-value',nonuser_fit.pvalues.iloc[6])

In [None]:
#glm for Figure 1D in statin users
#NOTE(review): the surrounding cells belong to Figure 2E - confirm the figure label above
formula='HMG~vendor_microbiome+BMI+sex+age+age_sq+dose_num+Observed'
statin_users=plot_df[plot_df['statins_binary']==1]
model=sm.GLM.from_formula(formula, data=statin_users, family=sm.families.Gamma(link=sm.families.links.log())).fit()
#the term at position 7 of the fitted parameters is the one reported
print('beta-coef',model.params.iloc[7])
print('95%CI',model.conf_int().iloc[7:8])
print('p-value',model.pvalues.iloc[7])

In [None]:
#same Gamma GLM in statin users, with Shannon as the alpha-diversity term
formula='HMG~vendor_microbiome+BMI+sex+age+age_sq+dose_num+Shannon'
statin_users=plot_df[plot_df['statins_binary']==1]
model=sm.GLM.from_formula(formula, data=statin_users, family=sm.families.Gamma(link=sm.families.links.log())).fit()
#the term at position 7 of the fitted parameters is the one reported
print('beta-coef',model.params.iloc[7])
print('95%CI',model.conf_int().iloc[7:8])
print('p-value',model.pvalues.iloc[7])

In [None]:
#Exploring statin*alpha diversity interactions with LDL cholesterol as the dependent variable
#in both models the term at position 9 of the fitted parameters is the one reported
model_LDL=smf.ols('LDL~chem_vendor+vendor_microbiome+BMI+age+age_sq+sex+Shannon*statins_binary',data=df).fit()
print('Shannon-by-statins interaction p-value',model_LDL.pvalues.iloc[9])
print('Shannon-by-statins interaction Beta',model_LDL.params.iloc[9],'95%CI',model_LDL.conf_int().iloc[9:10])
model_LDL=smf.ols('LDL~chem_vendor+vendor_microbiome+BMI+age+age_sq+sex+Observed*statins_binary',data=df).fit()
print('Observed ASVs-by-statins interaction p-value',model_LDL.pvalues.iloc[9])
print('Observed ASVs-by-statins interaction Beta',model_LDL.params.iloc[9],'95%CI',model_LDL.conf_int().iloc[9:10])


In [None]:
#plotting LDL as a function of Shannon diversity in statin users
#adjusted LDL fitted on everyone in df
df['LDL_corr']=smf.ols('LDL~chem_vendor+vendor_microbiome+BMI+age+age_sq+sex',data=df).fit().resid+df['LDL'].mean()
#adjusted LDL in plot_df (known therapy intensity), additionally adjusted for dose_num
plot_df['LDL_corr']=smf.ols('LDL~chem_vendor+vendor_microbiome+BMI+age+age_sq+sex+dose_num',data=plot_df).fit().resid+plot_df['LDL'].mean()
#participants with both an adjusted LDL value and a Shannon value; qf and d select
#the same rows and are both kept because both names were defined by this cell before
complete=df['LDL_corr'].notnull() & df['Shannon'].notnull()
qf=df[complete]
d=df[complete]
print(qf[qf['statins_binary']==1].shape)
print(d[d['statins_binary']==0].shape)
sns.set(font_scale=1.,context='poster',font='Arial',style='white')
plt.figure(figsize=[8,8], dpi=200)
print('correlation statin users LDL-Shannon',scipy.stats.spearmanr(d['Shannon'][d['statins_binary']==1],d['LDL_corr'][d['statins_binary']==1]))
#this correlation is computed in non-users (statins_binary==0); its label used to say "statin users"
print('correlation non-users LDL-Shannon',scipy.stats.spearmanr(qf['Shannon'][qf['statins_binary']==0],qf['LDL_corr'][qf['statins_binary']==0]))
#x/y are passed by keyword: recent seaborn releases reject them as positional arguments
sns.regplot(x=qf['Shannon'][qf['statins_binary']==1],y=qf['LDL_corr'][qf['statins_binary']==1],scatter_kws={'s':125,'alpha': 0.75,'edgecolor':'k'},line_kws={'color':'k'},color='darkred')
#OLS of LDL on Shannon in statin users; the term at position 7 is the one reported
reg=smf.ols('LDL~chem_vendor+vendor_microbiome+BMI+age+age_sq+sex+Shannon',data=df[df['statins_binary']==1]).fit()
print(reg.conf_int().iloc[7:8])
print(reg.params.iloc[7])
print(reg.pvalues.iloc[7])

In [None]:
#Statistics and Visualization for Figure 3F
#the ANOVA models below are run on natural-log HMG
df['log_HMG']=np.log(df['HMG'])
#adjusted log HMG: OLS residuals of the covariate-only model, shifted by the mean of log HMG
covariate_fit=smf.ols('log_HMG~chem_vendor+vendor_microbiome+sex+age+age_sq+BMI',data=df).fit()
df['HMG_corr']=covariate_fit.resid+df['log_HMG'].mean()
sns.set(font_scale=2.0,context='poster',font='Arial',style='white')
plt.figure(figsize=[10,15], dpi=300)
#generate Figure 3F
ax=sns.boxplot(x=df['entero'],y=df['HMG_corr'],hue=df['statins_binary'],palette=['lightgrey','yellowgreen'])
#recolour the boxes found at positions 1, 3 and 5 of ax.artists
for box_position,face_colour in [(1,'orange'),(3,'#C6E2FF'),(5,'#4682b4')]:
    ax.artists[box_position].set_facecolor(face_colour)
ax.legend_.remove()
#type-3 ANOVA with sum-coded factors, no covariates
rez=smf.ols('log_HMG~C(entero, Sum)*C(statins_binary,Sum)',data=df).fit()
aov_table = sm.stats.anova_lm(rez, typ=3)
print('unadjusted ANOVA',aov_table)
#same model with the covariates added (ANCOVA)
rez=smf.ols('log_HMG~chem_vendor+vendor_microbiome+sex+age+age_sq+BMI+C(entero, Sum)*C(statins_binary,Sum)',data=df).fit()
aov_table = sm.stats.anova_lm(rez, typ=3)

print('Adjusted ANCOVA model')
aov_table

In [None]:
#covariate adjusted percent increase in HMG across enterotypes
#back-transform the adjusted log HMG, then compare group means within each enterotype
df['HMG_corr_exp']=np.exp(df['HMG_corr'])
for y in [1,2,3,4]:
    d=df[df['entero']==y]
    user_mean=d.loc[d['statins_binary']==1,'HMG_corr_exp'].mean()
    nonuser_mean=d.loc[d['statins_binary']==0,'HMG_corr_exp'].mean()
    #the printed value is the ratio of means minus one (it is not multiplied by 100)
    print('enterotype',y,'percent increase relative to non-users',(user_mean/nonuser_mean)-1)

In [None]:
#post-hoc within enterotype comparisons of statin users and non-users
for x in [1,2,3,4]:
    #members of this enterotype that have an adjusted HMG value
    ttest=df[(df['entero']==x) & df['HMG_corr'].notna()]
    nonusers=ttest.loc[ttest['statins_binary']==0,'HMG_corr']
    users=ttest.loc[ttest['statins_binary']==1,'HMG_corr']
    print(x)
    print(scipy.stats.ttest_ind(nonusers,users))
    print(ttest.groupby(by='statins_binary').size())

In [None]:
#generating table 2 in the paper, statin*enterotype interaction effect p-values
#with different measures of glucose homeostasis as the dependent variable
df['log_LDL']=np.log(df['LDL'])

def _interaction_row(aov):
    """Return the row of an ANOVA table that belongs to the interaction term.

    The interaction is the only term whose label contains ':'. Selecting it by
    label instead of by a fixed row number keeps the result correct no matter
    how many covariate rows are listed before it in the table.

    Raises ValueError when the table does not hold exactly one such row.
    """
    rows=aov[aov.index.str.contains(':',regex=False)]
    if len(rows)!=1:
        raise ValueError('expected exactly one interaction term, found %d'%len(rows))
    return rows.iloc[0]

measure=[]
unadjusted_p=[]
unadjusted_f=[]
cov_adjusted_p=[]
cov_adjusted_f=[]
cov_diab_adjusted_f=[]
cov_diab_adjusted_p=[]
for x in ['HOMA_IR','insulin','glucose','hba1c']:
    df['response']=np.log10(df[x])
    measure.append(x)
    #unadjusted type-3 ANOVA
    rez=smf.ols('response~C(entero, Sum)*C(statins_binary,Sum)',data=df).fit()
    aov_table = sm.stats.anova_lm(rez, typ=3)
    interaction=_interaction_row(aov_table)
    unadjusted_p.append(interaction['PR(>F)'])
    unadjusted_f.append(interaction['F'])
    #covariate-adjusted model
    rez=smf.ols('response~chem_vendor+vendor_microbiome+age_sq+BMI+HMG+LDL+sex+age+C(entero, Sum)*C(statins_binary,Sum)',data=df).fit()
    aov_table = sm.stats.anova_lm(rez, typ=3)
    interaction=_interaction_row(aov_table)
    cov_adjusted_p.append(interaction['PR(>F)'])
    cov_adjusted_f.append(interaction['F'])
    #covariate-adjusted model that additionally includes diabetes
    rez=smf.ols('response~vendor_microbiome+chem_vendor+age_sq+BMI+HMG+LDL+sex+age+diabetes+C(entero, Sum)*C(statins_binary,Sum)',data=df).fit()
    aov_table = sm.stats.anova_lm(rez, typ=3)
    interaction=_interaction_row(aov_table)
    cov_diab_adjusted_p.append(interaction['PR(>F)'])
    cov_diab_adjusted_f.append(interaction['F'])
Results=pd.DataFrame()
Results['measures']=measure
Results['unadjusted_interaction_pvalue']=unadjusted_p
Results['unadjusted_interaction_fvalue']=unadjusted_f
Results['cov_adjusted_interaction_pvalue']=cov_adjusted_p
Results['cov_adjusted_interaction_fvalue']=cov_adjusted_f
Results['cov_diab_adjusted_pvalue']=cov_diab_adjusted_p
Results['cov_diab_adjusted_fvalue']=cov_diab_adjusted_f
#save results to CSV file (presented in table 2)
Results.to_csv('interaction_metabolic_effects.csv')
Results

In [None]:
#calculate percent median difference between statin users and non-users across measures of glucose homeostasis
#Obtain p-value for the difference between groups using t-test (table 2)
percent=[]
measure=[]
enterotype=[]
p_value=[]
for x in ['HOMA_IR','insulin','glucose','hba1c']:
    df['response']=np.log(df[x])
    #covariate-adjusted log response: OLS residuals shifted by the mean of the log response
    df['response_corr']=smf.ols('response~chem_vendor+vendor_microbiome+age_sq+HMG+LDL+sex+age+BMI',data=df).fit().resid+df['response'].mean()
    for y in [1,2,3,4]:
        d=df[df['entero']==y]
        is_user=d['statins_binary']==1
        is_nonuser=d['statins_binary']==0
        measure.append(x)
        enterotype.append(y)
        #percent difference of the raw (unadjusted) medians
        percent.append((((d[x][is_user].median())/(d[x][is_nonuser].median()))-1)*100)
        #participants missing the response or a covariate have no adjusted value; they are
        #dropped before the t-test because a single NaN makes ttest_ind return NaN
        nonuser_values=d['response_corr'][is_nonuser].dropna()
        user_values=d['response_corr'][is_user].dropna()
        p_value.append(scipy.stats.ttest_ind(nonuser_values,user_values)[1])
results_posthoc=pd.DataFrame()
results_posthoc['measure']=measure
results_posthoc['enterotype']=enterotype
results_posthoc['percent_change']=percent
results_posthoc['p_value']=p_value
#print results
results_posthoc

In [None]:
#Modeling Statin-by-enterotype interaction effects on LDL cholesterol levels

#type-3 ANOVA with sum-coded factors, no covariates
rez=smf.ols('LDL~C(statins_binary,Sum)*C(entero, Sum)',data=df).fit()
aov_table = sm.stats.anova_lm(rez, typ=3)
print('cov. unadjusted',aov_table)
#same model with the covariates added
rez=smf.ols('LDL~chem_vendor+vendor_microbiome+age_sq+sex+age+BMI+C(statins_binary,Sum)*C(entero, Sum)',data=df).fit()
aov_table = sm.stats.anova_lm(rez, typ=3)
print('cov. adjusted',aov_table)
#covariate-adjusted LDL: OLS residuals shifted by the raw LDL mean
ldl_fit=smf.ols('LDL~chem_vendor+vendor_microbiome+age_sq+sex+age+BMI',data=df).fit()
df['LDL_corr']=ldl_fit.resid+df['LDL'].mean()
sns.set(font_scale=2.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[20,15], dpi=300)
#boxplot of adjusted LDL per enterotype, split by statin use
ax=sns.boxplot(x=df['entero'],y=df['LDL_corr'],hue=df['statins_binary'],palette=['lightgrey','yellowgreen'])
#recolour the boxes found at positions 1, 3 and 5 of ax.artists
for box_position,face_colour in [(1,'orange'),(3,'#C6E2FF'),(5,'#4682b4')]:
    ax.artists[box_position].set_facecolor(face_colour)
ax.legend_.remove()
#per enterotype: t-test, group sizes and relative difference in mean adjusted LDL
for x in [1,2,3,4]:
    ttest=df[(df['entero']==x) & df['LDL_corr'].notna()]
    nonusers=ttest.loc[ttest['statins_binary']==0,'LDL_corr']
    users=ttest.loc[ttest['statins_binary']==1,'LDL_corr']
    print(x)
    print(scipy.stats.ttest_ind(nonusers,users))
    print(ttest.groupby(by='statins_binary').size())
    #printed as one minus the ratio of means (it is not multiplied by 100)
    print('% decrease',1-(users.mean()/nonusers.mean()))

In [None]:
from tableone import TableOne

In [None]:
#Generate table S1: cohort characteristics stratified by statin use
columns = ['age','BMI','LDL','HOMA_IR','glucose','diabetes','sex','chem_vendor','vendor_microbiome','race']
categorical = ['diabetes','sex','chem_vendor','vendor_microbiome','race']
groupby = ['statins_binary']
nonnormal = ['HMG','HOMA_IR']
#every option is passed by keyword so it cannot land on a different parameter
#if the positional order of TableOne's arguments differs between tableone releases
tableS1 = TableOne(df, columns=columns, categorical=categorical, groupby=groupby, nonnormal=nonnormal, pval=False)
tableS1.to_csv('tableS1.csv')
tableS1

In [None]:
#Generate table S1 for the whole cohort (written to tableS1_whole.csv)
#constant column, so that grouping by it puts every participant into one group
df['all']=1
columns = ['age','BMI','LDL','HOMA_IR','glucose','diabetes','sex','chem_vendor','vendor_microbiome','race']
categorical = ['diabetes','sex','chem_vendor','vendor_microbiome','race']
nonnormal = ['HMG','HOMA_IR']
groupby=['all']
#every option is passed by keyword so it cannot land on a different parameter
#if the positional order of TableOne's arguments differs between tableone releases
tableS1 = TableOne(df, columns=columns, categorical=categorical, groupby=groupby, nonnormal=nonnormal, pval=False)
tableS1.to_csv('tableS1_whole.csv')
tableS1

In [None]:
#obesity prevalence in the cohort: rows with BMI>=30 divided by all rows of df
len(df.loc[df['BMI']>=30])/len(df)

In [None]:
sns.set(font_scale=2.00,context='poster',font='Arial',style='white')
plt.figure(figsize=[17,15], dpi=300)
df['log_HOMA']=np.log(df['HOMA_IR'])
df['HOMA_corr']=smf.ols('log_HOMA~chem_vendor+vendor_microbiome+age_sq+HMG+LDL+sex+age+BMI',data=df).fit().resid+df['log_HOMA'].mean()
ax=sns.boxplot(df['entero'],(df['HOMA_corr']),hue=df['statins_binary'],palette=['lightgrey','yellowgreen'])
# Select which box you want to change    
mybox = ax.artists[1]
# Change the appearance of that box
mybox.set_facecolor('orange')
mybox2 = ax.artists[3]
mybox2.set_facecolor('#C6E2FF')
mybox3 = ax.artists[5]
mybox3.set_facecolor('#4682b4')
ax.legend_.remove()
d=df[df['entero'].isnull()==False]
d['HOMA_corr'].isnull().sum()

In [None]:
#investigating obesity Bac.2 association using logistic regression
#bac2: 1 for enterotype 3, 0 for enterotypes 1/2/4, missing for anything else
#(np.nan is used because the np.NaN alias no longer exists in NumPy 2)
bac2=[1 if e==3 else 0 if e in [1,2,4] else np.nan for e in df['entero']]
df['bac2']=bac2
#ob: 1 for BMI>=30, 0 for BMI<30. A missing BMI satisfies neither comparison; it is
#kept as missing so the list always has one entry per row of df (without this branch
#the list came up short and the assignment below raised a length-mismatch error)
ob=[1 if b>=30 else 0 if b<30 else np.nan for b in df['BMI']]
df['ob']=ob
print(df[df['entero'].notnull()].groupby(by='ob').size())
#logistic regression of Bac.2 membership on obesity
formula='bac2~age+age_sq+sex+vendor_microbiome+ob'
res=sm.GLM.from_formula(formula,data=df, family=sm.families.Binomial()).fit()
params = res.params
conf = res.conf_int()
conf['Odds Ratio'] = params
#NOTE(review): conf_int() is called with its default alpha; confirm that the
#'5%'/'95%' names match the interval bounds that are actually returned
conf.columns = ['5%', '95%', 'Odds Ratio']
#exponentiate the log-odds coefficients and bounds to obtain odds ratios
print('Bac2~obesity',np.exp(conf))
print(res.pvalues)
#logistic regression of Bac.2 membership on statin use among obese participants
formula='bac2~age+age_sq+sex+LDL+vendor_microbiome+statins_binary'
res=sm.GLM.from_formula(formula,data=df[df['ob']==1], family=sm.families.Binomial()).fit()
params = res.params
conf = res.conf_int()
conf['Odds Ratio'] = params
conf.columns = ['5%', '95%', 'Odds Ratio']
print('Bac2~statin_use among obese',np.exp(conf))
res.pvalues

In [None]:
#Associations reported in Table 1
#z-score both alpha-diversity measures
df['Shannon_scaled']=(df['Shannon']-df['Shannon'].mean())/df['Shannon'].std()
df['Observed_scaled']=(df['Observed']-df['Observed'].mean())/df['Observed'].std()
#Bac.2 indicator: 1 for enterotype 3, 0 for enterotypes 1/2/4, any other value kept as is
bac2=[1 if e==3 else 0 if e in [1,2,4] else e for e in df['entero']]
df['bac_2']=bac2
#LDL cut-offs that define "at target"
targets=[100,70]
cut_off=[]
measure=[]
p=[]
OR=[]
lower=[]
diab=[]
upper=[]
for x in targets:
    #1 when LDL is below the cut-off, 0 when it is not. A missing LDL stays missing so
    #that participant is dropped from the model; it used to be coded 0 ("not at target")
    df['target']=(df['LDL']<x).astype(float).where(df['LDL'].notnull())
    for z in ['Shannon_scaled','Observed_scaled','bac_2']:
        df['response']=df[z]
        #logistic regression in statin users only
        formula='target~chem_vendor+age+BMI+sex+response'
        res=sm.GLM.from_formula(formula,data=df[df['statins_binary']==1], family=sm.families.Binomial()).fit()
        params = res.params
        conf = res.conf_int()
        conf['Odds Ratio'] = params
        #NOTE(review): conf_int() is called with its default alpha; confirm that the
        #'5%'/'95%' names match the interval bounds that are actually returned
        conf.columns = ['5%', '95%', 'Odds Ratio']
        #exponentiate log-odds to odds ratios
        conf=np.exp(conf)
        #the term of interest is read by its label ('response') rather than by position,
        #so the result does not depend on how many columns the covariates expand to
        conf['p-value']=res.pvalues['response']
        print(z,x,conf.loc[['response']])
        cut_off.append(x)
        measure.append(z)
        lower.append(conf.loc['response','5%'])
        upper.append(conf.loc['response','95%'])
        p.append(res.pvalues['response'])
        OR.append(conf.loc['response','Odds Ratio'])
        #0 marks models without diabetes adjustment
        diab.append(0)
results_Logit=pd.DataFrame()
results_Logit['measure']=measure
results_Logit['LDL_cut_off']=cut_off
results_Logit['p-value']=p
results_Logit['OR']=OR
results_Logit['5%']=lower
results_Logit['95%']=upper

In [None]:
#Table 1 models repeated with vendor_microbiome and diabetes added as covariates.
#The results are appended to the lists (cut_off, measure, p, OR, lower, upper, diab)
#that already exist when this cell runs; the lists are not reset here, so running
#this cell twice in a row duplicates its rows.
for x in targets:
    #1 when LDL is below the cut-off, 0 when it is not. A missing LDL stays missing so
    #that participant is dropped from the model; it used to be coded 0 ("not at target")
    df['target']=(df['LDL']<x).astype(float).where(df['LDL'].notnull())
    for z in ['Shannon_scaled','Observed_scaled','bac_2']:
        df['response']=df[z]
        #logistic regression in statin users only
        formula='target~chem_vendor+vendor_microbiome+age+BMI+diabetes+sex+response'
        res=sm.GLM.from_formula(formula,data=df[df['statins_binary']==1], family=sm.families.Binomial()).fit()
        params = res.params
        conf = res.conf_int()
        conf['Odds Ratio'] = params
        #NOTE(review): conf_int() is called with its default alpha; confirm that the
        #'5%'/'95%' names match the interval bounds that are actually returned
        conf.columns = ['5%', '95%', 'Odds Ratio']
        #exponentiate log-odds to odds ratios
        conf=np.exp(conf)
        #the term of interest is read by its label ('response') rather than by position,
        #so the result does not depend on how many columns the covariates expand to
        conf['p-value']=res.pvalues['response']
        print(z,x,conf.loc[['response']])
        cut_off.append(x)
        measure.append(z)
        lower.append(conf.loc['response','5%'])
        upper.append(conf.loc['response','95%'])
        p.append(res.pvalues['response'])
        OR.append(conf.loc['response','Odds Ratio'])
        #1 marks models with diabetes adjustment
        diab.append(1)
results_Logit=pd.DataFrame()
results_Logit['measure']=measure
results_Logit['LDL_cut_off']=cut_off
results_Logit['p-value']=p
results_Logit['OR']=OR
results_Logit['5%']=lower
results_Logit['95%']=upper
results_Logit['diab_adjustment']=diab
results_Logit