In [16]:
# Run order: independent
# Required input files: 'chemistries_git.csv', 'chemistries_val_git.csv',
# 'cleaned_proteomics.csv', 'proteomics_validation_impute.csv', 'data_discovery.csv'
# Generated output files: none

In [2]:
# Load libraries
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn import model_selection
import seaborn as sns
from string import ascii_letters
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV,LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
# Importing data
# `pd.DataFrame.from_csv` was removed in pandas 1.0. The documented replacement
# is `pd.read_csv(..., index_col=0, parse_dates=True)`, which reproduces the
# old defaults (first column becomes the index).
chemistries_discovery = pd.read_csv('chemistries_git.csv', index_col=0, parse_dates=True)
chemistries_discovery.index = chemistries_discovery.index.astype('float64')
chemistries_validation = pd.read_csv('chemistries_val_git.csv', index_col=0, parse_dates=True)
prot = pd.read_csv('cleaned_proteomics.csv', index_col=0, parse_dates=True)
prot.index = prot.index.astype('float64')
prot_validation = pd.read_csv('proteomics_validation_impute.csv', index_col=0, parse_dates=True)
prot_validation.index = prot_validation.index.astype('float64')
# Metabolomics
discovery_mets = pd.read_csv('data_discovery.csv', index_col=0, parse_dates=True)
discovery_mets.index = discovery_mets.index.astype('float64')
# Quick size check of every table that was loaded
print (discovery_mets.shape)
print (len(prot))
print (len(prot_validation))
print (len(chemistries_discovery))
print (len(chemistries_validation))
# Remove the 'sex' and 'age' columns from the proteomics table. The keyword
# form is used because the positional `axis` argument of `drop` was removed
# in pandas 2.0.
prot.drop(columns=['sex', 'age'], inplace=True)
# Attach the 'shannon' column from the discovery chemistries; pandas aligns
# the values on the (float64) index.
prot['shannon'] = chemistries_discovery['shannon']

(399, 665)
262
176
399
540


In [4]:
# Scale and standardize chemistries (zero mean / unit variance per column)

# Prediction target: the 'shannon' column of each cohort
y = chemistries_discovery['shannon']
y_validation = chemistries_validation['shannon']

# Predictors: every column except 'shannon'
C = chemistries_discovery.loc[:, chemistries_discovery.columns != 'shannon']
C_validation = chemistries_validation.loc[:, chemistries_validation.columns != 'shannon']

scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

# Discovery cohort: keep the labels so the scaled array can be re-wrapped
Ccolumns, Cindex = C.columns, C.index
C = pd.DataFrame(data=scaler.fit_transform(C), columns=Ccolumns, index=Cindex)

# Validation cohort: the scaler is re-fit here, so this cohort is standardized
# with its own column statistics rather than those of the discovery cohort.
Ccolumns, Cindex = C_validation.columns, C_validation.index
C_validation = pd.DataFrame(data=scaler.fit_transform(C_validation), columns=Ccolumns, index=Cindex)

In [5]:
# Clinical labs -> shannon, discovery cohort.
# Candidate ridge penalties searched by RidgeCV.
alphas = np.linspace(1, 1000, 200)
# The `normalize` keyword is no longer passed: it was removed from the linear
# models in scikit-learn 1.2 (passing it raises TypeError). Its previous value
# here was False, the default, so the fitted models are unchanged.
lassocv = LassoCV(eps=0.175, n_alphas=200, alphas=None, fit_intercept=True, precompute='auto', cv=10)
ridgecv = RidgeCV(alphas=alphas, fit_intercept=True, cv=10)
# Outer 10-fold cross-validation; each estimator tunes its penalty with its
# own inner 10-fold CV on the training part of every outer fold.
clinical_lasso = cross_val_score(lassocv, C, y, cv=10)
print ('LASSO Clinical Labs mean R2 discovery', np.mean(clinical_lasso))
print ('LASSO Clinical Labs std.dev. R2 discovery', np.std(clinical_lasso))
clinical_ridge = cross_val_score(ridgecv, C, y, cv=10)
print ('Ridge Clinical Labs mean R2 discovery', np.mean(clinical_ridge))
print ('Ridge Clinical Labs std.dev. R2 discovery', np.std(clinical_ridge))

LASSO Clinical Labs mean R2 discovery 0.014287995440638645
LASSO Clinical Labs std.dev. R2 discovery 0.07241550295868539
Ridge Clinical Labs mean R2 discovery 0.0497631171835638
Ridge Clinical Labs std.dev. R2 discovery 0.06413338689515356


In [6]:
# Clinical labs -> shannon, validation cohort: 10-fold cross-validated scores
# using the LASSO and ridge estimators configured earlier in the notebook.
clinical_lasso_validation = cross_val_score(lassocv, C_validation, y_validation, cv=10)
print('LASSO Clinical Labs mean R2 validation cohort', clinical_lasso_validation.mean())
print('LASSO Clinical Labs std.dev. R2 validation cohort', clinical_lasso_validation.std())

clinical_ridge_validation = cross_val_score(ridgecv, C_validation, y_validation, cv=10)
print('Ridge Clinical Labs mean R2 validation cohort', clinical_ridge_validation.mean())
print('Ridge Clinical Labs std.dev. R2 validation cohort', clinical_ridge_validation.std())

LASSO Clinical Labs mean R2 validation cohort 0.052603915098809
LASSO Clinical Labs std.dev. R2 validation cohort 0.07047866142625181
Ridge Clinical Labs mean R2 validation cohort 0.08117559470601632
Ridge Clinical Labs std.dev. R2 validation cohort 0.06507843754268144


In [7]:
# Scale and standardize proteomics (zero mean / unit variance per column)

# Targets
py = prot['shannon']
y = chemistries_discovery['shannon']
y_validation = prot_validation['shannon']

# Predictors: every proteomics column except 'shannon'
p = prot.loc[:, prot.columns != 'shannon']
p_validation = prot_validation.loc[:, prot_validation.columns != 'shannon']

scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

# Discovery cohort
pcolumns, pindex = p.columns, p.index
p = pd.DataFrame(data=scaler.fit_transform(p), columns=pcolumns, index=pindex)
print(p.shape)

# Validation cohort: the scaler is re-fit on this cohort's own data.
# NOTE: Ccolumns / Cindex are reused here for the proteomics validation
# labels even though the names suggest the chemistries table.
Ccolumns, Cindex = p_validation.columns, p_validation.index
p_validation = pd.DataFrame(data=scaler.fit_transform(p_validation), columns=Ccolumns, index=Cindex)

(262, 263)


In [8]:
# Proteomics -> shannon, discovery cohort: 10-fold cross-validated scores
# using the LASSO and ridge estimators configured earlier in the notebook.
p_lasso = cross_val_score(lassocv, p, py, cv=10)
print('LASSO Proteomics mean R2 discovery cohort', p_lasso.mean())
print('LASSO Proteomics std dev. R2 discovery cohort', p_lasso.std())

p_ridge = cross_val_score(ridgecv, p, py, cv=10)
print('Ridge Proteomics mean R2 discovery cohort', p_ridge.mean())
print('Ridge Proteomics std dev. R2 discovery cohort', p_ridge.std())

LASSO Proteomics mean R2 discovery cohort 0.10675946100201453
LASSO Proteomics std dev. R2 discovery cohort 0.0705244710319541
Ridge Proteomics mean R2 discovery cohort 0.1311276853584154
Ridge Proteomics std dev. R2 discovery cohort 0.07370285412511188


In [9]:
# Proteomics -> shannon, validation cohort.
# From here on the ridge penalty grid is narrowed to [800, 1000]; later cells
# that build RidgeCV from `alphas` pick up this grid.
alphas = np.linspace(800, 1000, 200)
# `normalize` is omitted: the keyword was removed in scikit-learn 1.2 and the
# value used previously (False) was the default, so behaviour is unchanged.
ridgecv = RidgeCV(alphas=alphas, fit_intercept=True, cv=10)
lassocv = LassoCV(eps=0.175, n_alphas=200, alphas=None, fit_intercept=True, precompute='auto', cv=10)
p_lasso_validation = cross_val_score(lassocv, p_validation, y_validation, cv=10)
print ('LASSO Proteomics mean R2 validation cohort', np.mean(p_lasso_validation))
print ('LASSO Proteomics std dev. R2 validation cohort', np.std(p_lasso_validation))
p_ridge_validation = cross_val_score(ridgecv, p_validation, y_validation, cv=10)
print ('Ridge Proteomics mean R2 validation cohort', np.mean(p_ridge_validation))
print ('Ridge Proteomics std dev. R2 validation cohort', np.std(p_ridge_validation))

LASSO Proteomics mean R2 validation cohort -0.06853768285695397
LASSO Proteomics std dev. R2 validation cohort 0.23708603639510745
Ridge Proteomics mean R2 validation cohort -0.0007766766339359695
Ridge Proteomics std dev. R2 validation cohort 0.16269620300439858


In [10]:
# Joining clinical labs and proteomics.
# 'shannon' is dropped from the proteomics side so that the merged frame gets a
# single 'shannon' column (from the chemistries). `columns=` is used because
# the positional `axis` argument of `drop` was removed in pandas 2.0.
merged = pd.merge(left=prot.drop(columns=['shannon']), left_index=True,
                  right=chemistries_discovery, right_index=True,
                  how='inner')
# Predictors are every column except 'shannon'; 'shannon' is the target.
merged_scaled = merged.loc[:, merged.columns != 'shannon']
y_merged = merged['shannon']
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
# Keep the labels: fit_transform returns a bare array.
mcolumns = merged_scaled.columns
mindex = merged_scaled.index
M = scaler.fit_transform(merged_scaled)
merged_scaled = pd.DataFrame(data=M, columns=mcolumns, index=mindex)
merged_scaled.shape

(262, 340)

In [11]:
# Running LASSO and Ridge on the merged proteomics + clinical labs dataset.
# `normalize` is omitted: the keyword was removed in scikit-learn 1.2 and the
# value used previously (False) was the default, so behaviour is unchanged.
ridgecv = RidgeCV(alphas=alphas, fit_intercept=True, cv=10)
lassocv = LassoCV(eps=0.175, n_alphas=200, alphas=None, fit_intercept=True, precompute='auto', cv=10)
merged_lasso = cross_val_score(lassocv, merged_scaled, y_merged, cv=10)
print ('LASSO merged mean R2 discovery cohort', np.mean(merged_lasso))
print ('LASSO merged std dev. R2 discovery cohort', np.std(merged_lasso))
merged_ridge = cross_val_score(ridgecv, merged_scaled, y_merged, cv=10)
print ('Ridge merged mean R2 discovery cohort', np.mean(merged_ridge))
print ('Ridge merged std dev. R2 discovery cohort', np.std(merged_ridge))

LASSO merged mean R2 discovery cohort 0.08854577717811193
LASSO merged std dev. R2 discovery cohort 0.06707821656138153
Ridge merged mean R2 discovery cohort 0.12776704341196027
Ridge merged std dev. R2 discovery cohort 0.06556681826225483


In [12]:
# Metabolites and clinical labs.
# Only the first 659 columns of `discovery_mets` are used -- presumably the
# metabolite measurements, with the remaining columns being metadata
# (TODO confirm against 'data_discovery.csv').
mets_clinical = pd.merge(left=discovery_mets[discovery_mets.columns[0:659]], left_index=True,
                  right=chemistries_discovery, right_index=True,
                  how='inner')
# Predictors are every column except 'shannon'; 'shannon' is the target.
merged_metsclin = mets_clinical.loc[:, mets_clinical.columns != 'shannon']
y_merged = mets_clinical['shannon']
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
# Keep the labels: fit_transform returns a bare array.
mcolumns = merged_metsclin.columns
mindex = merged_metsclin.index
M = scaler.fit_transform(merged_metsclin)
merged_metsclin = pd.DataFrame(data=M, columns=mcolumns, index=mindex)
print (merged_metsclin.shape)
# `normalize` is omitted: the keyword was removed in scikit-learn 1.2 and the
# value used previously (False) was the default, so behaviour is unchanged.
# Note the smaller LASSO `eps` (0.015) compared with the earlier cells (0.175).
ridgecv = RidgeCV(alphas=alphas, fit_intercept=True, cv=10)
lassocv = LassoCV(eps=0.015, n_alphas=200, alphas=None, fit_intercept=True, precompute='auto', cv=10)
CM_merged_lasso = cross_val_score(lassocv, merged_metsclin, y_merged, cv=10)
print ('LASSO merged mets and clinical labs mean R2 discovery cohort', np.mean(CM_merged_lasso))
print ('LASSO merged mets and clinical labs std dev. R2 discovery cohort', np.std(CM_merged_lasso))
CM_merged_ridge = cross_val_score(ridgecv, merged_metsclin, y_merged, cv=10)
print ('Ridge merged mets and clinical labs mean R2 discovery cohort', np.mean(CM_merged_ridge))
print ('Ridge merged mets and clinical labs std dev. R2 discovery cohort', np.std(CM_merged_ridge))

(399, 736)
LASSO merged mets and clinical labs mean R2 discovery cohort 0.4469113773877204
LASSO merged mets and clinical labs std dev. R2 discovery cohort 0.10532544715046238
Ridge merged mets and clinical labs mean R2 discovery cohort 0.34084265214223053
Ridge merged mets and clinical labs std dev. R2 discovery cohort 0.1161446286583113


In [13]:
# Proteins and metabolites: inner join on the shared index.
# `prot` still carries its 'shannon' column, which becomes the target below.
mets_prots = pd.merge(
    left=discovery_mets[discovery_mets.columns[0:659]],
    right=prot,
    left_index=True,
    right_index=True,
    how='inner',
)

# Target first, then the predictor matrix (everything except 'shannon')
y_merged = mets_prots['shannon']
mets_prots_scaled = mets_prots.loc[:, mets_prots.columns != 'shannon']

# Standardize the predictors and restore the row / column labels
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
mcolumns, mindex = mets_prots_scaled.columns, mets_prots_scaled.index
Mp = scaler.fit_transform(mets_prots_scaled)
mets_prots_scaled = pd.DataFrame(data=Mp, columns=mcolumns, index=mindex)
print(mets_prots_scaled.shape)

(262, 922)


In [14]:
# LASSO on the merged metabolites + proteins dataset.
# `normalize` is omitted: the keyword was removed in scikit-learn 1.2 and the
# value used previously (False) was the default, so behaviour is unchanged.
# `ridgecv` is rebuilt here but only the LASSO model is scored in this cell.
ridgecv = RidgeCV(alphas=alphas, fit_intercept=True, cv=10)
lassocv = LassoCV(eps=0.015, n_alphas=200, alphas=None, fit_intercept=True, precompute='auto', cv=10)
merged_prot_mets = cross_val_score(lassocv, mets_prots_scaled, y_merged, cv=10)
print ('LASSO mets+prots  mean R2 discovery cohort', np.mean(merged_prot_mets))
print ('LASSO mets+prots std dev. R2 discovery cohort', np.std(merged_prot_mets))

LASSO mets+prots  mean R2 discovery cohort 0.39649531503559776
LASSO mets+prots std dev. R2 discovery cohort 0.08682685882678061


In [15]:
# Proteins, metabolites, and clinical labs.
# 'shannon' is dropped from the proteomics side so the merged frame keeps the
# single 'shannon' column already present in `mets_clinical`. `columns=` is
# used because the positional `axis` argument of `drop` was removed in
# pandas 2.0.
merged = pd.merge(left=prot.drop(columns=['shannon']), left_index=True,
                  right=mets_clinical, right_index=True,
                  how='inner')
# Predictors are every column except 'shannon'; 'shannon' is the target.
merged_scaled = merged.loc[:, merged.columns != 'shannon']
y_merged = merged['shannon']
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
# Keep the labels: fit_transform returns a bare array.
mcolumns = merged_scaled.columns
mindex = merged_scaled.index
M = scaler.fit_transform(merged_scaled)
merged_scaled = pd.DataFrame(data=M, columns=mcolumns, index=mindex)
print (merged_scaled.shape)
# `normalize` is omitted: the keyword was removed in scikit-learn 1.2 and the
# value used previously (False) was the default, so behaviour is unchanged.
ridgecv = RidgeCV(alphas=alphas, fit_intercept=True, cv=10)
lassocv = LassoCV(eps=0.015, n_alphas=200, alphas=None, fit_intercept=True, precompute='auto', cv=10)
merged_lasso_2 = cross_val_score(lassocv, merged_scaled, y_merged, cv=10)
print ('LASSO merged mean R2 discovery cohort', np.mean(merged_lasso_2))
print ('LASSO merged std dev. R2 discovery cohort', np.std(merged_lasso_2))
merged_ridge = cross_val_score(ridgecv, merged_scaled, y_merged, cv=10)
print ('Ridge merged mean R2 discovery cohort', np.mean(merged_ridge))
print ('Ridge merged std dev. R2 discovery cohort', np.std(merged_ridge))

(262, 999)
LASSO merged mean R2 discovery cohort 0.3970910641433278
LASSO merged std dev. R2 discovery cohort 0.08707907416569685
Ridge merged mean R2 discovery cohort 0.34951748682927686
Ridge merged std dev. R2 discovery cohort 0.08965138790911023
