In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import re
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
np.random.seed(42)

# INCORECT METHOD NOTICE

Standard scaler was applied to all the data prior to splitting it. This is an incorrect approach and affects all downstream analysis. 

I do not have time to rerun correctly


In [23]:
Genes = pd.read_csv("ClusterDataset.csv")
Genes = Genes[Genes["Name"]!="gene_variance"]

In [24]:
Subjects = pd.read_csv("SubjectsMeta.txt",sep='\t')

In [None]:
#get all genecolumns 
gene_cols = [g for g in Genes.columns if g.startswith("ENSG")]
#Remove gene column and features column convert to numpy array
gene_array = Genes.iloc[:,1:-1].to_numpy()
norm_genes = StandardScaler().fit_transform(gene_array)
data_genes = pd.DataFrame(norm_genes,columns=gene_cols)
data_genes["Name"]=Genes["Name"]
data_genes["tissue"]=Genes["tissue"]


In [26]:
#use regex to get subjid take GTEX followed by sep char then match 1 or more char that are not that sep
pattern = r'^(GTEX\x2D[^\x2D]+)'
data_genes["SUBJID"] = data_genes["Name"].apply(lambda x: re.match(pattern,x))
data_genes["SUBJID"] = data_genes["SUBJID"].apply(lambda x: x.group(1))
#Merge gene and subject data
GeneSubj = pd.merge(data_genes,Subjects[["SUBJID","SEX","AGE"]],on="SUBJID")

In [27]:
#Create continuous variable for age by taking average
def getavgAge(x):
    low,high=x.split("-")
    avg=(int(low)+int(high))/2
    return avg
GeneSubj["age_continuous"] = GeneSubj["AGE"].apply(lambda x: getavgAge(x))

In [28]:
GeneSubj.head(2)

Unnamed: 0,ENSG00000244734.3,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,ENSG00000188536.12,ENSG00000198899.2,ENSG00000198886.2,ENSG00000275896.5,ENSG00000163220.10,...,ENSG00000134291.11,ENSG00000183578.6,ENSG00000164091.11,ENSG00000173418.11,Name,tissue,SUBJID,SEX,AGE,age_continuous
0,-0.189282,-0.638404,-0.912095,-1.14554,-0.795922,-0.192404,-1.089657,-1.153692,-0.132456,-0.248378,...,-0.841136,0.994222,0.596135,0.676412,GTEX-1117F-0226-SM-5GZZ7,Adipose Tissue,GTEX-1117F,2,60-69,64.5
1,-0.192726,-0.486328,0.055921,0.109423,1.286259,-0.194876,0.89027,-0.001923,-0.129289,-0.248983,...,-1.181526,-0.709254,-0.593598,1.388128,GTEX-1117F-0426-SM-5EGHI,Muscle,GTEX-1117F,2,60-69,64.5


## ALL TISSUE AGE REGRESSION

In [None]:
#Create dataset for age regressor
AgeRegressor = GeneSubj.drop(columns=["Name","tissue","SUBJID","SEX","AGE"]).dropna().copy()
X = AgeRegressor.iloc[:,:-1] #remove age_continuous col
y_age = AgeRegressor["age_continuous"]

#use stratify to preserve tissue distribition in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y_age, test_size=0.2, stratify=y_age, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

### Dummy Regressor

Change classifier code fromtissue classifier to work for regression

In [34]:
print("DummyRegressor Training and Testing...")
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)
print("Dummy MAE", mean_absolute_error(y_test, pred_dummy))
print("Dummy RMSE", np.sqrt(mean_squared_error(y_test, pred_dummy)))
print("Dummy R2", r2_score(y_test, pred_dummy))

DummyRegressor Training and Testing...
Dummy MAE 9.900990502440813
Dummy RMSE 12.645937171974662
Dummy R2 -2.983029834258133e-07


### Random Forest Regressor

In [36]:
print("RadnomForest Training and Testing...")
rf = RandomForestRegressor(random_state=42, n_jobs=1)
rf_grid = {"n_estimators":[200,800],"max_depth":[None,10],"max_features":["sqrt"],"bootstrap":[True]}
rf_search = GridSearchCV(rf, rf_grid, scoring="r2", cv=cv, n_jobs=104, return_train_score=True, verbose=3)
rf_search.fit(X_train, y_train)
best_rf = rf_search.best_estimator_
pred_rf = best_rf.predict(X_test)
print("RF best params", rf_search.best_params_)
print("RF MAE", mean_absolute_error(y_test, pred_rf))
print("RF RMSE", np.sqrt(mean_squared_error(y_test, pred_rf)))
print("RF R2", r2_score(y_test, pred_rf))

RadnomForest Training and Testing...
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[CV 4/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.485, test=0.177) total time= 1.6min
[CV 3/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.480, test=0.186) total time= 1.7min
[CV 5/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.486, test=0.179) total time= 1.7min
[CV 1/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.479, test=0.178) total time= 1.7min
[CV 2/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.481, test=0.173) total time= 1.7min
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, n_estimators=200;, score=(train=0.892, test=0.208) total time= 2.7min
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, n_estimators=200;, score=(train=0.891, test=0.206) total time= 2.7min
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, n_estima

### AdaBoost Regressor

In [37]:
print("AdaBoost Training and Testing...")
stump = DecisionTreeRegressor(max_depth=1, random_state=42)
ada = AdaBoostRegressor(estimator=stump, random_state=42)
ada_grid = {"n_estimators": [3, 50],"learning_rate": [0.1, 0.3]}
ada_search = GridSearchCV(ada, ada_grid, scoring="r2", cv=cv, n_jobs=104, return_train_score=True, verbose=3)
ada_search.fit(X_train, y_train)
best_ada = ada_search.best_estimator_
pred_ada = best_ada.predict(X_test)
print("Ada best params", ada_search.best_params_)
print("Ada MAE", mean_absolute_error(y_test, pred_ada))
print("Ada RMSE", np.sqrt(mean_squared_error(y_test, pred_ada)))
print("Ada R2", r2_score(y_test, pred_ada))

AdaBoost Training and Testing...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 5/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.051, test=0.047) total time=  20.4s
[CV 1/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.051, test=0.048) total time=  20.5s
[CV 4/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.050, test=0.037) total time=  20.5s
[CV 3/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.042, test=0.036) total time=  20.5s
[CV 5/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.057, test=0.051) total time=  20.6s
[CV 2/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.054, test=0.050) total time=  20.6s
[CV 2/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.056, test=0.051) total time=  20.8s
[CV 1/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.059, test=0.057) total time=  20.8s
[CV 4/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.043, test=0.029) total time=  21.5s
[CV 3/5

# BLOOD ONLY REGRESSION

In [38]:
#Create dataset for age regressor
BloodOnly = GeneSubj[GeneSubj["tissue"]=="Blood"]
BloodOnly = BloodOnly.drop(columns=["Name","tissue","SUBJID","SEX","AGE"]).dropna().copy()
X = BloodOnly.iloc[:,:-1] #remove age_continuous col
y_age = BloodOnly["age_continuous"]

#use stratify to preserve tissue distribition in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y_age, test_size=0.2, stratify=y_age, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [39]:
print("BloodOnly DummyRegressor Training and Testing...")
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)
print("Dummy MAE", mean_absolute_error(y_test, pred_dummy))
print("Dummy RMSE", np.sqrt(mean_squared_error(y_test, pred_dummy)))
print("Dummy R2", r2_score(y_test, pred_dummy))

BloodOnly DummyRegressor Training and Testing...
Dummy MAE 10.573163142737231
Dummy RMSE 13.036766267507994
Dummy R2 -2.4129801332151146e-05


In [40]:
print("BloodOnly RandomForest Training and Testing...")
rf = RandomForestRegressor(random_state=42, n_jobs=1)
rf_grid = {"n_estimators":[200,800],"max_depth":[None,10],"max_features":["sqrt"],"bootstrap":[True]}
rf_search = GridSearchCV(rf, rf_grid, scoring="r2", cv=cv, n_jobs=104, return_train_score=True, verbose=3)
rf_search.fit(X_train, y_train)
best_rf = rf_search.best_estimator_
pred_rf = best_rf.predict(X_test)
print("RF best params", rf_search.best_params_)
print("RF MAE", mean_absolute_error(y_test, pred_rf))
print("RF RMSE", np.sqrt(mean_squared_error(y_test, pred_rf)))
print("RF R2", r2_score(y_test, pred_rf))

BloodOnly RandomForest Training and Testing...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.857, test=0.110) total time=   4.3s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.842, test=0.200) total time=   4.3s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.844, test=0.234) total time=   4.3s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.853, test=0.189) total time=   4.4s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=sqrt, n_estimators=200;, score=(train=0.855, test=0.133) total time=   4.4s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, n_estimators=200;, score=(train=0.887, test=0.194) total time=   4.6s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, n_estimators=200;, score=(train=0.8

In [41]:
print("BloodOnly Training and Testing...")
stump = DecisionTreeRegressor(max_depth=1, random_state=42)
ada = AdaBoostRegressor(estimator=stump, random_state=42)
ada_grid = {"n_estimators": [3, 50],"learning_rate": [0.1, 0.3]}
ada_search = GridSearchCV(ada, ada_grid, scoring="r2", cv=cv, n_jobs=104, return_train_score=True, verbose=3)
ada_search.fit(X_train, y_train)
best_ada = ada_search.best_estimator_
pred_ada = best_ada.predict(X_test)
print("Ada best params", ada_search.best_params_)
print("Ada MAE", mean_absolute_error(y_test, pred_ada))
print("Ada RMSE", np.sqrt(mean_squared_error(y_test, pred_ada)))
print("Ada R2", r2_score(y_test, pred_ada))

BloodOnly Training and Testing...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 4/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.136, test=0.056) total time=   0.9s
[CV 2/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.142, test=-0.017) total time=   0.9s
[CV 4/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.099, test=0.048) total time=   0.9s
[CV 5/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.098, test=0.048) total time=   0.9s
[CV 3/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.112, test=0.081) total time=   0.9s
[CV 5/5] END learning_rate=0.1, n_estimators=3;, score=(train=0.116, test=0.161) total time=   0.9s
[CV 1/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.140, test=0.056) total time=   0.9s
[CV 3/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.123, test=0.083) total time=   0.9s
[CV 2/5] END learning_rate=0.3, n_estimators=3;, score=(train=0.146, test=-0.036) total time=   0.9s
[CV 

Interpretation:

Both all gene models and Blood only regressors for predicting age do not perform amazingly, however, the age bins are quite broad which certainly affects time resolution for this task. Predicting within a decade is somewhat reasonable given the age bins, but do not perform that much better than the dummy model predicting the mean.