In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

In [3]:
# Load the phenotype_and_GPS table (the first CSV row supplies the column names)
# and cast the KEY column to str in the same expression.
my_data = (
    pd.read_csv('/home/ubuntu/phenotype_and_GPS.csv', header=0)
    .astype({'KEY': str})
)

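#### Missing values (NaN) are replaced with the column mean. Open question: is mean imputation appropriate here, or should it be replaced with another strategy?

Note: the means are computed over all rows before cross-validation, so the held-out folds contribute to the imputed values.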

In [4]:
# Replace every NaN in a numeric column with that column's mean.
# numeric_only=True is required because the frame also holds the string column
# KEY: on pandas >= 2.0 a bare DataFrame.mean() raises TypeError on it instead of
# silently skipping it. Rebinding (rather than inplace=True) keeps the cell
# safe to re-run and avoids mutating a frame that earlier cells displayed.
my_data = my_data.fillna(my_data.mean(numeric_only=True))

In [5]:
# Show the first five rows of the table after imputation as a sanity check.
my_data.head(n=5)

Unnamed: 0,KEY,AD,DEPRESSION_SUB,IQ,NEUROTICISM,WORRY_SUB,INSOMNIA,PTSD,SNORING,CP,...,income,married,abcd_site,vol,BMI,NIH_totcomp,NIH_flucomp,CBCL_TotProb,CBCL_Internal,CBCL_External
0,NDARINV003RTV85,-5.78016,0.002583,-2.9176,0.013433,-0.000821,-0.307129,0.0,0.063292,-2.156561,...,8.0,1.0,6,-1.050932,20.480539,0.632147,0.605564,-0.789382,-0.731957,-0.589077
1,NDARINV007W6H7B,9.133668,0.00206,0.369931,6.4e-05,-0.001469,0.43852,0.017525,0.021098,-3.683131,...,10.0,1.0,22,1.293196,18.234286,1.069812,0.887103,0.268149,0.534371,-0.418541
2,NDARINV00BD7VDC,2.05918,0.006631,-4.261953,0.012193,-0.016278,0.071778,0.010899,0.037388,-4.160142,...,10.0,1.0,7,1.129021,16.3298,0.632147,1.544026,-0.065808,-0.008341,-0.248005
3,NDARINV00J52GPG,8.591499,0.004259,-2.582771,0.013623,0.000487,-0.328521,0.019801,0.022991,-4.682147,...,6.0,1.522359,17,0.020277,19.061896,0.329044,0.237332,-0.399765,-0.912861,-0.589077
4,NDARINV00NPMHND,3.104598,0.000878,-3.145087,0.013695,-0.013129,0.014641,0.0,-0.008245,-4.843486,...,8.0,1.0,17,0.663349,17.663472,-0.680849,-1.365205,1.32568,3.247931,-0.248005


#### CONSTANTS:

In [6]:
# Fold count used to report the per-fold sample size; the outer KFold splitter
# defined further down is configured with the same number of splits.
k_fold = 10

num_samples = len(my_data)

# Size of the largest fold, i.e. ceil(num_samples / k_fold), computed in integer
# arithmetic. The former int(num_samples / k_fold) + 1 over-counted by one
# whenever num_samples was an exact multiple of k_fold.
each_fold_size = (num_samples + k_fold - 1) // k_fold

print('num_samples: ', num_samples)
print('fold_size: ', each_fold_size)

num_samples:  4567
fold_size:  457


#### PARAMETERS (feature matrix X and target y):

In [7]:
# Feature matrix: the columns at positions 1 through 25 of my_data
# (position 0 is skipped; in the preview above that is the KEY identifier).
feature_positions = slice(1, 26)
X = my_data.iloc[:, feature_positions]
X.head(n=5)

Unnamed: 0,AD,DEPRESSION_SUB,IQ,NEUROTICISM,WORRY_SUB,INSOMNIA,PTSD,SNORING,CP,excl23andMe,...,ASP,DRINK,RISK4PC,RISKTOL,SMOKER,CANNABIS,GENERALHAPPINESS,GENERALHAPPINESS_HEALTH,GENERALHAPPINESS_MEANINGFUL,HAPPINESS
0,-5.78016,0.002583,-2.9176,0.013433,-0.000821,-0.307129,0.0,0.063292,-2.156561,-0.248034,...,-0.583751,-0.106358,0.233276,-0.363054,0.022038,-5.988192,0.031204,0.748719,-0.001207,0.002318
1,9.133668,0.00206,0.369931,6.4e-05,-0.001469,0.43852,0.017525,0.021098,-3.683131,-1.858655,...,-0.544655,-0.161727,0.130802,-0.320289,-0.169713,-3.063278,0.019538,0.543616,-0.017617,-0.000128
2,2.05918,0.006631,-4.261953,0.012193,-0.016278,0.071778,0.010899,0.037388,-4.160142,-2.382114,...,-0.601716,-0.402164,-0.271698,0.390031,-0.08881,-10.187523,0.062408,1.125374,0.000814,-0.001354
3,8.591499,0.004259,-2.582771,0.013623,0.000487,-0.328521,0.019801,0.022991,-4.682147,-1.21285,...,-0.068964,-0.947712,-0.18395,-0.037689,-0.110845,-0.944047,0.031058,0.541087,-0.00315,0.001577
4,3.104598,0.000878,-3.145087,0.013695,-0.013129,0.014641,0.0,-0.008245,-4.843486,-1.088557,...,-1.211318,-0.997582,-0.07491,0.404019,-0.183223,-10.269615,0.0,0.427903,-0.01377,-0.000256


In [8]:
# Prediction target: the NIH_totcomp column, selected by label as a Series.
y = my_data.loc[:, 'NIH_totcomp']
y.head(n=5)

0    0.632147
1    1.069812
2    0.632147
3    0.329044
4   -0.680849
Name: NIH_totcomp, dtype: float64

In [9]:
# Convert features and target to plain NumPy arrays so the fold indices produced
# by the splitters can be used for positional indexing.
# np.asarray accepts both pandas objects and ndarrays, so this cell can be re-run
# after X / y have already been converted; calling .to_numpy() a second time
# would fail because an ndarray has no such method.
X = np.asarray(X)
y = np.asarray(y).ravel()

In [10]:
# Splitters for nested cross-validation: 5 inner folds for hyper-parameter
# selection, 10 outer folds for the held-out performance estimate.
# random_state is intentionally not passed: with shuffle=False it has no effect,
# and scikit-learn >= 0.24 raises ValueError for that combination. Unshuffled
# KFold is already deterministic (contiguous blocks of rows), so the folds are
# the same as before. To randomise the folds instead, set shuffle=True together
# with a random_state.
inner_cv = KFold(n_splits=5, shuffle=False)
outer_cv = KFold(n_splits=10, shuffle=False)

In [11]:
# Nested cross-validation: each outer fold is held out once for testing, while
# GridSearchCV selects hyper-parameters with inner_cv on the remaining rows only.

# Hyper-parameter grid. It is the same for every outer fold, so build it once.
params = {'n_estimators': np.arange(100, 200, 10),
          'max_depth': np.arange(5, 30, 5)}

# Held-out R2 of every outer fold, kept so a summary can be printed afterwards.
outer_test_scores = []

for outerloop, (train_index, test_index) in enumerate(outer_cv.split(X, y), start=1):
    print('----------------------------')
    print('Loop: ', outerloop)

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Seed the forest: without random_state the scores (and possibly the selected
    # parameters) change on every run, so the results cannot be reproduced.
    rf = RandomForestRegressor(random_state=123)
    rf_grid = GridSearchCV(estimator=rf, param_grid=params, cv=inner_cv,
                           scoring='r2', n_jobs=-1)

    # With GridSearchCV's default refit=True, the best setting is refitted on all
    # of X_train and predict() uses that refitted model.
    rf_grid.fit(X_train, y_train)
    y_pred_test = rf_grid.predict(X_test)

    test_score = r2_score(y_test, y_pred_test)
    outer_test_scores.append(test_score)

    # Note: best_score_ is the mean inner-CV R2 of the selected setting, not a
    # score measured on the training rows themselves.
    print('best train parameter: ', rf_grid.best_params_)
    print('best train score: ', rf_grid.best_score_)
    print('\n')
    print('test score: ', test_score)
    print('----------------------------')

# Overall estimate of generalisation performance across the outer folds.
print('mean test score: ', np.mean(outer_test_scores), '+/-', np.std(outer_test_scores))

----------------------------
Loop:  1
best train parameter:  {'max_depth': 5, 'n_estimators': 180}
best train score:  0.05806873961429768


test score:  0.012148223080617915
----------------------------
----------------------------
Loop:  2
best train parameter:  {'max_depth': 5, 'n_estimators': 140}
best train score:  0.04943085353897436


test score:  0.07088049078143155
----------------------------
----------------------------
Loop:  3
best train parameter:  {'max_depth': 5, 'n_estimators': 130}
best train score:  0.04827127977764469


test score:  0.09319248255243484
----------------------------
----------------------------
Loop:  4
best train parameter:  {'max_depth': 5, 'n_estimators': 190}
best train score:  0.058543053320224955


test score:  0.03115169504143267
----------------------------
----------------------------
Loop:  5
best train parameter:  {'max_depth': 5, 'n_estimators': 100}
best train score:  0.05664955300089856


test score:  0.033588406476452226
----------------

##### All scores reported above are R² (coefficient of determination).