# RandomForestRegressor Grid Search CV

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%run -i ./Model_Eval.ipynb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score

In [2]:
training_data = pd.read_csv('../Datasets/training_data_full.csv').sample(frac=1)

In [3]:
# import from Model_Eval.ipynb
numerical_features = get_numerical_features(training_data)

X = training_data[numerical_features]
Y = training_data['DIABETES_3Y_Change_Percentage']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [4]:
parameters = {
    'max_depth': [1, 5, 20, 50, 100],
    'n_estimators': [100, 500, 1000, 1500],
    'max_features': ['sqrt', 'log2', 1, 3, 5, 10]
}

In [5]:
rf = RandomForestRegressor(random_state = 42)

In [6]:
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    cv=5,
    n_jobs=16,
)

In [7]:
grid_search.fit(X, Y)



In [8]:
best_params = grid_search.best_params_
best_params

{'max_depth': 100, 'max_features': 'sqrt', 'n_estimators': 1000}

# Get Cross Val R2 scores

In [None]:
rf = RandomForestRegressor(
    max_depth=best_params['max_depth'],
    max_features=best_params['max_features'],
    n_estimators=best_params['n_estimators']
)

r2_scores = cross_val_score(rf, X, Y, cv=5, scoring='r2')

In [10]:
np.mean(r2_scores)

0.24315108058540505