# Use GridSearchCV for hyperparameter tuning

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

In [2]:
# Load the full training table from its CSV file into a DataFrame.
training_csv_path = '../Datasets/training_data_full.csv'
training_data = pd.read_csv(training_csv_path)

In [3]:
# Identifier / descriptive columns that are excluded from the model inputs.
non_numerical_features = set([
    'StateAbbr',
    'StateDesc',
    'CityName',
    'CountyFIPS',
    'TractFIPS'
])

# Column(s) the model is trained to predict; also excluded from the inputs.
target_features = set([
    'DIABETES_3Y_Change_Percentage'
])

# Every remaining column becomes a model input.
# The list is built by walking training_data.columns in order rather than by
# subtracting sets: iteration order of a set of strings can change from one
# interpreter session to the next (hash randomization), which would shuffle
# the feature columns on every kernel restart and make the seeded
# (random_state=42) forest results non-reproducible.
excluded_columns = non_numerical_features | target_features
numerical_features = [
    column
    for column in training_data.columns
    if column not in excluded_columns
]

In [4]:
# Split the table into the regression target and the model inputs.
target_column = 'DIABETES_3Y_Change_Percentage'
labels = training_data[target_column]
features = training_data[numerical_features]

In [5]:
# Number of input columns (features). len() on a DataFrame counts rows
# (samples), so the column count is read from the second entry of .shape.
num_features = features.shape[1]

In [6]:
# Coarse first-pass search space for the random forest; each key is a
# RandomForestRegressor hyperparameter and each list holds its candidate values.
parameters = dict(
    max_depth=[1, 5, 20, 50, 100],
    n_estimators=[100, 500, 1000, 1500],
    max_features=['sqrt', 'log2', 1, 3, 5, 10],
)

In [7]:
# Base estimator for the search; the fixed seed keeps the forest's own
# randomness repeatable between runs.
forest_seed = 42
rf = RandomForestRegressor(random_state=forest_seed)

In [8]:
# Exhaustive search over `parameters` with 5-fold cross-validation,
# run on 8 parallel workers.
search_settings = {'cv': 5, 'n_jobs': 8}
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, **search_settings)

In [None]:
# Run the coarse search; this fits a forest for every parameter combination
# and fold, so expect it to be the slow cell of the notebook.
grid_search.fit(X=features, y=labels)



In [None]:
# Show the hyperparameter combination that scored best in the coarse search.
grid_search.best_params_

In [None]:
# Show the mean cross-validated score of the best combination. No `scoring`
# was passed to GridSearchCV, so this is the estimator's default score
# (presumably R^2 for a regressor -- confirm against the scikit-learn docs).
grid_search.best_score_

# Fine-tune in the vicinity of the best parameters from the previous grid search

In [None]:
# Narrower second-pass search space. Note that `parameters`, `rf` and
# `grid_search` are rebound here, replacing the objects from the first search.
parameters = dict(
    max_depth=[23, 25, 30, 35, 40],
    n_estimators=[1000, 1200, 1400],
    max_features=[1, 2],
)

# Fresh forest with the same fixed seed as the first pass.
rf = RandomForestRegressor(random_state=42)

# Same search configuration as before: 5-fold CV on 8 parallel workers.
grid_search = GridSearchCV(rf, param_grid=parameters, cv=5, n_jobs=8)

grid_search.fit(features, labels)

In [None]:
# Show the best hyperparameter combination found by the fine-tuning search
# (`grid_search` now refers to the second, narrower search).
grid_search.best_params_