# Use GridSearchCV for hyperparameter tuning

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

In [2]:
# Read the full training table from disk; the path is relative to the
# notebook's working directory.
TRAINING_DATA_PATH = '../Datasets/training_data_full.csv'
training_data = pd.read_csv(TRAINING_DATA_PATH)

In [3]:
# Columns that must not be fed to the model as inputs.
non_numerical_features = set([
    'StateAbbr',
    'StateDesc',
    'CityName',
    'CountyFIPS',
    'TractFIPS'
])

# Column(s) the model is trained to predict.
target_features = set([
    'diabetes_change_rate_2020'
])

# Every remaining column is used as a model input.
# The list is built by walking `training_data.columns` instead of doing set
# arithmetic: iterating a set of strings yields an arbitrary order that can
# change from one interpreter session to the next, and a different column
# order can change what a seeded random forest learns. Keeping the frame's
# own column order makes the run repeatable after a kernel restart.
numerical_features = [
    column
    for column in training_data.columns
    if column not in non_numerical_features and column not in target_features
]

In [4]:
# Split the table into the regression target and the model inputs.
labels = training_data['diabetes_change_rate_2020']
features = training_data.loc[:, numerical_features]

In [5]:
# Number of input columns. `len(features)` would return the number of rows
# of the DataFrame, so the column count is read from `shape` instead.
num_features = features.shape[1]

In [6]:
# Search space for the first grid search: every combination of these
# values is evaluated.
parameters = dict(
    max_depth=[1, 3, 5, 7, 10, 15, 20],
    n_estimators=[50, 100, 200, 500, 750, 1000],
    # Integer entries are absolute feature counts per split.
    max_features=['sqrt', 'log2', 1, 3, 5, 10],
)

In [7]:
# Base estimator handed to the grid search; the fixed seed pins the
# forest's own randomness.
SEED = 42
rf = RandomForestRegressor(random_state=SEED)

In [8]:
# Exhaustive search over `parameters` with 5-fold cross-validation, run on
# 16 parallel workers. No `scoring` is given, so candidates are ranked by
# the estimator's own `score` method.
grid_search = GridSearchCV(rf, parameters, n_jobs=16, cv=5)

In [9]:
# Run the search: fits one forest per parameter combination per fold.
grid_search.fit(X=features, y=labels)



In [10]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_depth': 20, 'max_features': 1, 'n_estimators': 1000}

In [11]:
# Mean cross-validated score obtained by the best parameter combination
# (the estimator's default score, since no `scoring` was passed).
grid_search.best_score_

0.07597504855698096

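# Try again with larger `max_depth` and `n_estimators` values

The first search selected the largest `max_depth` and `n_estimators` it was offered, so the grid is extended upward around those values.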

In [None]:
# Second pass: widen the depth and tree-count ranges around the earlier
# best values while holding `max_features` fixed.
parameters = dict(
    max_depth=[18, 20, 30, 40],
    n_estimators=[800, 1000, 1200, 1400],
    max_features=[1],  # previously determined 1 was best
)

# Same search configuration as before: 5 folds, 16 parallel workers.
grid_search = GridSearchCV(rf, parameters, n_jobs=16, cv=5)

grid_search.fit(X=features, y=labels)

