# XG Boost Grid Search

In [20]:
import pandas as pd
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
%run -i ./Model_Eval.ipynb

In [2]:
training_data = pd.read_csv('../Datasets/training_data_full.csv')

In [3]:
# import from Model_Eval.ipynb
numerical_features = get_numerical_features(training_data)

In [4]:
features = training_data[numerical_features]
labels = training_data['DIABETES_3Y_Change_Percentage']

X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [5]:
xgb_model = XGBRegressor(use_rmm=True)

In [6]:
parameters = {
    'max_depth': [1,3,5,7,9],
    'eta': [0.1,0.3,0.5],
    'n_estimators': [50,200,400,1000]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    cv=5,
    n_jobs=8,
    verbose=2
)

In [7]:
grid_search.fit(features, labels)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [8]:
grid_search.best_params_

{'eta': 0.1, 'max_depth': 3, 'n_estimators': 50}

# Retry, bring down the min values for eta & n_estimators

In [11]:
xgb_model = XGBRegressor(use_rmm=True)

parameters = {
    'max_depth': [3,5,7],
    'eta': [0.005, 0.01, 0.05],
    'n_estimators': [40, 50, 60, 100]
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    cv=5,
    n_jobs=8,
    verbose=2
)

grid_search.fit(features, labels)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [16]:
print(grid_search.best_params_)

{'eta': 0.01, 'max_depth': 5, 'n_estimators': 60}


# Test out the best parameters

In [28]:
xgb_model = XGBRegressor(use_rmm=True, eta=0.01, max_depth=5, n_estimators=60)

In [33]:
scores = cross_val_score(xgb_model, features, labels, cv=10)

In [34]:
scores

array([-0.03481296, -0.02876599,  0.04498102,  0.05432874, -0.04195908,
        0.01617983, -0.27772661,  0.03264957, -0.00122172,  0.00454105])

In [45]:
xgb_model = XGBRegressor(use_rmm=True, eta=0.01, max_depth=5, n_estimators=150)
scores = cross_val_score(xgb_model, features, labels, cv=10, scoring='r2')

In [46]:
scores

array([-1.27888861e-01, -2.35672792e-02,  9.86327647e-02,  9.16118575e-02,
       -1.12987344e-01,  3.73820885e-02, -4.14530742e-01,  6.08494894e-02,
        3.69554380e-04, -4.90888057e-02])

In [None]:
xgb_model = XGBRegressor(use_rmm=True, eta=0.01, max_depth=5, n_estimators=1000)
xgb_model.fit(X_train, Y_train)
test_pred = xgb_model.predict(X_test)
r2_score(Y_test, test_pred)