## Finding optimal hyperparameters for XGBoost

In [17]:
import os
import sys
import numpy as np  
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import metrics   
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV

sys.path.append('..')

from Workflow import Workflow

In [18]:
wf = Workflow()
Xtrain, ytrain, Xvalid, yvalid, Xtest, ytest = wf.load_data()

In [19]:
def optimize_xgb_model(X, y, param_grid, cv=5, scoring='r2', n_jobs=-1, random_state=None):
    xgb = XGBRegressor()
    grid_search = HalvingGridSearchCV(
        estimator=xgb, 
        param_grid=param_grid, 
        cv=cv, 
        scoring=scoring, 
        n_jobs=n_jobs, 
        verbose=1, 
        aggressive_elimination=True, 
        random_state=random_state 
    )
    grid_search.fit(X, y)
    print(f'Best parameters found: {grid_search.best_params_}')
    print(f'Best score: {grid_search.best_score_}')
    return grid_search.best_estimator_

In [20]:
# Define a parameter grid for hyperparameter tuning
param_grid = {
        'n_estimators': [300, 350, 400],
        'max_depth': [8, 10, 12],
        'learning_rate': [0.05, 0.01, 0.05],
        'min_child_weight': [5, 7, 9],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.6, 0.7, 0.8],
        'gamma': [0.2],
        'reg_alpha': [0, 0.5, 1],
        'reg_lambda': [1.5, 2, 2.5]
    }


# Use the optimize_xgb_model function to find the best model
seed = 42
best_model = optimize_xgb_model(Xtrain, ytrain, param_grid, random_state=seed)

# Evaluate the best model on the training set
y_train_pred = best_model.predict(Xtrain)
train_r2 = metrics.r2_score(ytrain, y_train_pred)

# Evaluate the best model on the validation set
y_valid_pred = best_model.predict(Xvalid)
validation_rmse = np.sqrt(metrics.mean_squared_error(yvalid, y_valid_pred))
validation_r2 = metrics.r2_score(yvalid, y_valid_pred)

# Evaluate the best model on the test set
y_test_pred = best_model.predict(Xtest)
test_r2 = metrics.r2_score(ytest, y_test_pred)

# Print the results
print("Training R2 Score:", train_r2)
print("Validation RMSE:", validation_rmse)
print("Validation R2 Score:", validation_r2)
print("Test R2 Score:", test_r2)

n_iterations: 9
n_required_iterations: 9
n_possible_iterations: 9
min_resources_: 12
max_resources_: 78854
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 6561
n_resources: 12
Fitting 5 folds for each of 6561 candidates, totalling 32805 fits


1 fits failed out of a total of 32805.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\tobia\anaconda3\envs\dsaie\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\tobia\anaconda3\envs\dsaie\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "c:\Users\tobia\anaconda3\envs\dsaie\lib\site-packages\xgboost\sklearn.py", line 1081, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\tobia\anaconda3\envs\dsaie\lib\site-packages\xgboost\sklearn.py", line 596, in _wrap_evaluation_matrices


----------
iter: 1
n_candidates: 2187
n_resources: 36
Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


   -22.95043106   -19.97649109]
  0.88020156]


----------
iter: 2
n_candidates: 729
n_resources: 108
Fitting 5 folds for each of 729 candidates, totalling 3645 fits


  2.56138264e-01  2.56138264e-01]
  0.5960771 ]


----------
iter: 3
n_candidates: 243
n_resources: 324
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


  4.39188609e-01  4.37322984e-01]
  0.84142755]


----------
iter: 4
n_candidates: 81
n_resources: 972
Fitting 5 folds for each of 81 candidates, totalling 405 fits


  5.73698410e-01  5.73795814e-01]
  0.84652089]


----------
iter: 5
n_candidates: 27
n_resources: 2916
Fitting 5 folds for each of 27 candidates, totalling 135 fits


  6.21683975e-01  6.21840422e-01]
  0.88165181]


----------
iter: 6
n_candidates: 9
n_resources: 8748
Fitting 5 folds for each of 9 candidates, totalling 45 fits


  6.49882845e-01  6.49579472e-01]
  0.86433875]


----------
iter: 7
n_candidates: 3
n_resources: 26244
Fitting 5 folds for each of 3 candidates, totalling 15 fits


  6.81594443e-01  6.81552347e-01]
  0.82366037]


----------
iter: 8
n_candidates: 1
n_resources: 78732
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  6.81552347e-01  7.22500394e-01]
  0.81426578]


Best parameters found: {'colsample_bytree': 0.6, 'gamma': 0.2, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 5, 'n_estimators': 400, 'reg_alpha': 0, 'reg_lambda': 2.5, 'subsample': 0.6}
Best score: 0.722500393727534
Training R2 Score: 0.8101913955918423
Validation RMSE: 53.888463069568736
Validation R2 Score: 0.6966556460722328
Test R2 Score: 0.7335783884186793
