In [55]:
# General libraries
import pandas as pd
import numpy as np

# Scikit Learn libraries
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# Scipy libraries
from scipy import stats
import joblib
import warnings
warnings.filterwarnings('ignore')

# Utils functions
from utils.utils import kfold, five_two, read_datasets

# Load Dataset

In [2]:
x_train, x_test, y_train, y_test = read_datasets(
    'x_train.csv',
    'x_test.csv',
    'y_train.csv',
    'y_test.csv'
)

# Gradient Boosting regression - XGBoost

In [3]:
# The cross validation scheme to be used for train and test
folds = kfold()

## GridSearch

In [4]:
hyper_params = {
    'colsample_bytree': [0.1, 0.2, 0.3],
    'n_estimators': [10, 20, 25, 50], #75, 100
    'subsample': [0.5, 0.6, 0.7],
    'max_depth': [1, 2, 3],
    'booster': ['gbtree', 'dart'],
    'objective': ["reg:squarederror"],
    'eta': [0.1, 0.2, 0.3, 0.4, 0.9],
    'gamma': [0, 0.3, 0.5],
    'min_child_weight': [2, 3, 4],
    'max_delta_step':[0, 1],
    'reg_lambda': [0.3, 0.5, 0.7],
    'alpha': [0.2, 0.3, 0.4],
    'tree_method': ['approx', 'auto', 'exact', 'gpu_hist', 'hist'],
    'max_leaves': [0, 1, 3, 5, 8],
    'eval_metric': ['rmse'],
    'base_score': [0.1, 0.3, 0.4, 0.5],
    'grow_policy': ['depthwise'],
    'refresh_leaf': [0,1],
    'sampling_method': ['uniform'],
    'colsample_bylevel': [0.1, 0.2, 0.5, 1],
    'colsample_bynode': [0.1, 0.2, 0.5, 1],
    'colsample_bytree': [0.1, 0.2, 0.4, 1],
    'updater': ['grow_colmaker', 'grow_histmaker', 'grow_quantile_histmaker', 'grow_gpu_hist', 'sync', 'refresh', 'prune']
}




# Call RandomizedSearchCV()
model_cv = RandomizedSearchCV(
    estimator = xgb.XGBRegressor(),
    param_distributions = hyper_params,
    n_iter=15000,
    scoring= 'r2',
    cv = folds,
    verbose = 2,
    return_train_score=True,
    n_jobs = -1,
    refit = True
    )


# Fit the model
best_model = model_cv.fit(x_train, np.ravel(y_train)) 

print(model_cv.best_params_)

Fitting 10 folds for each of 15000 candidates, totalling 150000 fits
{'updater': 'grow_histmaker', 'tree_method': 'exact', 'subsample': 0.7, 'sampling_method': 'uniform', 'reg_lambda': 0.3, 'refresh_leaf': 0, 'objective': 'reg:squarederror', 'n_estimators': 50, 'min_child_weight': 3, 'max_leaves': 8, 'max_depth': 1, 'max_delta_step': 0, 'grow_policy': 'depthwise', 'gamma': 0.5, 'eval_metric': 'rmse', 'eta': 0.2, 'colsample_bytree': 1, 'colsample_bynode': 0.5, 'colsample_bylevel': 0.5, 'booster': 'gbtree', 'base_score': 0.5, 'alpha': 0.4}


In [26]:
# Create new model with best_params_ from grid search

xgboost_best = best_model.best_estimator_

In [56]:
# Get the results for each split

def get_best_model_cv_split_results(best_model, n_splits=10, set_type='train'):
    results = []
    for i in range(0, n_splits):
        current_split = 'split{}_{}_score'.format(i, set_type)
        split_result = best_model.cv_results_[current_split][best_index]
        results.append(split_result)

    return results

print("Train Results: {}".format(get_best_model_cv_split_results(best_model, 10, 'train')))
print("Test Results: {}".format(get_best_model_cv_split_results(best_model, 10, 'test')))

Train Results: [0.3713780621407403, 0.36798204348901953, 0.35423043336248083, 0.35625547914772504, 0.37367218682338166, 0.3996905969800387, 0.3726975372339837, 0.3766437718662281, 0.3738449039230266, 0.3586751501868487]
Test Results: [0.23531106489913378, 0.17146703312728806, 0.2995418880574914, 0.2906989877096088, 0.24810060401652845, -0.12393422762110728, 0.17751238491874788, 0.24623741719377834, 0.13510803796147353, 0.3306688216210407]


In [62]:
#Get the mean for the train and test

train_mean = sum(get_best_model_cv_split_results(best_model, 10, 'train'))/10
test_mean = sum(get_best_model_cv_split_results(best_model, 10, 'test'))/10

print("Train mean: {}".format(train_mean))
print("Test mean: {}".format(test_mean))

Train mean: 0.3705070165153473
Test mean: 0.20107120118839833


In [31]:
r2 = r2_score(y_test, xgboost_best.predict(x_test))
print("The r2 score on test set: {:.4f}".format(r2))

The r2 score on test set: 0.2677


# Saving trained model

In [32]:
filename = '../models/xgboost_model.joblib'
joblib.dump(xgboost_best, filename)

['../models/xgboost_model.joblib']

## Notes

* Create 10 folds
* Set grid for the RandomSearchCV
* Search and fit the model with best params
* Get the results (r2) for each train and test results for training data
* Get the result (r2) for test data