In [1]:
# General libraries
import pandas as pd
import numpy as np

# Modelling libraries (XGBoost and scikit-learn)
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

# SciPy, model persistence (joblib) and warning control
from scipy import stats
import joblib
import warnings
warnings.filterwarnings('ignore')

# Utils functions
from utils.utils import kfold, five_two, read_datasets

# Load Dataset

In [2]:
# Load the four pre-split CSV files through the project's `read_datasets`
# helper; the results are used below as training/test inputs and targets.
x_train, x_test, y_train, y_test = read_datasets(
    'x_train.csv', 'x_test.csv', 'y_train.csv', 'y_test.csv'
)

# Gradient Boosting regression - XGBoost

In [3]:
# Build the cross-validation splitter with the project's `kfold` helper,
# called with its default arguments. The resulting `folds` object is passed
# as `cv` to the hyper-parameter search below, so every candidate is scored
# on the same splitting scheme.
# NOTE(review): whether `kfold()` shuffles or fixes a seed is defined in
# utils.utils and is not visible here - confirm if reproducibility matters.
folds = kfold()

## Randomized hyper-parameter search

In [4]:
# Seed for the randomized search. Without it, RandomizedSearchCV draws a
# different set of candidates on every run, so the reported best_params_
# (and everything downstream) would not survive Restart & Run All.
RANDOM_STATE = 42

# Candidate values for each hyper-parameter. The search samples `n_iter`
# combinations from this grid rather than evaluating all of them.
hyper_params = {
    'colsample_bytree': [0.2, 0.3, 0.4],
    'n_estimators': [25, 50],
    'subsample': [0.5, 0.8, 1],
    'max_depth': [1, 2, 3, 6],
    'booster': ['gbtree'],
    'objective': ["reg:squarederror"],
    # `eta` is XGBoost's alias for `learning_rate`; tune only one of the two.
    'eta': [0.3, 0.4, 0.5, 0.6],
    'gamma': [0, 1],
    'min_child_weight': [1, 3, 5],
    'max_delta_step': [0, 1, 5],
    'reg_lambda': [1],
    # `alpha` is XGBoost's alias for `reg_alpha` (L1 regularisation).
    'alpha': [0],
    'tree_method': ['auto', 'exact'],
    'max_leaves': [0, 1, 5],
    # NOTE(review): candidates are ranked by `scoring='r2'` below, not by
    # `eval_metric` - confirm this dimension is worth searching.
    'eval_metric': ['mae', 'rmse'],
    'base_score': [0.4, 0.5, 0.7],
    'grow_policy': ['depthwise']
}

# Randomized (not exhaustive) search: 200 sampled candidates, each scored by
# R^2 under the `folds` cross-validation scheme, using all available cores.
# refit=True retrains the best candidate on the full training set afterwards.
model_cv = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(),
    param_distributions=hyper_params,
    n_iter=200,
    scoring='r2',
    cv=folds,
    verbose=2,
    return_train_score=True,
    n_jobs=-1,
    refit=True,
    random_state=RANDOM_STATE,
)

# Run the search. The target is flattened to 1-D with np.ravel before fitting.
# Note that fit() returns the search object itself, so `best_model` is the
# fitted RandomizedSearchCV (same object as `model_cv`), not a bare regressor.
best_model = model_cv.fit(x_train, np.ravel(y_train))

print(model_cv.best_params_)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
{'tree_method': 'auto', 'subsample': 0.5, 'reg_lambda': 1, 'objective': 'reg:squarederror', 'n_estimators': 50, 'min_child_weight': 3, 'max_leaves': 1, 'max_depth': 6, 'max_delta_step': 5, 'grow_policy': 'depthwise', 'gamma': 1, 'eval_metric': 'mae', 'eta': 0.4, 'colsample_bytree': 0.2, 'booster': 'gbtree', 'base_score': 0.4, 'alpha': 0}


In [5]:
# Build a standalone regressor from the winning configuration of the search.
# Unpacking best_params_ passes exactly the hyper-parameters that were tuned,
# so this cell cannot drift out of sync with the search grid (no hand-copied
# key list to maintain, no KeyError when a parameter is added or removed).
xgboost_best = xgb.XGBRegressor(**model_cv.best_params_)

# Train on the full training set. The target is flattened with np.ravel,
# matching how it was passed to the search's fit() call.
# The fitted estimator is the cell's last expression, so its repr is displayed.
xgboost_best.fit(x_train, np.ravel(y_train))

XGBRegressor(alpha=0, base_score=0.4, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.2,
             early_stopping_rounds=None, enable_categorical=False, eta=0.4,
             eval_metric='mae', gamma=1, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.400000006, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=5, max_depth=6, max_leaves=1, min_child_weight=3,
             missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [6]:
# Score the tuned model on the test split: predict first, then compare the
# predictions with the true targets using the R^2 metric.
test_predictions = xgboost_best.predict(x_test)
r2 = r2_score(y_test, test_predictions)
print("The r2 score on test set: {:.4f}".format(r2))

The r2 score on test set: 0.2899


# Saving trained model

In [7]:
# Persist the fitted regressor with joblib so it can be reloaded later
# without retraining. The path is relative to the notebook's working directory.
filename = '../models/xgboost_model.joblib'
joblib.dump(value=xgboost_best, filename=filename)

['../models/xgboost_model.joblib']