### GXB Hypter-parameter Tuning

1. All variables, no resampling

2. All variables, resampling

3. 12 variables, no resampling

4. 12 variables, resampling



In [1]:
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBRegressor
# from sklearn.metrics import accuracy_score, confusion_matrix, auc
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
import seaborn as sns
import pickle
%matplotlib inline

In [2]:
train_preprocessed = pd.read_csv('data/train_preprocessed.csv')

In [3]:
# Randomly, split the data into test/training/validation sets
x = train_preprocessed.drop(['prop_booking_bool'], axis=1)
y = train_preprocessed['prop_booking_bool']

x_train, x_test, y_train, y_test  = train_test_split(x, y, test_size=0.33, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(563547, 59) (277568, 59) (563547,) (277568,)


In [4]:
 # shape of the dataset 
print('Shape of training data inputs (IVs):', x_train.shape)
print('Shape of testing data inputs (IVs) :', x_test.shape) 

Shape of training data inputs (IVs): (563547, 59)
Shape of testing data inputs (IVs) : (277568, 59)


In [5]:
# First try, parameter tuning for XGB
# tune for MAX_DEPTH & MIN_CHILD_WEIGHT
objective = "rank:ndcg"
seed = 123
n_estimators = 100
learning_rate = 0.1
gamma = 0.1
subsample = 0.8
colsample_bytree = 0.8
reg_alpha = 1
reg_lambda = 1

args = {}
args['objective'] = objective
args['seed'] = seed
args['n_estimators'] = n_estimators
args['learning_rate'] = learning_rate
args['gamma'] = gamma
args['colsample_bytree'] = colsample_bytree
args['reg_alpha'] = reg_alpha
args['reg_lambda'] = reg_lambda

scores = []

cv_params = {'max_depth': [2,4,6,8],
             'min_child_weight': [1,3,5,7]
            }

gbm = GridSearchCV(xgboost.XGBRegressor(**args),
                    param_grid = cv_params,
                    scoring = "neg_mean_squared_error",
                    cv = 5,
                    verbose = True
)

gbm.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 15.7min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None,
                                    objective='rank:ndcg', random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                       

In [7]:
GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=np.nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_child_weight': [1, 3, 5, 7]},
             scoring='neg_mean_squared_error', verbose=True)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
            

In [8]:
# print(gbm.cv_results_)
print("Best parameters %s" %gbm.best_params_)
print("Best score %s" %gbm.best_score_)

Best parameters {'max_depth': 2, 'min_child_weight': 1}
Best score -0.25


In [9]:
# Refine MAX_DEPTH & MIN_CHILD_WEIGHT with smaller grid of values, based on best performer from previous step
# MIN_CHILD_WEIGHT = min no. samples (if all samples have a weight 1) required to create a new node in the tree.
max_depth = gbm.best_params_['max_depth']
min_child_weight = gbm.best_params_['min_child_weight']
args['max_depth'] = max_depth
args['min_child_weight'] = min_child_weight
scores.append(gbm.best_score_)

cv_params = {'max_depth': [max_depth-1, max_depth, max_depth+1], 
             'min_child_weight': [min_child_weight-1, min_child_weight-0.5, min_child_weight, min_child_weight+0.5, min_child_weight+1]
            }

gbm = GridSearchCV(xgboost.XGBRegressor(**args),
                    param_grid = cv_params,
                    iid = False,
                    scoring = "neg_mean_squared_error",
                    cv = 5,
                    verbose = True
)

gbm.fit(x_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 18.8min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=2, min_child_weight=1,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None,
                                    objective='rank:ndcg', random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                             

In [11]:
 GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=8, min_child_weight=3,
                                    missing=np.nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             iid=False,
             param_grid={'max_depth': [7, 8, 9],
                         'min_child_weight': [2, 2.5, 3, 3.5, 4]},
             scoring='neg_mean_squared_error', verbose=True)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=8, min_child_weight=3,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
                  

In [12]:
# Set MAX_DEPTH & MIN_CHILD_WEIGHT
# tune for GAMMA (how fiercely the tree gets pruned. balance against lambda)
max_depth = gbm.best_params_['max_depth']
min_child_weight = gbm.best_params_['min_child_weight'] # overwritten to be conservative & avoid overfitting
args['max_depth'] = max_depth
args['min_child_weight'] = min_child_weight
scores.append(gbm.best_score_)

cv_params = {'gamma': [i/10.0 for i in range(0, 10, 2)]}

gbm = GridSearchCV(xgboost.XGBRegressor(**args),
                    param_grid = cv_params,
                    scoring = "neg_mean_squared_error",
                    cv = 5,
                    verbose = True
)

gbm.fit(x_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  4.6min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=1, min_child_weight=0,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None,
                                    objective='rank:ndcg', random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                             

In [13]:
 GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=np.nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             param_grid={'gamma': [0.0, 0.2, 0.4, 0.6, 0.8]},
             scoring='neg_mean_squared_error', verbose=True)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.1, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
                

In [14]:
# Set the GAMMA parameter and tune the SUBSAMPLE & COLSAMPLE_BYTREE parameters next
# (these control the sampling of the dataset that is done at each boosting round) SUBSAMPLE = fraction of rows. COLSAMPLE_BYTREE = fraction of cols
gamma = gbm.best_params_['gamma']
args['gamma'] = gamma
scores.append(gbm.best_score_)

cv_params = {'subsample': [i/10.0 for i in range(6,11)],
             'colsample_bytree': [i/10.0 for i in range(6,11)]
            }

gbm = GridSearchCV(xgboost.XGBRegressor(**args),
                    param_grid = cv_params,
                    scoring = "neg_mean_squared_error",
                    cv = 5,
                    verbose = True
)

gbm.fit(x_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 26.8min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=1, min_child_weight=0,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None,
                                    objective='rank:ndcg', random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                             

In [15]:
GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=np.nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             param_grid={'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
                         'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]},
             scoring='neg_mean_squared_error', verbose=True)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=None, tree_method=None,
                

In [16]:
# Retune SUBSAMPLE & COLSAMPLE_BYTREE with a smaller grid of values based on best values above
subsample = gbm.best_params_['subsample']
colsample_bytree = gbm.best_params_['colsample_bytree']
args['subsample'] = subsample
args['colsample_bytree'] = colsample_bytree
scores.append(gbm.best_score_)

cv_params = {'subsample': [i/100.0 for i in range(int((subsample-0.1)*100.0), min(int((subsample+0.1)*100),105) , 5)],
             'colsample_bytree': [i/100.0 for i in range(int((colsample_bytree-0.1)*100.0), min(int((subsample+0.1)*100),105), 5)]
            }

gbm = GridSearchCV(xgboost.XGBRegressor(**args),
                   
                    param_grid = cv_params,
                    iid = False,
                    scoring = "neg_mean_squared_error",
                    cv = 5,
                    verbose = True
)

gbm.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 16.5min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.6,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=1, min_child_weight=0,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None,
                                    objective='rank:ndcg', random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                             

In [20]:
GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.6,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=np.nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=0.8, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             iid=False,
             param_grid={'colsample_bytree': [0.5, 0.55, 0.6, 0.65, 0.7, 0.75,
                                              0.8, 0.85],
                         'subsample': [0.7, 0.75, 0.8, 0.85]},
             scoring='neg_mean_squared_error', verbose=True)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.6,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=0.8, tree_method=None,
                 

In [21]:
# Set the SUBSAMPLE & COLSAMPLE_BYTREE parameters 
# tune REG_ALPHA & REG_LAMBDA (regularisation params)
colsample_bytree = gbm.best_params_['colsample_bytree']
subsample = gbm.best_params_['subsample']
args['colsample_bytree'] = colsample_bytree
args['subsample'] = subsample
scores.append(gbm.best_score_)

cv_params = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100], 
             'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
            }

gbm = GridSearchCV(xgboost.XGBRegressor(**args),
                    param_grid = cv_params,
                    scoring = "neg_mean_squared_error",
                    cv = 5,
                    verbose = True
)

gbm.fit(x_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 34.8min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=1, min_child_weight=0,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None,
                                    objective='rank:ndcg', random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                             

In [22]:
 GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.6,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=np.nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=0.8, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             param_grid={'reg_alpha': [1e-05, 0.01, 0.1, 1, 100],
                         'reg_lambda': [1e-05, 0.01, 0.1, 1, 100]},
             scoring='neg_mean_squared_error', verbose=True)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.6,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=0.8, tree_method=None,
                 

In [23]:
# Retune REG_ALPHA & REG_LAMBDA with a smaller grid of values based on best values above
reg_alpha = gbm.best_params_['reg_alpha']
reg_lambda = gbm.best_params_['reg_lambda']
args['reg_alpha'] = reg_alpha
args['reg_lambda'] = reg_lambda
scores.append(gbm.best_score_)

cv_params = {'reg_lambda': [reg_alpha*0.2, reg_alpha*0.5, reg_alpha, reg_alpha*2, reg_alpha*5], 
             'reg_alpha': [reg_lambda*0.2, reg_lambda*0.5, reg_lambda, reg_lambda*2, reg_lambda*5]
            }

gbm = GridSearchCV(xgboost.XGBRegressor(**args),
                    param_grid = cv_params,
                    scoring = "neg_mean_squared_error",
                    cv = 5,
                    verbose = True
)

gbm.fit(x_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed: 26.0min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.5,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=1, min_child_weight=0,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None,
                                    objective='rank:ndcg', random_state=None,
                                    reg_alpha=0.01, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                          

In [24]:
GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.6,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=np.nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=0.8, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             param_grid={'reg_alpha': [0.2, 0.5, 1, 2, 5],
                         'reg_lambda': [0.2, 0.5, 1, 2, 5]},
             scoring='neg_mean_squared_error', verbose=True)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.6,
                                    gamma=0.0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=7, min_child_weight=2.5,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=1, reg_lambda=1,
                                    scale_pos_weight=None, seed=123,
                                    subsample=0.8, tree_method=None,
                 

In [25]:
# print(gbm.cv_results_)
print("Best parameters %s" %gbm.best_params_)
print("Best score %s" %gbm.best_score_)

Best parameters {'reg_alpha': 0.2, 'reg_lambda': 0.002}
Best score -0.25


In [26]:
# Set the regularisationparameters: REG_ALPHA & REG_LAMBDA 
# (then increase trees & reduce learning rate)
reg_alpha = gbm.best_params_['reg_alpha']
reg_lambda = gbm.best_params_['reg_lambda']
args['reg_alpha'] = reg_alpha
args['reg_lambda'] = reg_lambda
scores.append(gbm.best_score_)

In [40]:
print(args)
print(scores)

{'objective': 'rank:ndcg', 'seed': 123, 'n_estimators': 100, 'learning_rate': 0.1, 'gamma': 0.0, 'colsample_bytree': 0.5, 'reg_alpha': 0.2, 'reg_lambda': 0.002, 'max_depth': 3, 'min_child_weight': 0, 'subsample': 0.5, 'eta': 0.05, 'eval_metric': 'rmse'}
[-0.25, -0.25, -0.25, -0.25, -0.25, -0.09883558710896737, -0.25]


In [41]:
# Create XGBoost's DMatrix - for finding the best tree from CV & for final model
trainDMat = xgboost.DMatrix(data = x_train, label = y_train)
testDMat = xgboost.DMatrix(data = x_test, label = y_test)

In [42]:
# CV for finding best tree
# Lower the learning_rate & set a large num_boost_round to ensure convergence. 
# (If convergence is slow, retry with a slightly higher learning rate, i.e. weight for each new tree)
learning_rate = 0.05
args['eta'] = learning_rate

num_boost_round = 3000
early_stopping_rounds = 20

xgbCV = xgboost.cv(
    params = args, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    nfold = 5,
    metrics = {'rmse', 'logloss'},
    early_stopping_rounds = early_stopping_rounds,
    verbose_eval = True,
    seed = seed     
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings bu

In [43]:
# Final XGBoost model (final booster object uses best tree from CV)
num_boost_round = len(xgbCV)
args['eval_metric'] = 'rmse'

xgbFinal = xgboost.train(
    params = args, 
    dtrain = trainDMat, 
    num_boost_round = num_boost_round,
    evals = [(trainDMat, 'train'), 
             (testDMat, 'eval')]
)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:0.50000	eval-rmse:0.50000


In [32]:
# Visualise feature importance
xgboost.plot_importance(xgbFinal)

ValueError: Booster.get_score() results in empty.  This maybe caused by having all trees as decision dumps.

##### Predict using the XGB model

In [None]:
# Calc train & test preds, as well as MSE and RMSE
xgbFinal_train_preds = xgbFinal.predict(trainDMat)
xgbFinal_test_preds = xgbFinal.predict(testDMat)

In [None]:
# Calc train & test preds, as well as MSE and RMSE
xgbFinal_train_preds = xgbFinal.predict(trainDMat)
xgbFinal_test_preds = xgbFinal.predict(testDMat)

In [None]:
 print("\nModel Report")
print("MSE Train : %f" % metrics.mean_squared_error(y_train, xgbFinal_train_preds))
print("MSE Test: %f" % metrics.mean_squared_error(y_test, xgbFinal_test_preds))
print("RMSE Train: %f" % metrics.mean_squared_error(y_train, xgbFinal_train_preds)**0.5)
print("RMSE Test: %f" % metrics.mean_squared_error(y_test, xgbFinal_test_preds)**0.5)

print("\nFrom Test Preds: quality score average %f & std dev %f" % (np.mean(xgbFinal_test_preds), np.std(xgbFinal_test_preds)))
# print("\nFrom Train set: quality score average %f & std dev %f" % (np.mean(y_train), np.std(y_train)))
# print("From Test set: quality score average %f & std dev %f" % (np.mean(y_test), np.std(y_test)))
print("From ALL data: quality score average %f & std dev %f" % (np.mean(y), np.std(y)))

##### Validate results with correlation matrix

In [None]:
 # Create df of TRAIN data & predictions
train_corr_df = survey_df_aug.loc[y_train.index][['Overal Mark 1', 'Overal Mark 2', 'Final mark']]
train_pred_corr_df = pd.concat([train_corr_df, pd.Series(xgbFinal_train_preds, name = 'Preds')], axis = 1).dropna()
# train_pred_corr_df.head(50)

In [None]:
# Calculate & visualise correlations between variables
# calculate the correlation matrix
train_pred_corr = train_pred_corr_df.corr()

# plot the heatmap
sns.heatmap(train_pred_corr, 
        xticklabels = train_pred_corr.columns,
        yticklabels = train_pred_corr.columns)

In [None]:
train_pred_corr

In [None]:
 # Create df of TEST data & predictions
test_corr_df = survey_df_aug.loc[y_test.index][['Overal Mark 1', 'Overal Mark 2', 'Final mark']]
test_pred_corr_df = pd.concat([test_corr_df, pd.Series(xgbFinal_test_preds, name = 'Preds')], axis = 1).dropna()
# test_pred_corr_df.head(50)

In [None]:
# visualise correlations between variables
# calculate the correlation matrix
test_pred_corr = test_pred_corr_df.corr()

# plot the heatmap
sns.heatmap(test_pred_corr, 
        xticklabels = test_pred_corr_df.columns,
        yticklabels = test_pred_corr_df.columns)

In [None]:
test_pred_corr

##### Save the model & results

In [None]:
# Import pickle, & save final model to wd
pickle.dump(xgbFinal, open("models/xgbFinal.pickle.dat", "wb"))