In [1]:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
import sys
sys.path.append('../')
from helper import *

from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split

import matplotlib.pyplot as plt

runs = pd.read_pickle("../Data/main_1.df")

#FEATURES = ['win_odds', 'horse_race_count', 'place_odds', 'best_odds', 'best_going_record', 'best_horse_record']
FEATURES = ['horse_no', 'horse_rating', 'declared_weight', 'actual_weight', 'win_odds', 'draw', 'race_size', 
            'last_race_result', 'win_percent', 'avg_distance_time', 'going_type_record', 'horse_race_count', 
            'horse_record', 'surface_record', 'place_odds', 'weight_change_from_average', 'venue_change', 
            'venue_record', 'days_since_last_race', 'new_horse', 'best_odds', 'best_win_percent', 
            'best_going_record', 'best_horse_record', 'best_jockey_record', 'best_trainer_record', 
            'highest_actual_weight', 'lowest_actual_weight', 'start_speed', 'rode_before']

TARGET = "won"

In [2]:
### PARAM TUNING STEPS:
# find best booster type

# set learning rate and find optimal n_estimators

# Tune max_depth and min_child_weight

# Tune gamma

# Tune subsample and colsample_bytree

# Regularization params

# Lower learning rate and add more trees

In [3]:
def trainModel(model, grid, x, y, n_splits=4, n_jobs=4, verbose=2):
    
    cv = KFold(n_splits=n_splits, shuffle=True)
    
    gSearch = GridSearchCV(estimator=model, param_grid=grid, n_jobs=n_jobs, cv=cv, scoring="accuracy", verbose=verbose)
    
    gSearch.fit(x, y)
    
    print("Best Params: ", gSearch.best_params_)
    print("Score: {}".format(gSearch.best_score_))
    
    return gSearch

def testModel(search, x, y):
    print("Determining Test Accuracy...")
    preds = search.predict(x)
    score = accuracy_score(y, preds)
    print("{:.2f}% accuracy".format(score*100))

In [4]:
# Baseline accuracy - 0.9183606753676548
xgb_bl = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500
)
grid_bl = {
    "silent":[1]
}
#search_bl = trainModel(xgb_bl, grid_bl, runs[FEATURES], runs[TARGET])

In [5]:
# BOOSTER TYPE
xgb_1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500
)

grid_1 = {
    'booster':["gbtree", "gblinear"]
}
#search_1 = trainModel(xgb_1, grid_1, runs[FEATURES], runs[TARGET])
#best_booster=search_1.best_params_["booster"]
best_booster = 'gblinear'

In [6]:
# max_depth and min_child_weight
xgb_2_a = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500,
    booster=best_booster
)

grid_2_a = {
    'max_depth':range(1,11)
}
#search_2_a = trainModel(xgb_2_a, grid_2_a, runs[FEATURES], runs[TARGET], verbose=5)
#best_max_depth = search_2_a.best_params_["max_depth"]
best_max_depth = 1

In [7]:
# max_depth and min_child_weight
xgb_2_b = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500,
    booster=best_booster,
    max_depth = best_max_depth
)

grid_2_b = {
    'min_child_weight':range(1,11)
}
#search_2_b = trainModel(xgb_2_b, grid_2_b, runs[FEATURES], runs[TARGET], verbose=5)
#best_min_child_weight = search_2_b.best_params_["min_child_weight"]
best_min_child_weight = 1

In [8]:
# gamma
xgb_3 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500,
    booster=best_booster,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
)

grid_3 = {
    'gamma':[i/1000. for i in range(0,10)]
}
search_3 = trainModel(xgb_3, grid_3, runs[FEATURES], runs[TARGET])
best_gamma = search_3.best_params_["gamma"]

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  6.6min
[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:  7.3min finished


Best Params:  {'gamma': 0.0}
Score: 0.9199214941081479


In [9]:
# subsample and colsample_bytree
xgb_4 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500,
    booster=best_booster,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
)

grid_4 = {
    'subsample':[i/10.0 for i in range(0,4)],
    'colsample_bytree':[i/10.0 for i in range(0,4)],
}
search_4 = trainModel(xgb_4, grid_4, runs[FEATURES], runs[TARGET])
best_subsample = search_4.best_params_["subsample"]
best_colsample_bytree = search_4.best_params_["colsample_bytree"]

Fitting 4 folds for each of 16 candidates, totalling 64 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  6.9min
[Parallel(n_jobs=4)]: Done  64 out of  64 | elapsed: 12.2min finished


Best Params:  {'colsample_bytree': 0.0, 'subsample': 0.0}
Score: 0.9199340378626114


In [10]:
# Regularization params
xgb_5 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500,
    booster=best_booster,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
)

grid_5 = {
    'reg_alpha':[1e-6,1e-5,1e-4,1e-3,1e-2]
}
search_5 = trainModel(xgb_5, grid_5, runs[FEATURES], runs[TARGET])
best_reg_alpha = search_5.best_params_["reg_alpha"]


Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:  4.0min finished


Best Params:  {'reg_alpha': 0.001}
Score: 0.9199466348518234


In [13]:
# AFTER THESE ARE FIGURED OUT DECREASE LEARNING RATE AND INCREASE n_est
# gblinear 1 1 0.0 0.0 0.0 0.001
print(best_booster, best_max_depth, best_min_child_weight, best_gamma, best_subsample, best_colsample_bytree, 
      best_reg_alpha)     
# Final model performance
bestModel = XGBClassifier(
    learning_rate =0.1,
    n_estimators=500,
    booster=best_booster,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha=best_reg_alpha
)

meanScore, stdScore = crossVal(runs, FEATURES, TARGET, bestModel)

gblinear 1 1 0.0 0.0 0.0 0.001
CV 1/4
CV 2/4
CV 3/4
CV 4/4
Mean score: 0.298 +/- 0.015


In [31]:
#find best n_est for lower learning rate
n_est = [1200,1400,1600,1800]
lr = 0.1

for i in range(len(n_est)):
    
    xgb_6 = XGBClassifier(
        learning_rate =lr,
        n_estimator = n_est[i],
        booster=best_booster,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_alpha=best_reg_alpha
    )
    
    print("Learning Rate: {}, n_est: {}".format(lr, n_est[i]))
    meanScore, stdScore = crossVal(runs, FEATURES, TARGET, xgb_6)


Learning Rate: 0.1, n_est: 1200
CV 1/4
CV 2/4
CV 3/4
CV 4/4
Mean score: 0.298 +/- 0.011
Learning Rate: 0.1, n_est: 1400
CV 1/4
CV 2/4
CV 3/4
CV 4/4
Mean score: 0.298 +/- 0.008
Learning Rate: 0.1, n_est: 1600
CV 1/4
CV 2/4
CV 3/4
CV 4/4
Mean score: 0.298 +/- 0.005
Learning Rate: 0.1, n_est: 1800
CV 1/4
CV 2/4
CV 3/4
CV 4/4
Mean score: 0.298 +/- 0.010


In [None]:
bestModel = XGBClassifier(
    learning_rate =0.1,
    n_estimator = 750,
    booster='gblinear',
    max_depth=1,
    min_child_weight=1,
    gamma=0.0,
    subsample=0,
    colsample_bytree=0,
    reg_alpha=0.001
)

In [32]:
xgb_6

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.0, gamma=0.0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=None, max_depth=1,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimator=1800, n_estimators=100, n_jobs=0,
              num_parallel_tree=None, objective='binary:logistic',
              random_state=0, reg_alpha=0.001, reg_lambda=0, scale_pos_weight=1,
              subsample=0.0, tree_method=None, validate_parameters=False,
              verbosity=None)