In [1]:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
import sys
sys.path.append('../')
from helper import *

from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, GridSearchCV, train_test_split

import matplotlib.pyplot as plt

# Load the prepared runs table from the pickle stored next to the notebook folder.
# NOTE(review): `pd` is not imported explicitly in this cell - presumably it is
# re-exported by `from helper import *`; confirm.
runs = pd.read_pickle("../Data/main_1.df")

# Column of `runs` used as the classification label.
TARGET = "won"

# Earlier, reduced feature set kept for reference:
#FEATURES = ['win_odds', 'horse_race_count', 'place_odds', 'best_odds', 'best_going_record', 'best_horse_record']

# Columns of `runs` fed to the model. Order is kept exactly as originally listed.
FEATURES = [
    "horse_no",
    "horse_rating",
    "declared_weight",
    "actual_weight",
    "win_odds",
    "draw",
    "race_size",
    "last_race_result",
    "win_percent",
    "avg_distance_time",
    "going_type_record",
    "horse_race_count",
    "horse_record",
    "surface_record",
    "place_odds",
    "weight_change_from_average",
    "venue_change",
    "venue_record",
    "days_since_last_race",
    "new_horse",
    "best_odds",
    "best_win_percent",
    "best_going_record",
    "best_horse_record",
    "best_jockey_record",
    "best_trainer_record",
    "highest_actual_weight",
    "lowest_actual_weight",
    "start_speed",
    "rode_before",
]

In [2]:
### PARAM TUNING STEPS:
# 1. Find the best booster type

# 2. Set the learning rate and find the optimal n_estimators

# 3. Tune max_depth and min_child_weight

# 4. Tune gamma

# 5. Tune subsample and colsample_bytree

# 6. Tune the regularization params

# 7. Lower the learning rate and add more trees

In [3]:
def trainModel(model, grid, x, y, n_splits=4, n_jobs=4, verbose=2, random_state=None):
    """Run an accuracy-scored grid search over ``grid`` and print the best result.

    Parameters
    ----------
    model : estimator passed to ``GridSearchCV`` as ``estimator``.
    grid : dict passed to ``GridSearchCV`` as ``param_grid``.
    x, y : training features and labels handed to ``GridSearchCV.fit``.
    n_splits : number of folds for the shuffled ``KFold`` splitter.
    n_jobs : number of parallel workers used by the search.
    verbose : verbosity level forwarded to ``GridSearchCV``.
    random_state : seed forwarded to ``KFold``. The default ``None`` keeps the
        previous behaviour (a different shuffle on every call); pass an int to
        get identical folds on every call, so that scores from separate
        searches are computed on the same splits and runs are reproducible.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Shuffled K-fold; without a seed the fold assignment changes on every call.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    gSearch = GridSearchCV(
        estimator=model,
        param_grid=grid,
        scoring="accuracy",
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    gSearch.fit(x, y)

    print("Best Params: ", gSearch.best_params_)
    print("Score: {}".format(gSearch.best_score_))

    return gSearch

def testModel(search, x, y):
    """Print and return the accuracy of ``search`` on the given data.

    Parameters
    ----------
    search : object exposing ``predict(x)`` (e.g. the value returned by
        ``trainModel``).
    x : features passed to ``search.predict``.
    y : true labels compared against the predictions.

    Returns
    -------
    The value computed by ``accuracy_score(y, preds)``. It was previously only
    printed; returning it lets callers store or compare the result.
    """
    print("Determining Test Accuracy...")
    preds = search.predict(x)
    score = accuracy_score(y, preds)
    print("{:.2f}% accuracy".format(score*100))
    return score

In [4]:
# Baseline model: only learning rate and number of boosting rounds are set.
# Recorded baseline accuracy: 0.9183606753676548
xgb_bl = XGBClassifier(learning_rate=0.1, n_estimators=500)

# One-candidate grid, so the search amounts to a single cross-validated fit.
# NOTE(review): `silent` is a legacy XGBoost parameter (newer releases use
# `verbosity`) - confirm against the installed version.
grid_bl = {"silent": [1]}

# Search is left disabled; uncomment to re-run the baseline.
#search_bl = trainModel(xgb_bl, grid_bl, runs[FEATURES], runs[TARGET])

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.3min finished


Best Params:  {'silent': 1}
Score: 0.9183606753676548


In [5]:
# BOOSTER TYPE: compare the "gbtree" and "gblinear" boosters.
xgb_1 = XGBClassifier(learning_rate=0.1, n_estimators=500)
grid_1 = {"booster": ["gbtree", "gblinear"]}

# The search itself is disabled; the booster is pinned by hand instead.
# Uncomment both lines below (and drop the hard-coded value) to re-derive it.
#search_1 = trainModel(xgb_1, grid_1, runs[FEATURES], runs[TARGET])
#best_booster=search_1.best_params_["booster"]
best_booster = "gblinear"

Fitting 4 folds for each of 2 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of   8 | elapsed:  3.2min remaining:  1.1min
[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:  3.2min finished


Best Params:  {'booster': 'gblinear'}
Score: 0.9199718155215603


In [6]:
# Tune max_depth (this grid varies max_depth only, values 1..10).
# NOTE(review): max_depth is a tree-booster parameter; if best_booster is
# "gblinear" it presumably has no effect on the model - confirm in the XGBoost docs.
xgb_2_a = XGBClassifier(booster=best_booster, learning_rate=0.1, n_estimators=500)
grid_2_a = {"max_depth": range(1, 11)}

search_2_a = trainModel(
    xgb_2_a,
    grid_2_a,
    runs[FEATURES],
    runs[TARGET],
    verbose=5,
)
best_max_depth = search_2_a.best_params_["max_depth"]

Fitting 4 folds for each of 16 candidates, totalling 64 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done  64 out of  64 | elapsed: 14.3min finished


Best Params:  {'max_depth': 1, 'min_child_weight': 1}
Score: 0.9199718009453792


In [None]:
# Tune min_child_weight (values 1..10) with max_depth fixed to the value
# selected by the preceding max_depth search.
# NOTE(review): min_child_weight is a tree-booster parameter; if best_booster is
# "gblinear" it presumably has no effect on the model - confirm in the XGBoost docs.
xgb_2_b = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    booster=best_booster,
    # Was a syntax error: missing comma above and no value assigned here.
    max_depth=best_max_depth,
)

grid_2_b = {
    'min_child_weight': range(1, 11)
}
search_2_b = trainModel(xgb_2_b, grid_2_b, runs[FEATURES], runs[TARGET], verbose=5)
best_min_child_weight = search_2_b.best_params_["min_child_weight"]

In [13]:
# Tune gamma over 0.00 .. 0.09 in steps of 0.01, with the previously selected
# booster, max_depth and min_child_weight held fixed.
xgb_3 = XGBClassifier(
    booster=best_booster,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    learning_rate=0.1,
    n_estimators=500,
)
grid_3 = {"gamma": [i / 100.0 for i in range(10)]}

search_3 = trainModel(xgb_3, grid_3, runs[FEATURES], runs[TARGET])
best_gamma = search_3.best_params_["gamma"]

In [15]:
# Tune subsample and colsample_bytree jointly (each over 0.2 .. 0.7, i.e. 36
# candidates), with the previously selected parameters held fixed.
xgb_4 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=500,
    booster=best_booster,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
)

grid_4 = {
    'subsample': [i/10.0 for i in range(2, 8)],
    'colsample_bytree': [i/10.0 for i in range(2, 8)],
}
# The search must run: the two assignments below read `search_4`, which is
# otherwise undefined on a fresh kernel (NameError). It was commented out before.
search_4 = trainModel(xgb_4, grid_4, runs[FEATURES], runs[TARGET])
best_subsample = search_4.best_params_["subsample"]
best_colsample_bytree = search_4.best_params_["colsample_bytree"]


In [16]:
# Regularization: tune reg_alpha over 1e-10 .. 1e-4 (one value per decade),
# with every previously selected parameter held fixed.
xgb_5 = XGBClassifier(
    booster=best_booster,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    learning_rate=0.1,
    n_estimators=500,
)
grid_5 = {"reg_alpha": [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}

search_5 = trainModel(xgb_5, grid_5, runs[FEATURES], runs[TARGET])
best_reg_alpha = search_5.best_params_["reg_alpha"]


Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:  3.4min finished


Best Params:  {'reg_alpha': 1e-05}
Score: 0.9199718079165962


In [17]:
# Summary of every tuned value, printed space-separated on one line.
# Follow-up once these are settled: decrease the learning rate and increase
# n_estimators.
print(
    best_booster,
    best_max_depth,
    best_min_child_weight,
    best_gamma,
    best_subsample,
    best_colsample_bytree,
    best_reg_alpha,
)




gblinear 1 1 0.0 0.6 0.6 1e-05
