# Import packages

In [81]:
import pandas as pd
import numpy as np

# Boosting models
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# cross validation
from sklearn.model_selection import KFold

# metrics
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

# Sk optimize
from bayes_opt import BayesianOptimization
from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

# train and store modeling metrics functions
from modeling_functions.train_model_funcs import train_funcs

## Notebook setup

In [82]:
# Random state
seed = 18

# modeling metrics storage
model_acc = dict()

# Data

In [83]:
train_df = pd.read_csv('data/train.csv', index_col=False)
test_df = pd.read_csv('data/test.csv')

In [84]:
train_df.head()

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
0,204283,168,422,2102,3,1,Z,P,1.362724,1.472933,...,0,0,1,1,0,0,1.472933,1.023097,0.449836,1
1,204881,168,962,12662,1,0,Z,Z,-1000.0,0.067802,...,0,0,1,0,0,1,0.15574,0.169566,-0.013826,1
2,204373,168,61,4551,0,2,Z,Z,-1000.0,-1000.0,...,0,0,1,0,0,1,0.12187,0.525932,-0.404062,0
3,24883,2,208,1218,4,2,Z,P,-1000.0,0.376888,...,0,0,1,1,0,0,0.376888,-0.391791,0.76868,1
4,205013,168,1100,10298,2,1,T,Z,0.97538,1.661578,...,0,1,0,0,0,1,0.49868,-0.364031,0.862712,1


In [85]:
test_df.head()

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
0,143772,88,4814,184,0,1,Z,T,0.048085,0.135338,...,0,0,1,0,1,0,0.303145,-2000.0,2000.303145,0
1,67383,88,3166,59,0,2,P,Z,-1000.0,-0.621009,...,1,0,0,0,0,1,-0.254441,-2000.0,1999.745559,0
2,142146,88,9531,9518,2,0,T,T,-1000.0,-0.477019,...,0,1,0,0,1,0,-0.058596,-3000.0,2999.941404,1
3,67190,88,4566,590,0,1,P,P,-1000.0,-3000.0,...,1,0,0,1,0,0,-3000.0,0.357552,-3000.357552,0
4,86874,88,1148,6104,2,0,Z,T,-1000.0,0.059067,...,0,0,1,0,1,0,-2000.0,-3000.0,1000.0,1


# Modeling

## Cross validation

In [86]:
n_folds = 5
kfold = KFold(n_splits=n_folds)

## Process data

In [105]:
feature_cols = list(train_df.columns[8:-1])
feature_cols
x_train, x_test = train_df[feature_cols].to_numpy(),\
                            test_df[feature_cols].to_numpy()

y_train, y_test = train_df['winner'].to_numpy(),\
                            test_df['winner'].to_numpy()

In [106]:
x_train.shape, y_train.shape

((285422, 25), (285422,))

In [107]:
x_test.shape, y_test.shape

((71355, 25), (71355,))

In [108]:
for train, val in kfold.split(x_train, y_train):
    print(train)
    print(val)

[ 57085  57086  57087 ... 285419 285420 285421]
[    0     1     2 ... 57082 57083 57084]
[     0      1      2 ... 285419 285420 285421]
[ 57085  57086  57087 ... 114167 114168 114169]
[     0      1      2 ... 285419 285420 285421]
[114170 114171 114172 ... 171251 171252 171253]
[     0      1      2 ... 285419 285420 285421]
[171254 171255 171256 ... 228335 228336 228337]
[     0      1      2 ... 228335 228336 228337]
[228338 228339 228340 ... 285419 285420 285421]


## XGBoost

In [110]:
from modeling_functions.train_model_funcs import train_funcs

In [91]:
xgb = XGBClassifier()
xgb_train, xgb_val = train_funcs.train_and_measure(
    xgb,
    kfold,
    'xgb - no tuning',
    x_train,
    y_train
)

KeyError: "None of [Int64Index([ 57085,  57086,  57087,  57088,  57089,  57090,  57091,  57092,\n             57093,  57094,\n            ...\n            285412, 285413, 285414, 285415, 285416, 285417, 285418, 285419,\n            285420, 285421],\n           dtype='int64', length=228337)] are in the [columns]"

In [22]:
xgb_train

{'xgb - no tuning': {'acc': 0.915626686376449,
  'roc': 0.9137079862495993,
  'f1': 0.9296773004193701,
  'mcc': 0.8243730442625037}}

In [23]:
xgb_val

{'xgb - no tuning': {'acc': 0.9066820477452291,
  'roc': 0.904186039292434,
  'f1': 0.9222011110553117,
  'mcc': 0.8055082372371942}}

In [24]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
xgb_test_preds = xgb.predict(x_test)

xgb_test_metrics = {
    'xgb - no tuning': {
    'acc': accuracy_score(xgb_test_preds, y_test),
    'f1': f1_score(xgb_test_preds, y_test),
    'roc': roc_auc_score(xgb_test_preds, y_test),
    'mcc': matthews_corrcoef(xgb_test_preds, y_test)
    }
}



  "because it will generate extra copies and increase " +


In [26]:
xgb_test_metrics

{'xgb - no tuning': {'acc': 0.9050942470744867,
  'f1': 0.9228473124159774,
  'roc': 0.9003859713922834,
  'mcc': 0.7995948749162255}}

## CatBoost

In [92]:
cb_clf = CatBoostClassifier(task_type='GPU',
                                random_seed=seed,
                                verbose=False)

In [93]:
cb_train, cb_val = train_funcs.train_and_measure(
    cb_clf,
    kfold,
    'catboost - no tuning',
    x_train,
    y_train,
)

KeyError: "None of [Int64Index([ 57085,  57086,  57087,  57088,  57089,  57090,  57091,  57092,\n             57093,  57094,\n            ...\n            285412, 285413, 285414, 285415, 285416, 285417, 285418, 285419,\n            285420, 285421],\n           dtype='int64', length=228337)] are in the [columns]"

In [29]:
cb_clf.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7efbb0fa0c90>

In [30]:
cb_test_preds = cb_clf.predict(x_test)

cb_test_metrics = {
    'catboost - no tuning': {
    'acc': accuracy_score(cb_test_preds, y_test),
    'f1': f1_score(cb_test_preds, y_test),
    'roc': roc_auc_score(cb_test_preds, y_test),
    'mcc': matthews_corrcoef(cb_test_preds, y_test)
    }
}

In [31]:
cb_train

{'catboost - no tuning': {'acc': 0.910626196183822,
  'roc': 0.9085965666185662,
  'f1': 0.9255545493652013,
  'mcc': 0.8139214860131094}}

In [32]:
cb_val

{'catboost - no tuning': {'acc': 0.9076455324704863,
  'roc': 0.9053042612280076,
  'f1': 0.9230496717985182,
  'mcc': 0.8075017511616537}}

In [33]:
cb_test_metrics

{'catboost - no tuning': {'acc': 0.9062434307336557,
  'f1': 0.9238907849829352,
  'roc': 0.9018822809166878,
  'mcc': 0.8018787237113214}}

## Catboost - optimized

In [34]:
def cb_opt(n_estimators, depth, learning_rate, max_bin,
            subsample, num_leaves, l2_leaf_reg, model_size_reg):
    scores = list()
    kfold = KFold(n_splits=5, shuffle=False)

    for train_index, test_index in kfold.split(x_train, y_train):
        trainx, valx = x_train[train_index], x_train[test_index]
        trainy, valy = y_train[train_index], y_train[test_index]
    
        reg = CatBoostClassifier(verbose = 0,
                                n_estimators = int(n_estimators),
                                learning_rate = learning_rate,
                                subsample = subsample,
                                l2_leaf_reg = l2_leaf_reg,
                                max_depth = int(depth),
                                num_leaves = int(num_leaves),
                                random_state = seed,
                                grow_policy = 'Lossguide',
                                max_bin = int(max_bin),
                                use_best_model = True,
                                model_size_reg = model_size_reg
                                )

        reg.fit(trainx, trainy, eval_set=(valx, valy))
        scores.append(matthews_corrcoef(valy, reg.predict(valx)))
    
    return np.mean(scores)

In [35]:
pbounds = {"n_estimators": (150,400),
           "depth": (2,7),
           "learning_rate": (.01, 0.1),
           "subsample":(0.2, 1.),
           "num_leaves": (14,50),
           "max_bin":(64,512),
           "l2_leaf_reg":(0,10),
           "model_size_reg": (0,5)
}

optimizer = BayesianOptimization(
    f = cb_opt,
    pbounds = pbounds,
    verbose = 2,
    random_state = seed
)

In [36]:
optimizer.maximize(init_points=2, n_iter=20)

|   iter    |  target   |   depth   | l2_lea... | learni... |  max_bin  | model_... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8072  [0m | [0m 5.252   [0m | [0m 5.055   [0m | [0m 0.08907 [0m | [0m 145.5   [0m | [0m 4.261   [0m | [0m 337.5   [0m | [0m 37.98   [0m | [0m 0.9903  [0m |
| [0m 2       [0m | [0m 0.8068  [0m | [0m 3.285   [0m | [0m 0.2831  [0m | [0m 0.06721 [0m | [0m 443.6   [0m | [0m 3.681   [0m | [0m 155.2   [0m | [0m 18.02   [0m | [0m 0.4382  [0m |
| [95m 3       [0m | [95m 0.8073  [0m | [95m 5.201   [0m | [95m 3.103   [0m | [95m 0.06806 [0m | [95m 146.2   [0m | [95m 0.1148  [0m | [95m 337.0   [0m | [95m 37.41   [0m | [95m 0.3856  [0m |
| [95m 4       [0m | [95m 0.8073  [0m | [95m 4.502   [0m | [95m 2.302   [0m | [95m 0.0402  [0m | [95m 153.9   [0m | [95m 2.167 

In [37]:
best_params = optimizer.max['params']
best_params

{'depth': 5.272558578698696,
 'l2_leaf_reg': 3.6618572816947603,
 'learning_rate': 0.06177940812291211,
 'max_bin': 208.80082955565277,
 'model_size_reg': 0.36214586380619807,
 'n_estimators': 266.2122657293653,
 'num_leaves': 49.796927455029135,
 'subsample': 0.7440526404351189}

In [38]:
print(optimizer.max)

{'target': 0.8076924121107613, 'params': {'depth': 5.272558578698696, 'l2_leaf_reg': 3.6618572816947603, 'learning_rate': 0.06177940812291211, 'max_bin': 208.80082955565277, 'model_size_reg': 0.36214586380619807, 'n_estimators': 266.2122657293653, 'num_leaves': 49.796927455029135, 'subsample': 0.7440526404351189}}


In [39]:
optimized_params = optimizer.max['params']

optimized_params['n_estimators'] = int(optimized_params['n_estimators'])
optimized_params['depth'] = int(optimized_params['depth'])
optimized_params['num_leaves'] = int(optimized_params['num_leaves'])
optimized_params['max_bin'] = int(optimized_params['max_bin'])

In [40]:
optimized_params

{'depth': 5,
 'l2_leaf_reg': 3.6618572816947603,
 'learning_rate': 0.06177940812291211,
 'max_bin': 208,
 'model_size_reg': 0.36214586380619807,
 'n_estimators': 266,
 'num_leaves': 49,
 'subsample': 0.7440526404351189}

In [41]:
cb_tuned = CatBoostClassifier(
    verbose=False,
    task_type='GPU',
    random_seed=seed,
    grow_policy='Lossguide',
    depth=5,
    l2_leaf_reg=3.6618572816947603,
    max_bin=208,
    n_estimators=266,
    num_leaves=49,
    model_size_reg=0.36214586380619807,
    learning_rate=0.06177940812291211,
    subsample=0.7440526404351189,
    bootstrap_type='Poisson'
)

cb_tuned.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7efbb0f03dd0>

In [42]:
train_preds, test_preds = cb_tuned.predict(x_train), cb_tuned.predict(x_test)

In [43]:
cb_tuned_train_metrics = {
    'cb_optimized': {
    'acc': accuracy_score(train_preds, y_train),
    'f1': f1_score(train_preds, y_train),
    'roc': roc_auc_score(train_preds, y_train),
    'mcc': matthews_corrcoef(train_preds, y_train)
    }
}

cb_tuned_test_metrics = {
    'cb_optimized': {
    'acc': accuracy_score(test_preds, y_test),
    'f1': f1_score(test_preds, y_test),
    'roc': roc_auc_score(test_preds, y_test),
    'mcc': matthews_corrcoef(test_preds, y_test)
    }
}

In [44]:
cb_tuned_train_metrics

{'cb_optimized': {'acc': 0.9107952435341354,
  'f1': 0.9256819276405085,
  'roc': 0.9087377711512847,
  'mcc': 0.8143028970382128}}

In [45]:
cb_tuned_test_metrics

{'cb_optimized': {'acc': 0.9061593441244482,
  'f1': 0.9238138582318808,
  'roc': 0.9017704859966043,
  'mcc': 0.8017119549305104}}

## CatBoost - save best model

Bayesian optimization didn't produce anything better, so regular cross validation while saving the best model seems like the best approach.

In [46]:
cb_base = CatBoostClassifier(task_type='GPU',
                                random_seed=seed,
                                verbose=False)
cb_base.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7efbb118d590>

In [47]:
train_preds, test_preds = cb_base.predict(x_train), cb_base.predict(x_test)

In [48]:
cb_base_train_metrics = {
    'cb_base': {
    'acc': accuracy_score(train_preds, y_train),
    'f1': f1_score(train_preds, y_train),
    'roc': roc_auc_score(train_preds, y_train),
    'mcc': matthews_corrcoef(train_preds, y_train)
    }
}

cb_base_test_metrics = {
    'cb_base': {
    'acc': accuracy_score(test_preds, y_test),
    'f1': f1_score(test_preds, y_test),
    'roc': roc_auc_score(test_preds, y_test),
    'mcc': matthews_corrcoef(test_preds, y_test)
    }
}

In [49]:
cb_base_train_metrics

{'cb_base': {'acc': 0.910129562542481,
  'f1': 0.9251487482965803,
  'roc': 0.9080959943400018,
  'mcc': 0.8129017001889713}}

In [50]:
cb_base_test_metrics

{'cb_base': {'acc': 0.9062434307336557,
  'f1': 0.9238907849829352,
  'roc': 0.9018822809166878,
  'mcc': 0.8018787237113214}}

In [51]:
cb_base.save_model(
    'models/match_predictor.cbm',
    format='cbm'
)

# Summary of results

In [52]:
# train metrics summary
train_metrics_df = pd.DataFrame()

In [53]:
# test metric summary
test_metrics_df = pd.DataFrame()

In [54]:
xgb_test_metrics

{'xgb - no tuning': {'acc': 0.9050942470744867,
  'f1': 0.9228473124159774,
  'roc': 0.9003859713922834,
  'mcc': 0.7995948749162255}}

In [55]:
model_list = ['xgb - no tune', 'cb - no tune', 'cb - optimized', 'cb - base']
train_metrics = [xgb_train, cb_train, cb_tuned_train_metrics, cb_base_train_metrics]
test_metrics = [xgb_test_metrics, cb_test_metrics, cb_tuned_test_metrics, cb_base_test_metrics]

In [56]:
for metrics in train_metrics:
    metrics_df=pd.DataFrame.from_dict(metrics, orient='index')
    train_metrics_df = pd.concat([train_metrics_df, metrics_df])

In [57]:
train_metrics_df

Unnamed: 0,acc,roc,f1,mcc
xgb - no tuning,0.915627,0.913708,0.929677,0.824373
catboost - no tuning,0.910626,0.908597,0.925555,0.813921
cb_optimized,0.910795,0.908738,0.925682,0.814303
cb_base,0.91013,0.908096,0.925149,0.812902


In [58]:
for metrics in test_metrics:
    metrics_df=pd.DataFrame.from_dict(metrics, orient='index')
    test_metrics_df = pd.concat([test_metrics_df, metrics_df])

In [59]:
test_metrics_df

Unnamed: 0,acc,f1,roc,mcc
xgb - no tuning,0.905094,0.922847,0.900386,0.799595
catboost - no tuning,0.906243,0.923891,0.901882,0.801879
cb_optimized,0.906159,0.923814,0.90177,0.801712
cb_base,0.906243,0.923891,0.901882,0.801879


In [60]:
# add suffixes to column names
train_metrics_df = train_metrics_df.add_suffix('_train').reset_index()
test_metrics_df = test_metrics_df.add_suffix('_test').reset_index()

In [61]:
train_metrics_df

Unnamed: 0,index,acc_train,roc_train,f1_train,mcc_train
0,xgb - no tuning,0.915627,0.913708,0.929677,0.824373
1,catboost - no tuning,0.910626,0.908597,0.925555,0.813921
2,cb_optimized,0.910795,0.908738,0.925682,0.814303
3,cb_base,0.91013,0.908096,0.925149,0.812902


In [62]:
summary_df = train_metrics_df.merge(test_metrics_df, on='index')
summary_df.rename(columns={'index':'model_name'}, inplace=True)

In [63]:
summary_df

Unnamed: 0,model_name,acc_train,roc_train,f1_train,mcc_train,acc_test,f1_test,roc_test,mcc_test
0,xgb - no tuning,0.915627,0.913708,0.929677,0.824373,0.905094,0.922847,0.900386,0.799595
1,catboost - no tuning,0.910626,0.908597,0.925555,0.813921,0.906243,0.923891,0.901882,0.801879
2,cb_optimized,0.910795,0.908738,0.925682,0.814303,0.906159,0.923814,0.90177,0.801712
3,cb_base,0.91013,0.908096,0.925149,0.812902,0.906243,0.923891,0.901882,0.801879


In [64]:
summary_df.to_csv('model_summaries/boosting_models.csv')