# Modeling Experiments: CatBoost optimization

# Setup

In [25]:
import pandas as pd
import numpy as np

In [26]:
# scikit-learn preprocessing
from sklearn.model_selection import train_test_split

# cross validation
from sklearn.model_selection import KFold

# metrics
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score

# Sk optimize
from bayes_opt import BayesianOptimization

# boosting libraries
from catboost import CatBoostClassifier, Pool, cv

In [27]:
# Random seed
seed = 18

# Data

In [28]:
all_mu_file = 'data/final_data.csv'
df = pd.read_csv(all_mu_file)
df.head()

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_T,pla_race_Z,plb_race_P,plb_race_R,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
0,204283,168,422,2102,3,1,Z,P,1.362724,1.472933,...,0,1,1,0,0,0,1.472933,1.023097,0.449836,1
1,204881,168,962,12662,1,0,Z,Z,-1000.0,0.067802,...,0,1,0,0,0,1,0.15574,0.169566,-0.013826,1
2,204373,168,61,4551,0,2,Z,Z,-1000.0,-1000.0,...,0,1,0,0,0,1,0.12187,0.525932,-0.404062,0
3,24883,2,208,1218,4,2,Z,P,-1000.0,0.376888,...,0,1,1,0,0,0,0.376888,-0.391791,0.76868,1
4,205013,168,1100,10298,2,1,T,Z,0.97538,1.661578,...,1,0,0,0,0,1,0.49868,-2000.0,2000.49868,1


In [29]:
df.columns

Index(['match_id', 'period_id', 'pla_id', 'plb_id', 'score_a', 'score_b',
       'race_a', 'race_b', 'comp_rat_a', 'comp_rat_vp_a', 'comp_rat_vt_a',
       'comp_rat_vz_a', 'position_a', 'position_vp_a', 'position_vt_a',
       'position_vz_a', 'comp_rat_b', 'comp_rat_vp_b', 'comp_rat_vt_b',
       'comp_rat_vz_b', 'position_b', 'position_vp_b', 'position_vt_b',
       'position_vz_b', 'pla_race_P', 'pla_race_R', 'pla_race_T', 'pla_race_Z',
       'plb_race_P', 'plb_race_R', 'plb_race_T', 'plb_race_Z',
       'pla_eff_rating', 'plb_eff_rating', 'ratings_diff', 'winner'],
      dtype='object')

In [30]:
# train/test split
# All features - races, all comp ratings, all positions
train_cols = list(df.columns[8:-1])
labels = df['winner'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    df[train_cols].to_numpy(),
    labels,
    test_size=0.2,
    train_size=0.8,
    shuffle=False
)

# create validation set

# Model optimization

In [31]:
kfold = KFold(n_splits=5, shuffle=False)
for train_index, test_index in kfold.split(X_train, y_train):
    print(train_index, test_index)

[ 58043  58044  58045 ... 290209 290210 290211] [    0     1     2 ... 58040 58041 58042]
[     0      1      2 ... 290209 290210 290211] [ 58043  58044  58045 ... 116083 116084 116085]
[     0      1      2 ... 290209 290210 290211] [116086 116087 116088 ... 174125 174126 174127]
[     0      1      2 ... 290209 290210 290211] [174128 174129 174130 ... 232167 232168 232169]
[     0      1      2 ... 232167 232168 232169] [232170 232171 232172 ... 290209 290210 290211]


In [32]:
def cb_opt(n_estimators, depth, learning_rate, max_bin,
            subsample, num_leaves, l2_leaf_reg, model_size_reg):
    scores = list()
    kfold = KFold(n_splits=5, shuffle=False)

    for train_index, test_index in kfold.split(X_train, y_train):
        trainx, valx = X_train[train_index], X_train[test_index]
        trainy, valy = y_train[train_index], y_train[test_index]
    
        reg = CatBoostClassifier(verbose = 0,
                                n_estimators = int(n_estimators),
                                learning_rate = learning_rate,
                                subsample = subsample,
                                l2_leaf_reg = l2_leaf_reg,
                                max_depth = int(depth),
                                num_leaves = int(num_leaves),
                                random_state = seed,
                                grow_policy = 'Lossguide',
                                max_bin = int(max_bin),
                                use_best_model = True,
                                model_size_reg = model_size_reg
                                )

        reg.fit(trainx, trainy, eval_set=(valx, valy))
        scores.append(matthews_corrcoef(valy, reg.predict(valx)))
    
    return np.mean(scores)

In [33]:
pbounds = {"n_estimators": (150,400),
           "depth": (2,7),
           "learning_rate": (.01, 0.3),
           "subsample":(0.6, 1.),
           "num_leaves": (16,40),
           "max_bin":(150,300),
           "l2_leaf_reg":(0,10),
           "model_size_reg": (0,10)
}

optimizer = BayesianOptimization(
    f = cb_opt,
    pbounds = pbounds,
    verbose = 2,
    random_state = seed
)

In [34]:
optimizer.maximize(init_points=2, n_iter=20)

|   iter    |  target   |   depth   | l2_lea... | learni... |  max_bin  | model_... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8036  [0m | [0m 5.252   [0m | [0m 5.055   [0m | [0m 0.2648  [0m | [0m 177.3   [0m | [0m 8.522   [0m | [0m 337.5   [0m | [0m 31.99   [0m | [0m 0.9952  [0m |
| [0m 2       [0m | [0m 0.8016  [0m | [0m 3.285   [0m | [0m 0.2831  [0m | [0m 0.1944  [0m | [0m 277.1   [0m | [0m 7.362   [0m | [0m 155.2   [0m | [0m 18.68   [0m | [0m 0.7191  [0m |
| [95m 3       [0m | [95m 0.8051  [0m | [95m 6.199   [0m | [95m 5.789   [0m | [95m 0.1311  [0m | [95m 175.7   [0m | [95m 9.279   [0m | [95m 341.7   [0m | [95m 32.0    [0m | [95m 0.7917  [0m |
| [0m 4       [0m | [0m 0.8026  [0m | [0m 6.475   [0m | [0m 8.372   [0m | [0m 0.2961  [0m | [0m 177.5   [0m | [0m 9.995   [0m 

KeyboardInterrupt: 

In [11]:
print(optimizer.max)

{'target': 0.8064935037537369, 'params': {'depth': 5.657733006971598, 'l2_leaf_reg': 4.925480593319444, 'learning_rate': 0.06664398685001582, 'max_bin': 169.84668755669836, 'model_size_reg': 8.028688263356198, 'n_estimators': 361.7347928337158, 'num_leaves': 39.37388866843141, 'subsample': 0.7902903426213559}}


In [12]:
for i, res in enumerate(optimizer.res):
    print(f"Iteration {i}: \n\t{res}")

Iteration 0: 
	{'target': 0.8035439571967962, 'params': {'depth': 5.251871208697958, 'l2_leaf_reg': 5.054533737348429, 'learning_rate': 0.26479442656725005, 'max_bin': 177.27603380573015, 'model_size_reg': 8.52233068674597, 'n_estimators': 337.53407152650266, 'num_leaves': 31.986440019647745, 'subsample': 0.9951581793103849}}
Iteration 1: 
	{'target': 0.8020683949805996, 'params': {'depth': 3.2848421130191507, 'l2_leaf_reg': 0.2830592527249809, 'learning_rate': 0.19435854348369047, 'max_bin': 277.0968581229939, 'model_size_reg': 7.361746251160107, 'n_estimators': 155.2017779808726, 'num_leaves': 18.678475135960788, 'subsample': 0.7190894967423439}}
Iteration 2: 
	{'target': 0.8052424013607087, 'params': {'depth': 6.199175205006918, 'l2_leaf_reg': 5.789489952478734, 'learning_rate': 0.13114323992843208, 'max_bin': 175.72614904357312, 'model_size_reg': 9.278530562987402, 'n_estimators': 341.68146052066163, 'num_leaves': 31.997587460301617, 'subsample': 0.791718290983581}}
Iteration 3: 
	

In [13]:
# train on best params
best_params = optimizer.max['params']
# save model

In [14]:
best_params

{'depth': 5.657733006971598,
 'l2_leaf_reg': 4.925480593319444,
 'learning_rate': 0.06664398685001582,
 'max_bin': 169.84668755669836,
 'model_size_reg': 8.028688263356198,
 'n_estimators': 361.7347928337158,
 'num_leaves': 39.37388866843141,
 'subsample': 0.7902903426213559}

In [15]:
tuned_model = CatBoostClassifier(task_type='GPU',
                                    depth=6,
                                    l2_leaf_reg=4.925480593319444,
                                    learning_rate = 0.06664398685001582,
                                    max_bin=int(170),
                                    n_estimators=int(362),
                                    num_leaves=int(40),
                                    subsample=0.7902903426213559,
                                    grow_policy='Lossguide',
                                    model_size_reg=8.028688263356198,
                                    random_state=seed,
                                    bootstrap_type='Bernoulli')

tuned_model.fit(X_train, y_train)

0:	learn: 0.6089337	total: 13.2ms	remaining: 4.75s
1:	learn: 0.5654776	total: 24.3ms	remaining: 4.38s
2:	learn: 0.5259378	total: 34.6ms	remaining: 4.14s
3:	learn: 0.4997508	total: 44.7ms	remaining: 4s
4:	learn: 0.4803894	total: 55.7ms	remaining: 3.98s
5:	learn: 0.4554304	total: 65.8ms	remaining: 3.9s
6:	learn: 0.4365190	total: 76.3ms	remaining: 3.87s
7:	learn: 0.4198127	total: 86.9ms	remaining: 3.84s
8:	learn: 0.3989375	total: 97ms	remaining: 3.8s
9:	learn: 0.3863608	total: 107ms	remaining: 3.78s
10:	learn: 0.3749061	total: 118ms	remaining: 3.75s
11:	learn: 0.3631377	total: 128ms	remaining: 3.74s
12:	learn: 0.3531742	total: 139ms	remaining: 3.72s
13:	learn: 0.3375717	total: 149ms	remaining: 3.7s
14:	learn: 0.3234852	total: 160ms	remaining: 3.69s
15:	learn: 0.3131016	total: 170ms	remaining: 3.68s
16:	learn: 0.3033862	total: 180ms	remaining: 3.64s
17:	learn: 0.2978418	total: 190ms	remaining: 3.63s
18:	learn: 0.2941450	total: 201ms	remaining: 3.63s
19:	learn: 0.2874990	total: 211ms	remain

<catboost.core.CatBoostClassifier at 0x7fc2982985d0>

In [16]:
preds = tuned_model.predict(X_test)

accuracy_score(preds, y_test)

0.9062215728974281

In [17]:
# save model
model_fname = 'models/cb_match_predictor.cbm'

tuned_model.save_model(model_fname,
                        format='cbm')