# Import Packages

In [15]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import KFold

# metrics
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

# train and store modeling metrics functions
from modeling_functions.train_model_funcs import train_funcs

In [16]:
# Random state seed
seed = 18

# Import data

In [17]:
train_df = pd.read_csv('data/train.csv', index_col=False)
test_df = pd.read_csv('data/test.csv')

train_df.head()

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
0,204283,168,422,2102,3,1,Z,P,1.362724,1.472933,...,0,0,1,1,0,0,1.472933,1.023097,0.449836,1
1,204881,168,962,12662,1,0,Z,Z,-1000.0,0.067802,...,0,0,1,0,0,1,0.15574,0.169566,-0.013826,1
2,204373,168,61,4551,0,2,Z,Z,-1000.0,-1000.0,...,0,0,1,0,0,1,0.12187,0.525932,-0.404062,0
3,24883,2,208,1218,4,2,Z,P,-1000.0,0.376888,...,0,0,1,1,0,0,0.376888,-0.391791,0.76868,1
4,205013,168,1100,10298,2,1,T,Z,0.97538,1.661578,...,0,1,0,0,0,1,0.49868,-0.364031,0.862712,1


In [18]:
test_df.head()

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
0,143772,88,4814,184,0,1,Z,T,0.048085,0.135338,...,0,0,1,0,1,0,0.303145,-2000.0,2000.303145,0
1,67383,88,3166,59,0,2,P,Z,-1000.0,-0.621009,...,1,0,0,0,0,1,-0.254441,-2000.0,1999.745559,0
2,142146,88,9531,9518,2,0,T,T,-1000.0,-0.477019,...,0,1,0,0,1,0,-0.058596,-3000.0,2999.941404,1
3,67190,88,4566,590,0,1,P,P,-1000.0,-3000.0,...,1,0,0,1,0,0,-3000.0,0.357552,-3000.357552,0
4,86874,88,1148,6104,2,0,Z,T,-1000.0,0.059067,...,0,0,1,0,1,0,-2000.0,-3000.0,1000.0,1


## Preprocess data

In [19]:
train_df.columns[-4:-1]

Index(['pla_eff_rating', 'plb_eff_rating', 'ratings_diff'], dtype='object')

In [20]:
feature_cols = list(train_df.columns[-4:-1])

x_train, x_test = train_df[feature_cols].to_numpy(),\
                            test_df[feature_cols].to_numpy()

y_train, y_test = train_df['winner'].to_numpy(),\
                            test_df['winner'].to_numpy()

In [21]:
x_train.shape, y_train.shape

((285422, 3), (285422,))

In [22]:
x_test.shape, y_test.shape

((71355, 3), (71355,))

# Modeling

## Cross validation

In [23]:
n_folds = 5
kfold = KFold(n_splits=n_folds)

In [24]:
cb_clf = CatBoostClassifier(task_type='GPU',
                                random_seed=seed,
                                verbose=False)

In [25]:
cb_train, cb_val = train_funcs.train_and_measure(
    cb_clf,
    kfold,
    'catboost - no tuning, limited features',
    x_train,
    y_train,
)

In [28]:
cb_test_preds = cb_clf.predict(x_test)

cb_test_metrics = {
    'catboost - no tuning': {
    'acc': accuracy_score(cb_test_preds, y_test),
    'f1': f1_score(cb_test_preds, y_test),
    'roc': roc_auc_score(cb_test_preds, y_test),
    'mcc': matthews_corrcoef(cb_test_preds, y_test)
    }
}

In [26]:
cb_train

{'catboost - no tuning, limited features': {'acc': 0.9008214155122444,
  'roc': 0.8982343416988842,
  'f1': 0.9173411909020392,
  'mcc': 0.793524988417819}}

In [27]:
cb_val

{'catboost - no tuning, limited features': {'acc': 0.9001198287577713,
  'roc': 0.8973100298137204,
  'f1': 0.9167090507477547,
  'mcc': 0.7918620083228379}}

In [29]:
cb_test_metrics

{'catboost - no tuning': {'acc': 0.898871837993133,
  'f1': 0.917785120200524,
  'roc': 0.8937976371633682,
  'mcc': 0.786459306435051}}