# Import packages

In [1]:
import pandas as pd
import numpy as np

# sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# cross validation
from sklearn.model_selection import KFold

# metrics
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

# model training function
from modeling_functions.train_model_funcs import train_funcs

# Setup

In [2]:
# Random state
seed = 18

# modeling metrics storage
model_acc = dict()

## Helper Functions

In [3]:
def calc_mean(array):
    return np.mean(array)

In [4]:
def train_model(model, cv, model_name, x_train, y_train):
    train_scores = dict()
    val_scores = dict()

    train_acc_scores = list()
    train_roc_scores = list()
    train_mcc_scores = list()
    train_f1_scores = list()

    acc_scores = list()
    roc_scores = list()
    mcc_scores = list()
    f1_scores = list()

    # cross validation
    for train, test in cv.split(x_train, y_train):
        model.fit(x_train[train], y_train[train])

        # predict on validation set
        train_preds = model.predict(x_train[train])
        preds = model.predict(x_train[test])

        # store metrics
        train_acc = accuracy_score(train_preds, y_train[train])
        train_acc_scores.append(train_acc)
        val_acc = accuracy_score(preds, y_train[test])
        acc_scores.append(val_acc)

        train_f1 = f1_score(train_preds, y_train[train])
        train_f1_scores.append(train_f1)
        val_f1 = f1_score(preds, y_train[test])
        f1_scores.append(val_f1)

        train_roc = roc_auc_score(train_preds, y_train[train])
        train_roc_scores.append(train_roc)
        val_roc = roc_auc_score(preds, y_train[test])
        roc_scores.append(val_roc)

        train_mcc = matthews_corrcoef(train_preds, y_train[train])
        train_mcc_scores.append(train_mcc)
        mcc = matthews_corrcoef(preds, y_train[test])
        mcc_scores.append(mcc)

    # metrics
    train_scores[model_name] = {
        'acc': calc_mean(train_acc_scores),
        'roc': calc_mean(train_roc_scores),
        'f1': calc_mean(train_f1_scores),
        'mcc': calc_mean(train_mcc_scores)
    }

    val_scores[model_name] = {
        'acc': calc_mean(acc_scores),
        'roc': calc_mean(roc_scores),
        'f1': calc_mean(f1_scores),
        'mcc': calc_mean(mcc_scores)
    }
    
    
    return train_scores, val_scores

## Load Data

In [5]:
train_df = pd.read_csv('data/train.csv', index_col=False)
test_df = pd.read_csv('data/test.csv')

In [6]:
train_df.head()

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
0,204283,168,422,2102,3,1,Z,P,1.362724,1.472933,...,0,0,1,1,0,0,1.472933,1.023097,0.449836,1
1,204881,168,962,12662,1,0,Z,Z,-1000.0,0.067802,...,0,0,1,0,0,1,0.15574,0.169566,-0.013826,1
2,204373,168,61,4551,0,2,Z,Z,-1000.0,-1000.0,...,0,0,1,0,0,1,0.12187,0.525932,-0.404062,0
3,24883,2,208,1218,4,2,Z,P,-1000.0,0.376888,...,0,0,1,1,0,0,0.376888,-0.391791,0.76868,1
4,205013,168,1100,10298,2,1,T,Z,0.97538,1.661578,...,0,1,0,0,0,1,0.49868,-0.364031,0.862712,1


In [7]:
test_df.head()

Unnamed: 0,match_id,period_id,pla_id,plb_id,score_a,score_b,race_a,race_b,comp_rat_a,comp_rat_vp_a,...,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff,winner
0,143772,88,4814,184,0,1,Z,T,0.048085,0.135338,...,0,0,1,0,1,0,0.303145,-2000.0,2000.303145,0
1,67383,88,3166,59,0,2,P,Z,-1000.0,-0.621009,...,1,0,0,0,0,1,-0.254441,-2000.0,1999.745559,0
2,142146,88,9531,9518,2,0,T,T,-1000.0,-0.477019,...,0,1,0,0,1,0,-0.058596,-3000.0,2999.941404,1
3,67190,88,4566,590,0,1,P,P,-1000.0,-3000.0,...,1,0,0,1,0,0,-3000.0,0.357552,-3000.357552,0
4,86874,88,1148,6104,2,0,Z,T,-1000.0,0.059067,...,0,0,1,0,1,0,-2000.0,-3000.0,1000.0,1


In [8]:
train_df.columns

Index(['match_id', 'period_id', 'pla_id', 'plb_id', 'score_a', 'score_b',
       'race_a', 'race_b', 'comp_rat_a', 'comp_rat_vp_a', 'comp_rat_vt_a',
       'comp_rat_vz_a', 'position_a', 'position_vp_a', 'position_vt_a',
       'position_vz_a', 'comp_rat_b', 'comp_rat_vp_b', 'comp_rat_vt_b',
       'comp_rat_vz_b', 'position_b', 'position_vp_b', 'position_vt_b',
       'position_vz_b', 'pla_race_P', 'pla_race_T', 'pla_race_Z', 'plb_race_P',
       'plb_race_T', 'plb_race_Z', 'pla_eff_rating', 'plb_eff_rating',
       'ratings_diff', 'winner'],
      dtype='object')

# Cross validation

In [9]:
n_folds = 5
kfold = KFold(n_splits=n_folds)

# Set up modeling set

In [10]:
feature_cols = list(train_df.columns[8:-1])
feature_cols
x_train, x_test = train_df[feature_cols].to_numpy(),\
                            test_df[feature_cols].to_numpy()

y_train, y_test = train_df['winner'].to_numpy(),\
                            test_df['winner'].to_numpy()

In [11]:
train_df[feature_cols]

Unnamed: 0,comp_rat_a,comp_rat_vp_a,comp_rat_vt_a,comp_rat_vz_a,position_a,position_vp_a,position_vt_a,position_vz_a,comp_rat_b,comp_rat_vp_b,...,position_vz_b,pla_race_P,pla_race_T,pla_race_Z,plb_race_P,plb_race_T,plb_race_Z,pla_eff_rating,plb_eff_rating,ratings_diff
0,1.362724,1.472933,1.946583,0.668656,24,16,24,32,0.729621,0.576785,...,69,0,0,1,1,0,0,1.472933,1.023097,0.449836
1,-1000.000000,0.067802,-1000.000000,0.155740,153,157,135,171,-1000.000000,-1000.000000,...,837,0,0,1,0,0,1,0.155740,0.169566,-0.013826
2,-1000.000000,-1000.000000,-0.614312,0.121870,179,175,151,219,-1000.000000,-1000.000000,...,607,0,0,1,0,0,1,0.121870,0.525932,-0.404062
3,-1000.000000,0.376888,-1000.000000,-1000.000000,13,13,12,13,-1000.000000,-1000.000000,...,59,0,0,1,1,0,0,0.376888,-0.391791,0.768680
4,0.975380,1.661578,0.765881,0.498680,142,164,144,119,-1000.000000,-2000.000000,...,1368,0,1,0,0,0,1,0.498680,-0.364031,0.862712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285417,-1000.000000,-3000.000000,-1000.000000,-2000.000000,294,271,415,264,-1000.000000,-2000.000000,...,102,0,1,0,1,0,0,-3000.000000,-2000.000000,-1000.000000
285418,-1000.000000,-3000.000000,-1000.000000,-1000.000000,989,850,760,1187,-1000.000000,0.773754,...,674,0,1,0,1,0,0,-3000.000000,-2000.000000,-1000.000000
285419,-1000.000000,-3000.000000,-0.052159,0.088179,1266,1390,801,1264,0.173264,0.425079,...,1262,0,1,0,1,0,0,-3000.000000,-0.169868,-2999.830132
285420,-1000.000000,-3000.000000,-3000.000000,0.444421,255,437,235,221,-1000.000000,-0.372412,...,270,1,0,0,0,0,1,0.444421,-0.372412,0.816832


# Modeling

## Logistic regression

In [12]:
logit = LogisticRegression()


In [13]:
logit_train, logit_val = train_funcs.train_and_measure(logit, kfold, 'logistic-regression', x_train, y_train)

AttributeError: 'str' object has no attribute 'decode'

In [19]:
logit_train

{'logistic-regression': {'acc': 0.7563086272954787,
  'roc': 0.751052498100214,
  'f1': 0.805576971115792,
  'mcc': 0.48539725959199564}}

In [20]:
logit_val

{'logistic-regression': {'acc': 0.7560633206244936,
  'roc': 0.7504599851647251,
  'f1': 0.8051498982239895,
  'mcc': 0.4845663133201489}}

In [None]:
logit.fit(x_train, y_train)
logit_preds = logit.predict(x_test)

In [None]:
logit_test = {
    'logit - no tuning': {
    'acc': accuracy_score(logit_preds, y_test),
    'f1': f1_score(logit_preds, y_test),
    'roc': roc_auc_score(logit_preds, y_test),
    'mcc': matthews_corrcoef(logit_preds, y_test)
    }
}

## SVM

In [62]:
svm = SVC(random_state=seed)
svm_train, svm_val = train_funcs.train_and_measure(svm, kfold, 'svm - no tuning', x_train, y_train)

In [63]:
svm_train

{'svm - no tuning': {'acc': 0.8308450644626701,
  'roc': 0.8404751725708565,
  'f1': 0.8674008406715894,
  'mcc': 0.6472133101973837}}

In [64]:
svm_val

{'svm - no tuning': {'acc': 0.8295815882622257,
  'roc': 0.838746720275249,
  'f1': 0.8663093862748562,
  'mcc': 0.6437911849970053}}

In [65]:
svm.fit(x_train, y_train)
svm_preds = svm.predict(x_test)

In [66]:
svm_test = {
    'svm - no tuning': {
    'acc': accuracy_score(svm_preds, y_test),
    'f1': f1_score(svm_preds, y_test),
    'roc': roc_auc_score(svm_preds, y_test),
    'mcc': matthews_corrcoef(svm_preds, y_test)
    }
}

In [67]:
svm_test

{'svm - no tuning': {'acc': 0.8355966645645014,
  'f1': 0.8729627584115742,
  'roc': 0.8387858165003576,
  'mcc': 0.6487771979232053}}

# Summary

In [None]:
# train_metrics = [xgb_train, cb_train, cb_tuned_train_metrics, cb_base_train_metrics]
# test_metrics = [xgb_test, cb_test, cb_tuned_test_metrics, cb_base_test_metrics]