# First Pick - Phase 2

# Notebook Imports

In [101]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

import statistics
from statistics import mode
from scipy.stats import loguniform
from time import time

import joblib

import time

# Multilabel Classification

## MultiOutputClassifier

### Data Prep

In [102]:
df_reshaped = pd.read_pickle('./saved_files/data_final.pickle')
# defining features and targets

# select features
fp_2p_features = df_reshaped[['left_pick_order_1', 'left_pick_order_2', 'left_pick_order_3',
                              'right_pick_order_1', 'right_pick_order_2', 
                              'left_attack_damage_1', 'left_attack_damage_2', 'left_attack_damage_3',
                              'left_receive_damage_1', 'left_receive_damage_2', 'left_receive_damage_3',
                              'left_kill_count_1', 'left_kill_count_2', 'left_kill_count_3',
                              'right_attack_damage_1', 'right_attack_damage_2',
                              'right_receive_damage_1', 'right_receive_damage_2',
                              'right_kill_count_1', 'right_kill_count_2', 
                              'left_attribute_cd_1', 
                              'left_hero_code_1', 
                              'left_job_cd_1', 
                              'right_hero_code_1', 'right_hero_code_2',
                              'right_attribute_cd_1', 'right_attribute_cd_2',
                              'right_job_cd_1', 'right_job_cd_2',
                              'left_preban_1', 'left_preban_2', 
                              'right_preban_1', 'right_preban_2', 'first_pick', 'is_win']]

fp_2p_target = df_reshaped[['left_hero_code_2', 'left_hero_code_3']]

X_train_fp_2p, X_test_fp_2p, y_train_fp_2p, y_test_fp_2p = train_test_split(fp_2p_features, fp_2p_target, test_size= 0.2, shuffle = True, random_state= 890)

# Scaling 
# Define numeric features
fp_2p_numeric = ['left_attack_damage_1', 'left_attack_damage_2', 'left_attack_damage_3', 
                  'left_receive_damage_1', 'left_receive_damage_2', 'left_receive_damage_3',
                  'left_kill_count_1', 'left_kill_count_2', 'left_kill_count_3',
                  'right_attack_damage_1', 'right_attack_damage_2', 
                  'right_receive_damage_1', 'right_receive_damage_2',
                  'right_kill_count_1', 'right_kill_count_2']

In [103]:
fp_2p_target

Unnamed: 0,left_hero_code_2,left_hero_code_3
0,Blood Moon Haste,Moon Bunny Dominiel
1,Ambitious Tywin,Ravi
2,Blood Moon Haste,Aria
3,Blood Moon Haste,Moon Bunny Dominiel
4,Ambitious Tywin,Moon Bunny Dominiel
...,...,...
9941,Abyssal Yufine,Crimson Armin
9942,Laia,Crimson Armin
9943,Laia,Dragon Bride Senya
9944,Abyssal Yufine,Crimson Armin


In [104]:
# Scale numeric features
fp_2p_scaler = StandardScaler()
fp_2p_scaler.fit(X_train_fp_2p[fp_2p_numeric])

In [105]:
X_train_fp_2p_trans = pd.DataFrame()
X_test_fp_2p_trans = pd.DataFrame()

X_train_fp_2p_trans[fp_2p_numeric] = fp_2p_scaler.transform(X_train_fp_2p[fp_2p_numeric])
X_test_fp_2p_trans[fp_2p_numeric] = fp_2p_scaler.transform(X_test_fp_2p[fp_2p_numeric])

In [106]:
# Encoding the categorical features
# Initializing an ordinal encoder for categorical variables
fp_2p_enc = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
# categorical labels
fp_2p_categorical = ['left_attribute_cd_1', 
                     'left_hero_code_1',
                     'left_job_cd_1',
                     'right_hero_code_1', 'right_hero_code_2',
                     'right_attribute_cd_1', 'right_attribute_cd_2',
                     'right_job_cd_1', 'right_job_cd_2', 
                     'left_preban_1', 'left_preban_2', 
                     'right_preban_1', 'right_preban_2']

X_train_fp_2p_trans[fp_2p_categorical] = fp_2p_enc.fit_transform(X_train_fp_2p[fp_2p_categorical].to_numpy())
X_test_fp_2p_trans[fp_2p_categorical] = fp_2p_enc.transform(X_test_fp_2p[fp_2p_categorical].to_numpy())

In [107]:
X_train_fp_2p_trans.index
X_test_fp_2p_trans.index

RangeIndex(start=0, stop=1979, step=1)

In [108]:
X_train_fp_2p = X_train_fp_2p.reset_index(drop=True)
X_test_fp_2p = X_test_fp_2p.reset_index(drop=True)

In [109]:
X_train_fp_2p.index
X_test_fp_2p.index

RangeIndex(start=0, stop=1979, step=1)

In [110]:
X_test_fp_2p.isna().sum()
X_test_fp_2p_trans.isna().sum()

left_attack_damage_1      0
left_attack_damage_2      0
left_attack_damage_3      0
left_receive_damage_1     0
left_receive_damage_2     0
left_receive_damage_3     0
left_kill_count_1         0
left_kill_count_2         0
left_kill_count_3         0
right_attack_damage_1     0
right_attack_damage_2     0
right_receive_damage_1    0
right_receive_damage_2    0
right_kill_count_1        0
right_kill_count_2        0
left_attribute_cd_1       0
left_hero_code_1          0
left_job_cd_1             0
right_hero_code_1         0
right_hero_code_2         0
right_attribute_cd_1      0
right_attribute_cd_2      0
right_job_cd_1            0
right_job_cd_2            0
left_preban_1             0
left_preban_2             0
right_preban_1            0
right_preban_2            0
dtype: int64

In [111]:
# add features left out of scaling
l0f_fp_2p = ['left_pick_order_1', 'left_pick_order_2', 'left_pick_order_3',
                    'right_pick_order_1', 'right_pick_order_2', 
                   'first_pick', 'is_win']

X_train_fp_2p_trans[l0f_fp_2p] = X_train_fp_2p[l0f_fp_2p]
X_test_fp_2p_trans[l0f_fp_2p] = X_test_fp_2p[l0f_fp_2p]

In [112]:
X_test_fp_2p_trans[l0f_fp_2p]

Unnamed: 0,left_pick_order_1,left_pick_order_2,left_pick_order_3,right_pick_order_1,right_pick_order_2,first_pick,is_win
0,1,2,3,1,2,0,2
1,1,2,3,1,2,0,2
2,1,2,3,1,2,1,2
3,1,2,3,1,2,0,1
4,1,2,3,1,2,0,2
...,...,...,...,...,...,...,...
1974,1,2,3,1,2,0,2
1975,1,2,3,1,2,0,1
1976,1,2,3,1,2,0,2
1977,1,2,3,1,2,1,1


In [113]:
X_test_fp_2p_trans.isna().sum()

left_attack_damage_1      0
left_attack_damage_2      0
left_attack_damage_3      0
left_receive_damage_1     0
left_receive_damage_2     0
left_receive_damage_3     0
left_kill_count_1         0
left_kill_count_2         0
left_kill_count_3         0
right_attack_damage_1     0
right_attack_damage_2     0
right_receive_damage_1    0
right_receive_damage_2    0
right_kill_count_1        0
right_kill_count_2        0
left_attribute_cd_1       0
left_hero_code_1          0
left_job_cd_1             0
right_hero_code_1         0
right_hero_code_2         0
right_attribute_cd_1      0
right_attribute_cd_2      0
right_job_cd_1            0
right_job_cd_2            0
left_preban_1             0
left_preban_2             0
right_preban_1            0
right_preban_2            0
left_pick_order_1         0
left_pick_order_2         0
left_pick_order_3         0
right_pick_order_1        0
right_pick_order_2        0
first_pick                0
is_win                    0
dtype: int64

In [114]:
# Create a copy of training data after encoding
new_X_train_fp_2p = X_train_fp_2p.copy()

In [115]:
X_train_fp_2p.columns

Index(['left_pick_order_1', 'left_pick_order_2', 'left_pick_order_3',
       'right_pick_order_1', 'right_pick_order_2', 'left_attack_damage_1',
       'left_attack_damage_2', 'left_attack_damage_3', 'left_receive_damage_1',
       'left_receive_damage_2', 'left_receive_damage_3', 'left_kill_count_1',
       'left_kill_count_2', 'left_kill_count_3', 'right_attack_damage_1',
       'right_attack_damage_2', 'right_receive_damage_1',
       'right_receive_damage_2', 'right_kill_count_1', 'right_kill_count_2',
       'left_attribute_cd_1', 'left_hero_code_1', 'left_job_cd_1',
       'right_hero_code_1', 'right_hero_code_2', 'right_attribute_cd_1',
       'right_attribute_cd_2', 'right_job_cd_1', 'right_job_cd_2',
       'left_preban_1', 'left_preban_2', 'right_preban_1', 'right_preban_2',
       'first_pick', 'is_win'],
      dtype='object')

In [116]:
# Change columns names in the copied data to concatinate with the original data
new_X_train_fp_2p.columns = ['left_pick_order_1_orig', 'left_pick_order_2_orig', 'left_pick_order_3_orig', 
                             'right_pick_order_1_orig', 'right_pick_order_2_orig', 
                             'left_attack_damage_1_orig', 'left_attack_damage_2_orig', 'left_attack_damage_3_orig', 
                             'left_receive_damage_1_orig', 'left_receive_damage_2_orig', 'left_receive_damage_3_orig',
                             'left_kill_count_1_orig', 'left_kill_count_2_orig', 'left_kill_count_3_orig',
                             'right_attack_damage_1_orig', 'right_attack_damage_2_orig', 
                             'right_receive_damage_1_orig', 'right_receive_damage_2_orig',
                             'right_kill_count_1_orig', 'right_kill_count_2_orig',
                             'left_attribute_cd_1_orig',
                             'left_hero_code_1_orig',
                             'left_job_cd_1_orig', 
                             'right_hero_code_1_orig', 'right_hero_code_2_orig', 
                             'right_attribute_cd_1_orig', 'right_attribute_cd_2_orig', 
                             'right_job_cd_1_orig', 'right_job_cd_2_orig', 
                             'left_preban_1_orig', 'left_preban_2_orig', 
                             'right_preban_1_orig', 'right_preban_2_orig', 
                             'first_pick_orig', 'is_win_orig']

In [117]:
len(new_X_train_fp_2p.columns)

35

In [118]:
# Cocatenate original data with encoded data for encoding names later in the function
dict_fp_2p = pd.concat((X_train_fp_2p_trans,new_X_train_fp_2p.reset_index(drop=True)), axis= 1).copy()
len(dict_fp_2p.columns)

70

In [119]:
dict_fp_2p[['left_hero_code_1', 'left_hero_code_1_orig']]

Unnamed: 0,left_hero_code_1,left_hero_code_1_orig
0,33.0,Sea Phantom Politis
1,25.0,New Moon Luna
2,19.0,Laia
3,13.0,Death Dealer Ray
4,19.0,Laia
...,...,...
7908,18.0,Jenua
7909,39.0,Zio
7910,15.0,Dragon Bride Senya
7911,13.0,Death Dealer Ray


#### Saving a file of Hero Codes & Names

In [77]:
# Save the concatinated df
# dict_fp_2p.to_pickle('./saved_files/dict_fp_2p.pickle')

### Random Forest Classifier

#### Tunning Hyperparameters

In [20]:
# RandomizedSearchCV parameters
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt', 'log2', 2, 3, None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 14, 18, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'estimator__n_estimators': n_estimators,
               'estimator__max_features': max_features,
               'estimator__max_depth': max_depth,
               'estimator__min_samples_split': min_samples_split,
               'estimator__min_samples_leaf': min_samples_leaf,
               'estimator__bootstrap': bootstrap}
print(random_grid)

{'estimator__n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 'estimator__max_features': ['sqrt', 'log2', 2, 3, None], 'estimator__max_depth': [10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, None], 'estimator__min_samples_split': [2, 5, 10, 14, 18, 20], 'estimator__min_samples_leaf': [1, 2, 4], 'estimator__bootstrap': [True, False]}


In [19]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = MultiOutputClassifier(RandomForestClassifier())
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=890)
# Fit the random search model
rf_random.fit(X_train_fp_2p_trans, y_train_fp_2p)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END estimator__bootstrap=True, estimator__max_depth=26, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10, estimator__n_estimators=350; total time=   3.7s
[CV] END estimator__bootstrap=True, estimator__max_depth=26, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10, estimator__n_estimators=350; total time=   3.5s
[CV] END estimator__bootstrap=True, estimator__max_depth=26, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10, estimator__n_estimators=350; total time=   3.5s
[CV] END estimator__bootstrap=False, estimator__max_depth=14, estimator__max_features=log2, estimator__min_samples_leaf=4, estimator__min_samples_split=14, estimator__n_estimators=250; total time=   2.5s
[CV] END estimator__bootstrap=False, estimator__max_depth=14, estimator__max_features=log2, estimator__min_samples_leaf=4, e

In [None]:
# View the best parameters from fitting the random search
rf_random.best_params_

{'estimator__n_estimators': 288,
 'estimator__min_samples_split': 2,
 'estimator__min_samples_leaf': 1,
 'estimator__max_features': 'log2',
 'estimator__max_depth': 50,
 'estimator__bootstrap': False}

In [23]:
# View the best parameters from fitting the random search
# Excluded features: pick order, kill count, attribute, job
rf_random.best_params_

{'estimator__n_estimators': 450,
 'estimator__min_samples_split': 2,
 'estimator__min_samples_leaf': 1,
 'estimator__max_features': 'log2',
 'estimator__max_depth': None,
 'estimator__bootstrap': False}

In [20]:
# View the best parameters from fitting the random search
# Included features: hero codes, prebans, win
rf_random.best_params_

{'estimator__n_estimators': 250,
 'estimator__min_samples_split': 14,
 'estimator__min_samples_leaf': 1,
 'estimator__max_features': None,
 'estimator__max_depth': 26,
 'estimator__bootstrap': True}

#### Training and Evaluating

In [78]:
# Using RandomForestClassifier for multioutput and evaluaing the result
classifier_fp_2p = MultiOutputClassifier(RandomForestClassifier(random_state=890, max_depth = 50,
                                                                n_estimators=288, min_samples_split = 2,
                                                                min_samples_leaf = 1, max_features = 'log2',
                                                                bootstrap = False))
classifier_fp_2p.fit(X_train_fp_2p_trans, y_train_fp_2p)
predictions_fp_2p = classifier_fp_2p.predict(X_test_fp_2p_trans)
classifier_fp_2p.score(X_test_fp_2p_trans, y_test_fp_2p)

0.21172309247094492

In [24]:
# Using RandomForestClassifier for multioutput and evaluaing the result
# Excluded features: pick order, kill count, attribute, job
classifier_fp_2p = MultiOutputClassifier(RandomForestClassifier(random_state=890, max_depth = None,
                                                                n_estimators=450, min_samples_split = 2,
                                                                min_samples_leaf = 1, max_features = 'log2',
                                                                bootstrap = False))
classifier_fp_2p.fit(X_train_fp_2p_trans, y_train_fp_2p)
predictions_fp_2p = classifier_fp_2p.predict(X_test_fp_2p_trans)
classifier_fp_2p.score(X_test_fp_2p_trans, y_test_fp_2p)

0.1753410813542193

In [22]:
# Using RandomForestClassifier for multioutput and evaluaing the result
# Included features: hero codes, prebans, win
classifier_fp_2p = MultiOutputClassifier(RandomForestClassifier(random_state=890, max_depth = 26,
                                                                n_estimators=250, min_samples_split = 14,
                                                                min_samples_leaf = 1, max_features = None,
                                                                bootstrap = True))
classifier_fp_2p.fit(X_train_fp_2p_trans, y_train_fp_2p)
predictions_fp_2p = classifier_fp_2p.predict(X_test_fp_2p_trans)
classifier_fp_2p.score(X_test_fp_2p_trans, y_test_fp_2p)

0.14098029307731177

In [26]:
X_train_fp_2p_trans.isna().sum()

left_attack_damage_1      0
left_attack_damage_2      0
left_attack_damage_3      0
left_receive_damage_1     0
left_receive_damage_2     0
left_receive_damage_3     0
left_kill_count_1         0
left_kill_count_2         0
left_kill_count_3         0
right_attack_damage_1     0
right_attack_damage_2     0
right_receive_damage_1    0
right_receive_damage_2    0
right_kill_count_1        0
right_kill_count_2        0
left_attribute_cd_1       0
left_hero_code_1          0
left_job_cd_1             0
right_hero_code_1         0
right_hero_code_2         0
right_attribute_cd_1      0
right_attribute_cd_2      0
right_job_cd_1            0
right_job_cd_2            0
left_preban_1             0
left_preban_2             0
right_preban_1            0
right_preban_2            0
left_pick_order_1         0
left_pick_order_2         0
left_pick_order_3         0
right_pick_order_1        0
right_pick_order_2        0
first_pick                0
is_win                    0
dtype: int64

In [31]:
# Class probabilities of test features
y_score = classifier_fp_2p.predict_proba(X_test_fp_2p_trans)
y_score

[array([[0.        , 0.16319444, 0.        , ..., 0.00347222, 0.        ,
         0.        ],
        [0.00347222, 0.02083333, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.04166667, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.00347222, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.05555556, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.02777778, 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 array([[0.        , 0.02083333, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.00347222, 0.00694444, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.00347222, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0. 

multiclass-multioutput is not supported:
1. binarizer
2. recall, accuracy, f1
3. confusion matrix
4. roc auc 
    

#### Saving Best Model

In [None]:
#saving the model
joblib.dump(classifier_fp_2p, './saved_files/classifier_fp_2p.joblib')

['classifier_fp_2p.joblib']

### Support Vector Machine

In [81]:
# Create the SVM
svm = SVC(random_state=42)

In [82]:
# Make it an Multilabel classifier
multilabel_classifier = MultiOutputClassifier(svm, n_jobs=-1)

In [83]:
# Fit the data to the Multilabel classifier
multilabel_classifier = multilabel_classifier.fit(X_train_fp_2p_trans, y_train_fp_2p)

In [89]:
# Get predictions for test data
predictions_fp_2p_svm = multilabel_classifier.predict(X_test_fp_2p_trans)

In [93]:
multilabel_classifier.score(X_test_fp_2p_trans, y_test_fp_2p)

0.01414855987872663

### Support Vector Machine Radial Basis Function (RBF)

In [None]:
# Random Search for alpha and c params
print('Fitting the classifier to the training set')
t0 = time.time()
param_grid = {
    'C': loguniform(1e3, 1e5),
    'gamma': loguniform(1e-4, 1e-1),
}
SVC_random = RandomizedSearchCV(
    SVC(kernel='rbf',
        #class_weight='balanced'
        ),
        param_grid, n_iter=10, cv = 5)
clf = clf.fit(X_train_fp_2p_trans, y_train_fp_2p)
print('done in %0.3fs' % (time.time() - t0))
print('Best estimator found by grid search:')
print(clf.best_estimator_)

In [120]:
# Create the SVM
svm = SVC(kernel='rbf', gamma=0.00047588699372380985, C=4815.839154019686, random_state= 890)

In [121]:
# Make it an Multilabel classifier
multilabel_classifier = MultiOutputClassifier(svm, n_jobs=-1)

In [124]:
# Fit the data to the Multilabel classifier
multilabel_classifier = multilabel_classifier.fit(X_train_fp_2p_trans, y_train_fp_2p)

In [None]:
# Get predictions for test data
predictions_fp_2p_svm = multilabel_classifier.predict(X_test_fp_2p_trans)

In [125]:
multilabel_classifier.score(X_test_fp_2p_trans, y_test_fp_2p)

0.08388074785245073

## ClassifierChain