# Notebook Imports

In [None]:
import os
import statistics
import time
from statistics import mode

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

<a id='sp1p_link'></a>
### Second Pick - Phase 1

In [None]:
# Read the prepared dataset from disk.
# NOTE: read_pickle executes whatever the pickle contains; only load trusted files.
df_reshaped = pd.read_pickle('./data_final.pickle')

# Feature columns for the 1st iteration (categoricals are ordinal-encoded later).
# 'left_postban' / 'right_postban' are currently not part of the feature set.
sp_1p_feature_cols = [
    'left_pick_order_1', 'left_pick_order_2',
    'right_pick_order_1',
    'left_attack_damage_1', 'left_attack_damage_2',
    'left_receive_damage_1', 'left_receive_damage_2',
    'left_kill_count_1', 'left_kill_count_2',
    'right_attack_damage_1',
    'right_receive_damage_1',
    'right_kill_count_1',
    'left_attribute_cd_1', 'left_attribute_cd_2',
    'left_job_cd_1', 'left_job_cd_2',
    'right_hero_code_1',
    'right_attribute_cd_1',
    'right_job_cd_1',
    'left_preban_1', 'left_preban_2',
    'right_preban_1', 'right_preban_2',
    'first_pick', 'is_win',
]

# Target columns: the two left-side hero codes.
sp_1p_target_cols = ['left_hero_code_1', 'left_hero_code_2']

sp_1p_features = df_reshaped[sp_1p_feature_cols]
sp_1p_target = df_reshaped[sp_1p_target_cols]

# 80/20 shuffled split with a fixed seed.
(X_train_sp_1p, X_test_sp_1p,
 y_train_sp_1p, y_test_sp_1p) = train_test_split(
    sp_1p_features,
    sp_1p_target,
    test_size=0.2,
    shuffle=True,
    random_state=890,
)

# Numeric columns that get standardised before modelling.
sp_1p_numeric = [
    'left_attack_damage_1', 'left_attack_damage_2',
    'left_receive_damage_1', 'left_receive_damage_2',
    'left_kill_count_1', 'left_kill_count_2',
    'right_attack_damage_1',
    'right_receive_damage_1',
    'right_kill_count_1',
]


In [None]:
# Fit the standardiser on the numeric columns of the training split only.
sp_1p_scaler = StandardScaler().fit(X_train_sp_1p[sp_1p_numeric])
sp_1p_scaler

In [None]:
# Standardise the numeric columns of both splits with the already-fitted scaler.
# The frames are built from the transformed arrays, so each one starts with a
# fresh 0..n-1 index rather than the index of the raw split.
X_train_sp_1p_trans = pd.DataFrame(
    sp_1p_scaler.transform(X_train_sp_1p[sp_1p_numeric]),
    columns=sp_1p_numeric,
)
X_test_sp_1p_trans = pd.DataFrame(
    sp_1p_scaler.transform(X_test_sp_1p[sp_1p_numeric]),
    columns=sp_1p_numeric,
)

In [None]:
# Categorical columns for the 1st iteration
# ('left_postban' / 'right_postban' are currently not used).
sp_1p_categorical = [
    'left_attribute_cd_1', 'left_attribute_cd_2',
    'left_job_cd_1', 'left_job_cd_2',
    'right_hero_code_1',
    'right_attribute_cd_1',
    'right_job_cd_1',
    'left_preban_1', 'left_preban_2',
    'right_preban_1', 'right_preban_2',
]

# Ordinal encoder; categories not seen while fitting are encoded as -1.
sp_1p_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Learn the category mapping from the training split, then reuse it on the test split.
train_cat_codes = sp_1p_enc.fit_transform(X_train_sp_1p[sp_1p_categorical].to_numpy())
test_cat_codes = sp_1p_enc.transform(X_test_sp_1p[sp_1p_categorical].to_numpy())

X_train_sp_1p_trans[sp_1p_categorical] = train_cat_codes
X_test_sp_1p_trans[sp_1p_categorical] = test_cat_codes

In [None]:
# Show both indexes together: a notebook cell only renders its last expression,
# so two separate lines would silently drop the training index.
X_train_sp_1p_trans.index, X_test_sp_1p_trans.index

RangeIndex(start=0, stop=1979, step=1)

In [None]:
# Show the index of both raw splits together (only the last expression of a
# cell is rendered, so they are returned as one tuple).
X_train_sp_1p.index, X_test_sp_1p.index

RangeIndex(start=0, stop=1979, step=1)

In [None]:
# Per-column NaN counts for the raw and the transformed test split.
# Returned as one tuple because only the last expression of a cell is rendered.
X_test_sp_1p.isna().sum(), X_test_sp_1p_trans.isna().sum()

left_attack_damage_1      0
left_attack_damage_2      0
left_receive_damage_1     0
left_receive_damage_2     0
left_kill_count_1         0
left_kill_count_2         0
right_attack_damage_1     0
right_receive_damage_1    0
right_kill_count_1        0
left_attribute_cd_1       0
left_attribute_cd_2       0
left_job_cd_1             0
left_job_cd_2             0
right_hero_code_1         0
right_attribute_cd_1      0
right_job_cd_1            0
left_preban_1             0
left_preban_2             0
right_preban_1            0
right_preban_2            0
dtype: int64

In [None]:
# (rows, columns) of the transformed test split built so far.
X_test_sp_1p_trans.shape

(1979, 20)

In [None]:
# (rows, columns) of the raw test split, for comparison with the transformed frame.
X_test_sp_1p.shape

(1979, 25)

In [None]:
# Add the features that are neither scaled nor encoded.
l0f_sp_1p = ['left_pick_order_1', 'left_pick_order_2',
             'right_pick_order_1',
             'first_pick', 'is_win']

# Copy the values by position, not by index label.  The transformed frames use a
# fresh 0..n-1 index, whereas the raw splits keep the shuffled row labels that
# train_test_split hands back; assigning a DataFrame/Series directly would align
# on those labels and pair rows with the wrong samples (or fill them with NaN).
# Going through to_numpy() per column keeps row order and each column's dtype.
for col in l0f_sp_1p:
    X_train_sp_1p_trans[col] = X_train_sp_1p[col].to_numpy()
    X_test_sp_1p_trans[col] = X_test_sp_1p[col].to_numpy()

In [None]:
# Inspect the pass-through columns that were just added to the test frame.
X_test_sp_1p_trans[l0f_sp_1p]

Unnamed: 0,left_pick_order_1,left_pick_order_2,right_pick_order_1,first_pick,is_win
0,1,2,1,0,2
1,1,2,1,0,2
2,1,2,1,1,2
3,1,2,1,0,1
4,1,2,1,0,2
...,...,...,...,...,...
1974,1,2,1,0,2
1975,1,2,1,0,1
1976,1,2,1,0,2
1977,1,2,1,1,1


<a id='sp1p_link'></a>
### Tuning Hyperparameters for the Second Pick - Phase 1

In [None]:
# Per-column NaN counts of the complete transformed test frame.
X_test_sp_1p_trans.isna().sum()

left_attack_damage_1      0
left_attack_damage_2      0
left_receive_damage_1     0
left_receive_damage_2     0
left_kill_count_1         0
left_kill_count_2         0
right_attack_damage_1     0
right_receive_damage_1    0
right_kill_count_1        0
left_attribute_cd_1       0
left_attribute_cd_2       0
left_job_cd_1             0
left_job_cd_2             0
right_hero_code_1         0
right_attribute_cd_1      0
right_job_cd_1            0
left_preban_1             0
left_preban_2             0
right_preban_1            0
right_preban_2            0
left_pick_order_1         0
left_pick_order_2         0
right_pick_order_1        0
first_pick                0
is_win                    0
dtype: int64

In [None]:
# Copy of the raw training features (X_train_sp_1p is the split as produced by
# train_test_split, i.e. before scaling / ordinal encoding), kept so the
# original values can be looked up next to their transformed counterparts.
new_X_train_sp_1p = X_train_sp_1p.copy()

In [None]:
# Column names (and their order) of the raw training split.
X_train_sp_1p.columns

Index(['left_pick_order_1', 'left_pick_order_2', 'right_pick_order_1',
       'left_attack_damage_1', 'left_attack_damage_2', 'left_receive_damage_1',
       'left_receive_damage_2', 'left_kill_count_1', 'left_kill_count_2',
       'right_attack_damage_1', 'right_receive_damage_1', 'right_kill_count_1',
       'left_attribute_cd_1', 'left_attribute_cd_2', 'left_job_cd_1',
       'left_job_cd_2', 'right_hero_code_1', 'right_attribute_cd_1',
       'right_job_cd_1', 'left_preban_1', 'left_preban_2', 'right_preban_1',
       'right_preban_2', 'first_pick', 'is_win'],
      dtype='object')

In [None]:
# Rename the columns of the copied data so it can be concatenated with the
# transformed data without name clashes: every raw column gets an "_orig" suffix.
# The names are derived from X_train_sp_1p (which is never renamed), so the cell
# stays in sync with the feature list and can be re-run without stacking suffixes.
new_X_train_sp_1p.columns = [f'{col}_orig' for col in X_train_sp_1p.columns]

In [None]:
# Number of renamed raw columns.
len(new_X_train_sp_1p.columns)

25

In [None]:
# Concatenate the transformed features with the raw "_orig" columns, side by
# side, for later use as a lookup table.  The raw frame's index is dropped first
# so its rows are matched to the transformed frame by position.
orig_side = new_X_train_sp_1p.reset_index(drop=True)
dict_sp_1p = pd.concat([X_train_sp_1p_trans, orig_side], axis=1).copy()
dict_sp_1p.shape[1]

50

In [None]:
# Encoded value next to the original value for one categorical column.
dict_sp_1p[['right_hero_code_1', 'right_hero_code_1_orig']]

Unnamed: 0,right_hero_code_1,right_hero_code_1_orig
0,3.0,Ambitious Tywin
1,30.0,Laia
2,15.0,Death Dealer Ray
3,30.0,Laia
4,41.0,New Moon Luna
...,...,...
7908,17.0,Dragon Bride Senya
7909,41.0,New Moon Luna
7910,41.0,New Moon Luna
7911,30.0,Laia


<a id='sp1p_link'></a>
### Tuning Hyperparameters for the Second Pick - Phase 1

In [None]:
# RandomizedSearchCV parameters
# Every key carries the `estimator__` prefix because the random forest is wrapped
# in a MultiOutputClassifier and its parameters are addressed through the
# wrapper's `estimator` attribute.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = no depth limit)
max_depth = [int(x) for x in np.linspace(10, 50, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'estimator__n_estimators': n_estimators,
               'estimator__max_features': max_features,
               'estimator__max_depth': max_depth,
               'estimator__min_samples_split': min_samples_split,
               'estimator__min_samples_leaf': min_samples_leaf,
               'estimator__bootstrap': bootstrap}
print(random_grid)


{'estimator__n_estimators': [200, 288, 377, 466, 555, 644, 733, 822, 911, 1000], 'estimator__max_features': ['sqrt', 'log2'], 'estimator__max_depth': [10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, None], 'estimator__min_samples_split': [2, 5, 10], 'estimator__min_samples_leaf': [1, 2, 4], 'estimator__bootstrap': [True, False]}


{'bootstrap': [True, False],
 'max_depth': [10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, None],
 'max_features': ['sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 288, 377, 466, 555, 644, 733, 822, 911, 1000]}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune.  The forest is seeded so that differences
# between candidates come from the hyperparameters rather than from random
# variation, and so that the search can be reproduced.
rf = MultiOutputClassifier(RandomForestClassifier(random_state=890))
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
# (n_jobs=-1; without it the search runs on a single core).
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=890, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_sp_1p_trans, y_train_sp_1p)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END estimator__bootstrap=True, estimator__max_depth=38, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=5, estimator__n_estimators=377; total time=   4.9s
[CV] END estimator__bootstrap=True, estimator__max_depth=38, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=5, estimator__n_estimators=377; total time=   4.8s
[CV] END estimator__bootstrap=True, estimator__max_depth=38, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=5, estimator__n_estimators=377; total time=   5.0s
[CV] END estimator__bootstrap=True, estimator__max_depth=46, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimator__min_samples_split=5, estimator__n_estimators=466; total time=   5.9s
[CV] END estimator__bootstrap=True, estimator__max_depth=46, estimator__max_features=sqrt, estimator__min_samples_leaf=4, estimat

In [None]:
# View the best parameter combination found by the fitted random search
# (keys keep the `estimator__` prefix used in the search grid).
rf_random.best_params_

{'estimator__n_estimators': 555,
 'estimator__min_samples_split': 2,
 'estimator__min_samples_leaf': 1,
 'estimator__max_features': 'sqrt',
 'estimator__max_depth': 42,
 'estimator__bootstrap': False}

In [None]:
# Final model: a multi-output random forest trained on the scaled /
# ordinal-encoded features, with fixed hyperparameters and a fixed seed.
sp_1p_rf_params = dict(
    random_state=890,
    max_depth=42,
    n_estimators=555,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
)
classifier_sp_1p = MultiOutputClassifier(RandomForestClassifier(**sp_1p_rf_params))
classifier_sp_1p.fit(X_train_sp_1p_trans, y_train_sp_1p)

# Predictions for the held-out split, kept for later inspection.
sp_1p_predictions = classifier_sp_1p.predict(X_test_sp_1p_trans)

# Score on the held-out split (rendered as the cell output).
classifier_sp_1p.score(X_test_sp_1p_trans, y_test_sp_1p)

0.786255684689237

In [None]:
# Saving the model.  Create the target folder first so the dump cannot fail with
# FileNotFoundError after the (long) training run when ./saved_files is missing.
os.makedirs('./saved_files', exist_ok=True)
joblib.dump(classifier_sp_1p, './saved_files/classifier_sp_1p.joblib')

['classifier_sp_1p.joblib']