In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

%config InlineBackend.figure_format = 'retina'

In [2]:
# Load the raw game table from the CSV shipped alongside the notebook
# (presumably one row per match, keyed by `gameId` -- confirm against the data source).
games = pd.read_csv('data/games.csv')
# Display the column labels so the available fields are visible before any filtering.
games.columns

Index(['gameId', 'creationTime', 'gameDuration', 'seasonId', 'winner',
       'firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron',
       'firstDragon', 'firstRiftHerald', 't1_champ1id', 't1_champ1_sum1',
       't1_champ1_sum2', 't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
       't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2', 't1_champ4id',
       't1_champ4_sum1', 't1_champ4_sum2', 't1_champ5id', 't1_champ5_sum1',
       't1_champ5_sum2', 't1_towerKills', 't1_inhibitorKills', 't1_baronKills',
       't1_dragonKills', 't1_riftHeraldKills', 't1_ban1', 't1_ban2', 't1_ban3',
       't1_ban4', 't1_ban5', 't2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2',
       't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2', 't2_champ3id',
       't2_champ3_sum1', 't2_champ3_sum2', 't2_champ4id', 't2_champ4_sum1',
       't2_champ4_sum2', 't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2',
       't2_towerKills', 't2_inhibitorKills', 't2_baronKills', 't2_dragonKills',
       't2_riftHer

We use `gameId` as the index. `creationTime` and `gameDuration` are considered irrelevant to prediction, so we remove them from the data set. All games are from season 9, so `seasonId` carries no information and is dropped as well.

In [3]:
# Get the data set ready for training.
#
# `gameId` becomes the index, and `creationTime`, `gameDuration` and `seasonId`
# are removed. Both steps are guarded so that re-running this cell on the
# already-prepared frame is a no-op instead of raising a KeyError.
if 'gameId' in games.columns:
    games = games.set_index('gameId')
games = games.drop(labels=['creationTime', 'gameDuration', 'seasonId'], axis=1, errors='ignore')

# Every remaining column is treated as categorical:
# - champion ids, bans and summoner spells are categorical by nature
# - the "first ..." columns indicate which team took that objective first
# - the kill counts are ordinal and are handled as categories as well
# DataFrame.astype converts column by column, so each column keeps its own
# set of categories.
games = games.astype('category')

games.columns

Index(['winner', 'firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron',
       'firstDragon', 'firstRiftHerald', 't1_champ1id', 't1_champ1_sum1',
       't1_champ1_sum2', 't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
       't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2', 't1_champ4id',
       't1_champ4_sum1', 't1_champ4_sum2', 't1_champ5id', 't1_champ5_sum1',
       't1_champ5_sum2', 't1_towerKills', 't1_inhibitorKills', 't1_baronKills',
       't1_dragonKills', 't1_riftHeraldKills', 't1_ban1', 't1_ban2', 't1_ban3',
       't1_ban4', 't1_ban5', 't2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2',
       't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2', 't2_champ3id',
       't2_champ3_sum1', 't2_champ3_sum2', 't2_champ4id', 't2_champ4_sum1',
       't2_champ4_sum2', 't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2',
       't2_towerKills', 't2_inhibitorKills', 't2_baronKills', 't2_dragonKills',
       't2_riftHeraldKills', 't2_ban1', 't2_ban2', 't2_ban3', 't2_ban4',
      

### Model Training

Pregame model: 

Predict the game result after the ban-pick phase, i.e. using only the picked champions, the banned champions and the summoner spells each champion brings to the game.

In [4]:
# Pre-game feature matrix. For each team the columns are the five
# (champion id, summoner spell 1, summoner spell 2) triples followed by the
# five bans; the order below is the order the models are trained on.
pre_x = games.loc[:, [
    # team 1: picks and summoner spells
    't1_champ1id', 't1_champ1_sum1', 't1_champ1_sum2',
    't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
    't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2',
    't1_champ4id', 't1_champ4_sum1', 't1_champ4_sum2',
    't1_champ5id', 't1_champ5_sum1', 't1_champ5_sum2',
    # team 1: bans
    't1_ban1', 't1_ban2', 't1_ban3', 't1_ban4', 't1_ban5',
    # team 2: picks and summoner spells
    't2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2',
    't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2',
    't2_champ3id', 't2_champ3_sum1', 't2_champ3_sum2',
    't2_champ4id', 't2_champ4_sum1', 't2_champ4_sum2',
    't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2',
    # team 2: bans
    't2_ban1', 't2_ban2', 't2_ban3', 't2_ban4', 't2_ban5',
]].copy()

# Prediction target: the `winner` column.
pre_y = games.loc[:, 'winner'].copy()

# Hold out 30% of the games for testing; the seed keeps the split repeatable.
pre_x_train, pre_x_test, pre_y_train, pre_y_test = train_test_split(
    pre_x, pre_y, test_size=0.3, random_state=42
)

# Show the number of rows in each partition.
tuple(len(part) for part in (pre_x_train, pre_x_test, pre_y_train, pre_y_test))

(36043, 15447, 36043, 15447)

In [5]:
# Decision tree on the pre-game features: grid-search `max_depth` on the
# training split, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same trees.
pre_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=4)
pre_dt.fit(pre_x_train, pre_y_train)
pre_best_dt = pre_dt.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(pre_y_test, pre_best_dt.predict(pre_x_test))

0.5122030167670097

In [6]:
# Random forest on the pre-game features: grid-search `max_depth` on the
# training split, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same forest.
pre_rf = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=4)
pre_rf.fit(pre_x_train, pre_y_train)
pre_best_rf = pre_rf.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(pre_y_test, pre_best_rf.predict(pre_x_test))

0.5208131028678707

In [7]:
# Linear classifier (SGD) on the pre-game features: grid-search the loss
# function and the regularisation strength `alpha`, then score the selected
# estimator on the held-out split.
# NOTE(review): recent scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
params = {'loss': ['hinge', 'log'], 'alpha': [0.0001, 0.001, 0.01, 0.1]}
# The estimator is seeded so repeated runs shuffle the data identically.
pre_lc = GridSearchCV(SGDClassifier(random_state=42), params, n_jobs=4)
pre_lc.fit(pre_x_train, pre_y_train)
pre_best_lc = pre_lc.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(pre_y_test, pre_best_lc.predict(pre_x_test))

0.5104551045510455

In [8]:
# Gradient boosting on the pre-game features (scikit-learn's
# GradientBoostingClassifier, despite the `xgb` in the variable names):
# grid-search `max_depth`, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same ensemble.
pre_xgb = GridSearchCV(GradientBoostingClassifier(random_state=42), params, n_jobs=4)
pre_xgb.fit(pre_x_train, pre_y_train)
pre_best_xgb = pre_xgb.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(pre_y_test, pre_best_xgb.predict(pre_x_test))

0.5257331520683628

In [28]:
# Persist the chosen pre-game model (the gradient-boosting estimator) as a pickle.
# The `models/` directory is expected to exist already.
pre_best_model = pre_best_xgb
with open('models/pre_game_model.pkl', mode='wb') as model_file:
    pickle.dump(pre_best_model, model_file)

Beginning of the game: 

Predict the game result from the ban-pick information plus the early objectives: which team got first blood, the first tower, the first Rift Herald and the first dragon. These usually all happen within the first 5 - 10 minutes of a game.

In [10]:
# Early-game feature matrix: the pick / summoner-spell / ban columns of both
# teams, followed by four "first objective" indicators. The order below is
# the order the models are trained on.
begin_x = games.loc[:, [
    # team 1: picks and summoner spells
    't1_champ1id', 't1_champ1_sum1', 't1_champ1_sum2',
    't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
    't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2',
    't1_champ4id', 't1_champ4_sum1', 't1_champ4_sum2',
    't1_champ5id', 't1_champ5_sum1', 't1_champ5_sum2',
    # team 1: bans
    't1_ban1', 't1_ban2', 't1_ban3', 't1_ban4', 't1_ban5',
    # team 2: picks and summoner spells
    't2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2',
    't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2',
    't2_champ3id', 't2_champ3_sum1', 't2_champ3_sum2',
    't2_champ4id', 't2_champ4_sum1', 't2_champ4_sum2',
    't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2',
    # team 2: bans
    't2_ban1', 't2_ban2', 't2_ban3', 't2_ban4', 't2_ban5',
    # which team took each early objective first
    'firstBlood', 'firstTower', 'firstDragon', 'firstRiftHerald',
]].copy()

# Prediction target: the `winner` column.
begin_y = games.loc[:, 'winner'].copy()

# Hold out 30% of the games for testing; the seed keeps the split repeatable.
begin_x_train, begin_x_test, begin_y_train, begin_y_test = train_test_split(
    begin_x, begin_y, test_size=0.3, random_state=42
)

# Show the number of rows in each partition.
tuple(len(part) for part in (begin_x_train, begin_x_test, begin_y_train, begin_y_test))

(36043, 15447, 36043, 15447)

In [11]:
# Decision tree on the early-game features: grid-search `max_depth` on the
# training split, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same trees.
begin_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=4)
begin_dt.fit(begin_x_train, begin_y_train)
begin_best_dt = begin_dt.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(begin_y_test, begin_best_dt.predict(begin_x_test))

0.7174855959085906

In [12]:
# Random forest on the early-game features: grid-search `max_depth` on the
# training split, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same forest.
begin_rf = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=4)
begin_rf.fit(begin_x_train, begin_y_train)
begin_best_rf = begin_rf.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(begin_y_test, begin_best_rf.predict(begin_x_test))

0.7248656697093286

In [13]:
# Linear classifier (SGD) on the early-game features: grid-search the loss
# function and the regularisation strength `alpha`, then score the selected
# estimator on the held-out split.
# NOTE(review): recent scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
params = {'loss': ['hinge', 'log'], 'alpha': [0.0001, 0.001, 0.01, 0.1]}
# The estimator is seeded so repeated runs shuffle the data identically.
begin_lc = GridSearchCV(SGDClassifier(random_state=42), params, n_jobs=4)
begin_lc.fit(begin_x_train, begin_y_train)
begin_best_lc = begin_lc.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(begin_y_test, begin_best_lc.predict(begin_x_test))

0.5959733281543341

In [14]:
# Gradient boosting on the early-game features (scikit-learn's
# GradientBoostingClassifier, despite the `xgb` in the variable names):
# grid-search `max_depth`, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same ensemble.
begin_xgb = GridSearchCV(GradientBoostingClassifier(random_state=42), params, n_jobs=4)
begin_xgb.fit(begin_x_train, begin_y_train)
begin_best_xgb = begin_xgb.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(begin_y_test, begin_best_xgb.predict(begin_x_test))

0.7271314818411342

In [15]:
# Persist the chosen early-game model (the gradient-boosting estimator) as a pickle.
# The `models/` directory is expected to exist already.
begin_best_model = begin_best_xgb
with open('models/begin_game_model.pkl', mode='wb') as model_file:
    pickle.dump(begin_best_model, model_file)

Midgame model:

Mid game usually refers to the part of the game after the 20-minute mark. At that point, we use all previously included information, plus the number of towers, dragons and Rift Heralds each team has taken so far, and which team got the first Baron and the first Inhibitor.

In [16]:
# Mid-game feature matrix: the pick / summoner-spell / ban columns of both
# teams, all six "first objective" indicators, and the per-team tower,
# dragon and Rift Herald kill counts. The order below is the order the
# models are trained on.
mid_x = games.loc[:, [
    # team 1: picks and summoner spells
    't1_champ1id', 't1_champ1_sum1', 't1_champ1_sum2',
    't1_champ2id', 't1_champ2_sum1', 't1_champ2_sum2',
    't1_champ3id', 't1_champ3_sum1', 't1_champ3_sum2',
    't1_champ4id', 't1_champ4_sum1', 't1_champ4_sum2',
    't1_champ5id', 't1_champ5_sum1', 't1_champ5_sum2',
    # team 1: bans
    't1_ban1', 't1_ban2', 't1_ban3', 't1_ban4', 't1_ban5',
    # team 2: picks and summoner spells
    't2_champ1id', 't2_champ1_sum1', 't2_champ1_sum2',
    't2_champ2id', 't2_champ2_sum1', 't2_champ2_sum2',
    't2_champ3id', 't2_champ3_sum1', 't2_champ3_sum2',
    't2_champ4id', 't2_champ4_sum1', 't2_champ4_sum2',
    't2_champ5id', 't2_champ5_sum1', 't2_champ5_sum2',
    # team 2: bans
    't2_ban1', 't2_ban2', 't2_ban3', 't2_ban4', 't2_ban5',
    # which team took each objective first
    'firstBlood', 'firstTower', 'firstInhibitor', 'firstBaron',
    'firstDragon', 'firstRiftHerald',
    # per-team objective counts
    't1_towerKills', 't1_dragonKills', 't1_riftHeraldKills',
    't2_towerKills', 't2_dragonKills', 't2_riftHeraldKills',
]].copy()

# Halve the tower and dragon counts, then store them as categories again.
# NOTE(review): `/` is true division, so odd counts become fractional values
# such as 2.5; if integer bins were intended this should be `// 2` -- confirm.
# NOTE(review): the Rift Herald counts are left un-halved -- confirm that is intended.
halved_cols = ['t1_towerKills', 't1_dragonKills', 't2_towerKills', 't2_dragonKills']
mid_x[halved_cols] = (mid_x[halved_cols].astype(int) / 2).astype('category')

# Prediction target: the `winner` column.
mid_y = games.loc[:, 'winner'].copy()

# Hold out 30% of the games for testing; the seed keeps the split repeatable.
mid_x_train, mid_x_test, mid_y_train, mid_y_test = train_test_split(
    mid_x, mid_y, test_size=0.3, random_state=42
)

# Show the number of rows in each partition.
tuple(len(part) for part in (mid_x_train, mid_x_test, mid_y_train, mid_y_test))

(36043, 15447, 36043, 15447)

In [17]:
# Decision tree on the mid-game features: grid-search `max_depth` on the
# training split, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same trees.
mid_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=4)
mid_dt.fit(mid_x_train, mid_y_train)
mid_best_dt = mid_dt.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(mid_y_test, mid_best_dt.predict(mid_x_test))

0.9617401437172266

In [18]:
# Random forest on the mid-game features: grid-search `max_depth` on the
# training split, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same forest.
mid_rf = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=4)
mid_rf.fit(mid_x_train, mid_y_train)
mid_best_rf = mid_rf.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(mid_y_test, mid_best_rf.predict(mid_x_test))

0.9653007056386353

In [19]:
# Linear classifier (SGD) on the mid-game features: grid-search the loss
# function and the regularisation strength `alpha`, then score the selected
# estimator on the held-out split.
# NOTE(review): recent scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
params = {'loss': ['hinge', 'log'], 'alpha': [0.0001, 0.001, 0.01, 0.1]}
# The estimator is seeded so repeated runs shuffle the data identically.
mid_lc = GridSearchCV(SGDClassifier(random_state=42), params, n_jobs=4)
mid_lc.fit(mid_x_train, mid_y_train)
mid_best_lc = mid_lc.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(mid_y_test, mid_best_lc.predict(mid_x_test))

0.9451673464103062

In [20]:
# Gradient boosting on the mid-game features (scikit-learn's
# GradientBoostingClassifier, despite the `xgb` in the variable names):
# grid-search `max_depth`, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same ensemble.
mid_xgb = GridSearchCV(GradientBoostingClassifier(random_state=42), params, n_jobs=4)
mid_xgb.fit(mid_x_train, mid_y_train)
mid_best_xgb = mid_xgb.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(mid_y_test, mid_best_xgb.predict(mid_x_test))

0.9665307179387583

In [21]:
# Persist the chosen mid-game model (the gradient-boosting estimator) as a pickle.
# The `models/` directory is expected to exist already.
mid_best_model = mid_best_xgb
with open('models/mid_game_model.pkl', mode='wb') as model_file:
    pickle.dump(mid_best_model, model_file)

Lategame model:

Estimate the game result with all information collected as the game is close to the end.

In [22]:
# Late-game data: work on a copy of the full prepared frame, using every
# column except `winner` as a feature and `winner` as the target.
post_game = games.copy()
post_y = post_game.loc[:, 'winner']
post_x = post_game.drop(labels='winner', axis=1)

# Hold out 30% of the games for testing; the seed keeps the split repeatable.
post_x_train, post_x_test, post_y_train, post_y_test = train_test_split(
    post_x, post_y, test_size=0.3, random_state=42
)

# Show the number of rows in each partition.
tuple(len(part) for part in (post_x_train, post_x_test, post_y_train, post_y_test))

(36043, 15447, 36043, 15447)

In [23]:
# Decision tree on the late-game features: use grid search with
# cross-validation to pick `max_depth` on the training split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same trees.
post_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=4)
post_dt.fit(post_x_train, post_y_train)
post_best_dt = post_dt.best_estimator_
# Accuracy of the selected estimator on the held-out split;
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(post_y_test, post_best_dt.predict(post_x_test))

0.9653007056386353

In [24]:
# Random forest on the late-game features: grid-search `max_depth` on the
# training split, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same forest.
post_rf = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=4)
post_rf.fit(post_x_train, post_y_train)
post_best_rf = post_rf.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(post_y_test, post_best_rf.predict(post_x_test))

0.9711270796918495

In [25]:
# Linear classifier (SGD) on the late-game features: grid-search the loss
# function and the regularisation strength `alpha`, then score the selected
# estimator on the held-out split.
# NOTE(review): recent scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
params = {'loss': ['hinge', 'log'], 'alpha': [0.0001, 0.001, 0.01, 0.1]}
# The estimator is seeded so repeated runs shuffle the data identically.
post_lc = GridSearchCV(SGDClassifier(random_state=42), params, n_jobs=4)
post_lc.fit(post_x_train, post_y_train)
post_best_lc = post_lc.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(post_y_test, post_best_lc.predict(post_x_test))

0.9590859066485402

In [47]:
# Gradient boosting on the late-game features (scikit-learn's
# GradientBoostingClassifier, despite the `xgb` in the variable names):
# grid-search `max_depth`, then score the selected estimator on the held-out split.
params = {'max_depth': range(4, 30, 2)}
# The estimator is seeded so repeated runs build the same ensemble.
post_xgb = GridSearchCV(GradientBoostingClassifier(random_state=42), params, n_jobs=4)
post_xgb.fit(post_x_train, post_y_train)
post_best_xgb = post_xgb.best_estimator_
# accuracy_score takes (y_true, y_pred), in that order.
accuracy_score(post_y_test, post_best_xgb.predict(post_x_test))

0.9709976047128892

In [None]:
# Persist the chosen late-game model (the gradient-boosting estimator) as a pickle.
# This cell needs `post_best_xgb` from the gradient-boosting cell above, so
# that cell must have been run in the current kernel session first.
# The `models/` directory is expected to exist already.
post_best_model = post_best_xgb
with open('models/post_game_model.pkl', mode='wb') as model_file:
    pickle.dump(post_best_model, model_file)

NameError: name 'post_best_xgb' is not defined