In [1]:
# Numerical, tabular and static-plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline

# Plotly, used through its offline notebook interface.
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import warnings
# NOTE(review): this silences *every* warning from here on, including any
# raised while importing or running the scikit-learn code below.
warnings.filterwarnings("ignore")
# Initialise plotly's offline notebook mode before any iplot() call.
init_notebook_mode(connected=True)

# Model-selection helpers, the random-forest classifier and the ROC-AUC metric.
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Used to build the paths to the data files.
import os

In [2]:
# Random seed, fixed so that runs are reproducible.
SEED = 17
# Directory that holds the CSV files, relative to this notebook.
PATH_TO_DATA = '../../data/dota_2/'

In [3]:
# Locations of the training features and training targets.
train_features_path = os.path.join(PATH_TO_DATA, 'train_features.csv')
train_targets_path = os.path.join(PATH_TO_DATA, 'train_targets.csv')

# Both tables use the match_id_hash column as their row index.
df_train_features = pd.read_csv(train_features_path, index_col='match_id_hash')
df_train_targets = pd.read_csv(train_targets_path, index_col='match_id_hash')

In [8]:
# Preview the first five rows (five is the default for head()).
df_train_features.head()

Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,d5_stuns,d5_creeps_stacked,d5_camps_stacked,d5_rune_pickups,d5_firstblood_claimed,d5_teamfight_participation,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,7,...,0.0,0,0,0,0,0.0,0,0,0,0
6db558535151ea18ca70a6892197db41,21,23,0,0,0,101,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,576,22,7,1,4,14,1,0,3,1,...,8.664527,3,1,3,0,0.0,0,0,2,0
b1b35ff97723d9b7ade1c9c3cf48f770,453,22,7,1,3,42,0,1,1,0,...,0.0,2,1,2,0,0.25,0,0,0,0


In [9]:
# KFold is already imported in the first cell; only ShuffleSplit is new here.
from sklearn.model_selection import ShuffleSplit

# Five random 70/30 train/validation splits. The splitter is seeded with the
# notebook-wide SEED (instead of a duplicated literal) so that changing SEED
# in the config cell changes the splits as well as the model.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=SEED)

In [10]:
# `.values` discards the index, so features and targets are paired purely by
# row position. Fail loudly if the two tables are not aligned on
# match_id_hash rather than silently training on mismatched labels.
assert df_train_features.index.equals(df_train_targets.index), \
    "train features and targets are not aligned on match_id_hash"

# Plain NumPy feature matrix and target vector for scikit-learn.
X = df_train_features.values
y = df_train_targets['radiant_win'].values

In [11]:
# Baseline random forest: 100 trees, no depth limit, n_jobs=-1, seeded with SEED.
rf_params = dict(
    n_estimators=100,
    n_jobs=-1,
    max_depth=None,
    random_state=SEED,
)
model_rf = RandomForestClassifier(**rf_params)

# Calculate ROC-AUC for each split produced by `cv`.
cv_scores_rf = cross_val_score(model_rf, X, y, scoring='roc_auc', cv=cv)

In [12]:
# Per-split ROC-AUC scores on the first line, their mean on the second.
print(cv_scores_rf, cv_scores_rf.mean(), sep='\n')

[0.77723183 0.78656801 0.78004904 0.77732062 0.77669984]
0.7795738699757455


In [13]:
# Test features, indexed by match_id_hash like the training tables.
test_features_path = os.path.join(PATH_TO_DATA, 'test_features.csv')
df_test_features = pd.read_csv(test_features_path, index_col='match_id_hash')

In [14]:
# Train the forest on the complete training arrays.
model_rf.fit(X=X, y=y)
# Disabled: writing the baseline submission file. Kept for reference only.
#df_submission_base = pd.DataFrame(
#    {'radiant_win_prob': model_rf.predict_proba(df_test_features.values)[:, 1]}, 
#    index=df_test_features.index,)
#
#df_submission_base.to_csv('submission_base_rf.csv')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [16]:
# Column labels of the training features, in column order, as a plain list.
all_feature_names = df_train_features.columns.tolist()

In [21]:
# Fit the forest on the pandas objects themselves (not the NumPy copies).
train_labels = df_train_targets['radiant_win']
model_rf.fit(X=df_train_features, y=train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [22]:
import eli5
from IPython.display import display_html

In [24]:
# Build eli5's weight table for the forest (limited to the top 30 entries),
# labelled with the training column names, then render it as HTML.
rf_weights_html = eli5.show_weights(
    estimator=model_rf,
    feature_names=all_feature_names,
    top=30,
)
display_html(rf_weights_html)

Weight,Feature
0.0134  ± 0.0193,r2_y
0.0121  ± 0.0204,r1_y
0.0121  ± 0.0173,d1_x
0.0113  ± 0.0184,r3_y
0.0112  ± 0.0151,d1_y
0.0111  ± 0.0160,r5_y
0.0110  ± 0.0140,d3_y
0.0110  ± 0.0145,r4_y
0.0109  ± 0.0149,d5_y
0.0109  ± 0.0152,r5_x
