In [1]:
import pandas as pd

%matplotlib inline

In [2]:
players = pd.read_csv('players.csv')
matches = pd.read_csv('match.csv')
heroes = pd.read_csv('hero_names.csv')
items = pd.read_csv('item_ids.csv')
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189 entries, 0 to 188
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   item_id    189 non-null    int64 
 1   item_name  189 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.1+ KB


In [3]:
hero_lookup = dict(zip(heroes['hero_id'], heroes['localized_name']))
hero_lookup[0] = 'Unknown'
players['hero'] = players['hero_id'].apply(lambda _id: hero_lookup[_id])

In [4]:
item_lookup = dict(zip(items['item_id'], items['item_name']))
item_lookup[0] = 'Unknown'

def find_item(_id):
    return item_lookup.get(_id, 'u_' + str(_id))

players['item_0'] = players['item_0'].apply(find_item)
players['item_1'] = players['item_1'].apply(find_item)
players['item_2'] = players['item_2'].apply(find_item)
players['item_3'] = players['item_3'].apply(find_item)
players['item_4'] = players['item_4'].apply(find_item)
players['item_5'] = players['item_5'].apply(find_item)

In [5]:
player_heroes = pd.get_dummies(players['hero'])
player_heroes

Unnamed: 0,Abaddon,Alchemist,Ancient Apparition,Anti-Mage,Axe,Bane,Batrider,Beastmaster,Bloodseeker,Bounty Hunter,...,Venomancer,Viper,Visage,Warlock,Weaver,Windranger,Winter Wyvern,Witch Doctor,Wraith King,Zeus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
499996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
499997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
499998,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
item0 = pd.get_dummies(players['item_0'].fillna(0))
item1 = pd.get_dummies(players['item_1'].fillna(0))
item2 = pd.get_dummies(players['item_2'].fillna(0))
item3 = pd.get_dummies(players['item_3'].fillna(0))
item4 = pd.get_dummies(players['item_4'].fillna(0))
item5 = pd.get_dummies(players['item_5'].fillna(0))

In [7]:
player_items = item0 \
    .add(item1, fill_value=0) \
    .add(item2, fill_value=0) \
    .add(item3, fill_value=0) \
    .add(item4, fill_value=0) \
    .add(item5, fill_value=0)


In [8]:
radiant_cols = list(map(lambda s: 'radiant_' + s, player_heroes.columns.values))
dire_cols = list(map(lambda s: 'dire_' + s, player_heroes.columns.values))

In [9]:
radiant_items_cols = list(map(lambda s: 'radiant_' + str(s), player_items.columns.values))
dire_items_cols = list(map(lambda s: 'dire_' + str(s), player_items.columns.values))

In [10]:
from os.path import isfile

X = None

radiant_heroes = []
dire_heroes = []
radiant_items = []
dire_items = []

for _id, _index in players.groupby('match_id').groups.items():
    radiant_heroes.append(player_heroes.iloc[_index][:5].sum().values)
    dire_heroes.append(player_heroes.iloc[_index][5:].sum().values)
    radiant_items.append(player_items.iloc[_index][:5].sum().values)
    dire_items.append(player_items.iloc[_index][5:].sum().values)

radiant_heroes = pd.DataFrame(radiant_heroes, columns=radiant_cols)
dire_heroes = pd.DataFrame(dire_heroes, columns=dire_cols)
radiant_items = pd.DataFrame(radiant_items, columns=radiant_items_cols)
dire_items = pd.DataFrame(dire_items, columns=dire_items_cols)
X = pd.concat([radiant_heroes, dire_heroes], axis=1)
X_full = pd.concat([radiant_heroes, radiant_items, dire_heroes, dire_items], axis=1)
# X.to_csv('mapped_match_hero_item.csv', index=False)

In [30]:
X.head()

Unnamed: 0,radiant_Abaddon,radiant_Alchemist,radiant_Ancient Apparition,radiant_Anti-Mage,radiant_Axe,radiant_Bane,radiant_Batrider,radiant_Beastmaster,radiant_Bloodseeker,radiant_Bounty Hunter,...,dire_Venomancer,dire_Viper,dire_Visage,dire_Warlock,dire_Weaver,dire_Windranger,dire_Winter Wyvern,dire_Witch Doctor,dire_Wraith King,dire_Zeus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [12]:
y = matches['radiant_win'].apply(lambda win: 1 if win else 0)
classes = ['Dire Win', 'Radiant Win']

In [13]:
_ = pd.Series(y).apply(lambda i: classes[i]).value_counts()

In [77]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.1)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(45000, 222) (45000,)
(5000, 222) (5000,)


In [69]:
import io
from scipy import misc
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

In [71]:
#Models

DTC = DecisionTreeClassifier()
LR = LogisticRegression()
SGD = SGDClassifier()
SVC = LinearSVC()
GNB = GaussianNB()
XGB = XGBClassifier()
RFC = RandomForestClassifier()

# Decision Tree

In [108]:
param_grid = {"max_depth":[10,20,30], "min_samples_leaf":[1,2,3]}

gcv = GridSearchCV(DecisionTreeClassifier(),param_grid,cv=5)
gcv.fit(X_train, y_train)

print(gcv.best_params_)
y_pred = gcv.best_estimator_.predict(X_test)
print('Accuracy ',accuracy_score(y_test, y_pred))
print('Precision ',precision_score(y_test, y_pred))
print('Recall ',recall_score(y_test, y_pred))

{'max_depth': 20, 'min_samples_leaf': 1}
Accuracy  0.5576
Precision  0.5714743589743589
Recall  0.6708051166290444


# Logistic Regression

In [107]:
param_grid = {"C":[1,5,10]}

gcv = GridSearchCV(LogisticRegression(),param_grid,cv=3)
gcv.fit(X_train, y_train)

print(gcv.best_params_)
y_pred = gcv.best_estimator_.predict(X_test)
print('Accuracy ',accuracy_score(y_test, y_pred))
print('Precision ',precision_score(y_test, y_pred))
print('Recall ',recall_score(y_test, y_pred))

{'C': 5}
Accuracy  0.5948
Precision  0.6127765881513205
Recall  0.6459744168547781


# SVM using Stochastic Gradient Descent 

In [106]:
param_grid = {"loss" :["hinge","log"], "alpha":[0.0001, 0.001, 0.005]}

gcv = GridSearchCV(SGDClassifier(),param_grid,cv=3)
gcv.fit(X_train, y_train)

print(gcv.best_params_)
y_pred = gcv.best_estimator_.predict(X_test)
print('Accuracy ',accuracy_score(y_test, y_pred))
print('Precision ',precision_score(y_test, y_pred))
print('Recall ',recall_score(y_test, y_pred))

{'alpha': 0.001, 'loss': 'hinge'}
Accuracy  0.5964
Precision  0.6204819277108434
Recall  0.6200150489089541


# Linear Support Vector Classification

In [104]:
param_grid = {"loss" :["hinge","squared_hinge"], "C":[1, 5, 10]}

gcv = GridSearchCV(LinearSVC(),param_grid,cv=5)
gcv.fit(X_train, y_train)

print(gcv.best_params_)
y_pred = gcv.best_estimator_.predict(X_test)




{'C': 1, 'loss': 'squared_hinge'}


In [105]:
print('Accuracy ',accuracy_score(y_test, y_pred))
print('Precision ',precision_score(y_test, y_pred))
print('Recall ',recall_score(y_test, y_pred))

Accuracy  0.5946
Precision  0.6127188281529118
Recall  0.6452219714070729


# Gaussian Naive Bayes

In [103]:
param_grid = {"var_smoothing" :[1e-9, 1e-5, 1, 10, 100]}

gcv = GridSearchCV(GaussianNB(),param_grid,cv=5)
gcv.fit(X_train, y_train)

print(gcv.best_params_)
y_pred = gcv.best_estimator_.predict(X_test)
print('Accuracy ',accuracy_score(y_test, y_pred))
print('Precision ',precision_score(y_test, y_pred))
print('Recall ',recall_score(y_test, y_pred))

{'var_smoothing': 1}
Accuracy  0.5988
Precision  0.6069553805774278
Recall  0.6960120391271633


# XGB Classifier

In [97]:
gcv = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
gcv.fit(X_train, y_train)

y_pred = gcv.predict(X_test)
accuracy_score(y_test, y_pred)

0.5924

In [98]:
gcv

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [99]:
print('Accuracy ',accuracy_score(y_test, y_pred))
print('Precision ',precision_score(y_test, y_pred))
print('Recall ',recall_score(y_test, y_pred))

Accuracy  0.5924
Precision  0.6115107913669064
Recall  0.6395786305492852


# Random Forest

In [100]:
param_grid = {"n_estimators" :[100, 200], "max_depth": [None, 8, 16]}

gcv = GridSearchCV(RandomForestClassifier(),param_grid,cv=5)
gcv.fit(X_train, y_train)

print(gcv.best_params_)
y_pred = gcv.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred)

{'max_depth': 16, 'n_estimators': 200}


0.5836

In [101]:
RFC = RandomForestClassifier(n_estimators=500, max_depth=32)
RFC.fit(X_train, y_train)


RandomForestClassifier(max_depth=32, n_estimators=500)

In [102]:
y_pred = RFC.predict(X_test)
print('Accuracy ',accuracy_score(y_test, y_pred))
print('Precision ',precision_score(y_test, y_pred))
print('Recall ',recall_score(y_test, y_pred))


Accuracy  0.5934
Precision  0.6026272577996716
Recall  0.6903686982693755


# Results

<img src="mlresults-heroes.png">