#### Prediction of victory based on data from the first 5 minutes of the game

In [2]:
import datetime
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv('data/features.zip', index_col='match_id')

In [4]:
features = df[df.columns[:-6]]
# features with missing values:
features.columns[features.count() < features.count().max()]

Index(['first_blood_time', 'first_blood_team', 'first_blood_player1',
       'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time',
       'radiant_flying_courier_time', 'radiant_first_ward_time',
       'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time',
       'dire_first_ward_time'],
      dtype='object')

In [5]:
features = features.fillna(value=0)
X = features.values
y = df['radiant_win']
kf = KFold(shuffle=True, random_state=0)

In [5]:
# GradientBoosting

scores = {}
initial_time = datetime.datetime.now()
for n in range(10, 50, 10):
    if n == 30:
        start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=n, random_state=0)
    scores[n] = cross_val_score(clf, X, y, cv=kf, scoring='roc_auc').mean()
    if n == 30:
        print('30 trees. Time elapsed:', datetime.datetime.now() - start_time)

print('Total time elapsed:', datetime.datetime.now() - initial_time)
print('\nQuality estimates:')
scores

30 trees. Time elapsed: 0:05:29.813855
Total time elapsed: 0:18:13.872373

Quality estimates:


{10: 0.6637173759568478,
 20: 0.681722162516617,
 30: 0.688614862673372,
 40: 0.6934400126999827}

1. For features 'first_blood_time' and 'first_blood_team' omissions can mean that the "first blood" event didn't happen in the first 5 minutes.

2. Column 'radiant_win' contains target variable.

3. Cross-validation for gradient boosting with 30 trees took over 5 min at quality 0.6886.

4. When the number of trees increased to 40, the quality increased slightly. To speed up learning when increasing the number of trees, you can either use subset of objects, or decrease the depth of the trees.

In [9]:
# LogisticRegression
# cross-validation with all features

X_lr = StandardScaler().fit_transform(features)
scores = {}
score_time = []
for n in np.geomspace(1e-4, 1e+2, 121):
    start_time = datetime.datetime.now()
    clf = LogisticRegression(C=n, random_state=0)
    scores[n] = cross_val_score(clf, X_lr, y, cv=kf, scoring='roc_auc').mean()
    score_time.append(datetime.datetime.now() - start_time)

In [10]:
# best regularization parameter C, maximum computation time

C = max(scores, key=scores.get)
f'C = {C}, score = {scores[C]}, max running time = {max(score_time)}'

'C = 0.004466835921509635, score = 0.7162726606305722, max running time = 0:00:07.502003'

In [11]:
# cross-validation without categorical features

less_features = features.copy()
less_features.pop('lobby_type')
[less_features.pop(label) for label in features.columns if 'hero' in label]

X_less = StandardScaler().fit_transform(less_features)
scores_less = {}
for n in np.geomspace(1e-4, 1e+2, 121):
    clf = LogisticRegression(C=n, random_state=0)
    scores_less[n] = cross_val_score(clf, X_less, y, cv=kf, scoring='roc_auc').mean()

C = max(scores_less, key=scores_less.get)
f'C = {C}, score = {scores_less[C]}'

'C = 0.004466835921509635, score = 0.7163328498343842'

In [12]:
heros_id = np.unique(features[[label for label in features.columns if 'hero' in label]].values)
f'number of hero IDs: {heros_id.size}, maximum hero id: {heros_id.max()}'

'number of hero IDs: 108, maximum hero id: 112'

In [13]:
# creating "bag of words" features

X_pick = np.zeros((features.shape[0], heros_id.max()))

for i, match_id in enumerate(features.index):
    for p in range(5):
        X_pick[i, features.at[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, features.at[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [14]:
# sample with "bag of words"

data_with_words = pd.concat([less_features, pd.DataFrame(X_pick, index=less_features.index)], axis=1)
X_w = StandardScaler().fit_transform(data_with_words)
scores_w = {}
for n in np.logspace(-2, 2, 20):
    clf = LogisticRegression(C=n, random_state=0)
    scores_w[n] = cross_val_score(clf, X_w, y, cv=kf, scoring='roc_auc').mean()
C = max(scores_w, key=scores_w.get)
f'C = {C}, score = {scores_w[C]}'

'C = 0.01, score = 0.751779840685833'

In [15]:
# test set

df_test = pd.read_csv('data/features_test.zip', index_col='match_id')
df_test = df_test.fillna(0)

X_pick_test = np.zeros((df_test.shape[0], heros_id.max()))
for i, match_id in enumerate(df_test.index):
    for p in range(5):
        X_pick_test[i, df_test.at[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, df_test.at[match_id, 'd%d_hero' % (p+1)]-1] = -1

df_test.pop('lobby_type')
df_test = df_test.drop([label for label in df_test.columns if 'hero' in label], axis=1)

df_test_words = pd.concat([df_test, pd.DataFrame(X_pick_test, index=df_test.index)], axis=1)

X_test_w = StandardScaler().fit_transform(df_test_words)

clf_test = LogisticRegression(C=0.01)
clf_test.fit(X_w, y)
pred = clf_test.predict_proba(X_test_w)[:, 1]

In [16]:
pred.min(), pred.max()

(0.008721474723098676, 0.9963305890721884)

1. The quality of the logistic regression over all initial features is 0.716, it's a bit more than the gradient boost quality value. The absence of a difference with gradient boosting indicates a linear relationship between features and target variable. Logistic regression is ten times faster.

2. Removing categorical features had almost no effect on the quality of the logistic regression. The new quality is 0.71633. Most likely, these signs do not affect the prediction result in any way.

3. There are 108 different Hero IDs in the game, maximum number of identifiers 112.

4. Quality in the sample when adding a "bag of words" for heroes is higher, than on the original sample, possibly because the merged matrix is sparse.

5. Minimum and maximum value of the forecast on the test set: 0.0087 0.9963