# Data Preparation

Read the raw features data.

In [8]:
import pandas

# Location of the raw training table; rows are indexed by `match_id`.
train_csv_path = './features.csv'
raw_features = pandas.read_csv(train_csv_path, index_col='match_id')

# Show the first rows as a quick sanity check of the load.
raw_features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16,2449,0,4,1974,3,63


Split into features and Y.

In [9]:
# Columns that describe how the match ended. They are the last six columns of
# the raw table and must not be used as model inputs.
outcome_columns = ['duration', 'radiant_win',
                   'tower_status_radiant', 'tower_status_dire',
                   'barracks_status_radiant', 'barracks_status_dire']

# Select by name instead of by position: a positional slice keeps "working"
# (on the wrong columns) if the CSV layout ever changes, a name lookup fails loudly.
Y = raw_features['radiant_win']

# `drop` returns a new frame, so later modifications of `features` cannot
# touch (or warn about touching) `raw_features`.
features = raw_features.drop(outcome_columns, axis=1)
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35,103,-84,221,3,4,2,2,-52
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20,149,-84,195,5,4,3,1,-5
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39,45,-77,221,3,4,3,1,13
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30,124,-80,184,0,4,2,0,27
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46,182,-80,225,6,3,3,0,-16


In [10]:
# Dimensions of the feature table as (number of rows, number of columns).
features.shape

(97230, 102)

Now find features with the biggest number of empty values.

In [11]:
# Number of non-null entries in every column; the 15 smallest counts belong to
# the columns with the most missing values.
non_null_per_column = features.count()
non_null_per_column.nsmallest(15)

first_blood_player2            53243
radiant_flying_courier_time    69751
dire_flying_courier_time       71132
first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
dire_bottle_time               81087
radiant_bottle_time            81539
radiant_first_ward_time        95394
dire_first_ward_time           95404
radiant_courier_time           96538
dire_courier_time              96554
start_time                     97230
lobby_type                     97230
r1_hero                        97230
dtype: int64

The `first_blood_player2` feature has many empty values because it holds the index of the second participant in the first-blood event, and many such events involve only one killer. The other `first_blood_*` features are also frequently absent, because in those matches the first blood happened after the first 5 minutes of the game.

The `*_courier_time` features contain empty values because couriers, especially flying, are used either later in the game or not at all.

In [12]:
# Replace every missing value with 0. The name is rebound to the filled copy
# instead of using `inplace=True`: `features` was derived from `raw_features`,
# and in-place modification of a derived frame is what triggers pandas'
# SettingWithCopy warning. Rebinding also keeps the cell safe to re-run.
features = features.fillna(0)

The target variable is contained in the `radiant_win` column but we already separated it into Y.

# Gradient Boosting
Let us cross-validate gradient boosting classifiers for a series of `n_estimators` values.

In [13]:
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingClassifier

import time
import datetime

kf = KFold(len(Y), 5, shuffle=True)

print "Score for 5 estimators: "
print cross_validation.cross_val_score(GradientBoostingClassifier(n_estimators=5, verbose=False),
                                 features, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

print "Score for 10 estimators: "
print cross_validation.cross_val_score(GradientBoostingClassifier(n_estimators=10, verbose=False),
                                 features, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

print "Score for 20 estimators: "
print cross_validation.cross_val_score(GradientBoostingClassifier(n_estimators=20, verbose=False),
                                 features, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

start_time = datetime.datetime.now()
print "Score for 30 estimators: "
print cross_validation.cross_val_score(GradientBoostingClassifier(n_estimators=30, verbose=False),
                                 features, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()
print 'Cross validation took: ', datetime.datetime.now() - start_time

print "Score for 40 estimators: "
print cross_validation.cross_val_score(GradientBoostingClassifier(n_estimators=40, verbose=False),
                                 features, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

print "Score for 50 estimators: "
print cross_validation.cross_val_score(GradientBoostingClassifier(n_estimators=50, verbose=False),
                                 features, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

Score for 5 estimators: 
0.636713763037
Score for 10 estimators: 
0.664987297487
Score for 20 estimators: 
0.682091950596
Score for 30 estimators: 
0.689763636223
Cross validation took:  0:01:35.573233
Score for 40 estimators: 
0.694200742109
Score for 50 estimators: 
0.697590357868


So, with Gradient Boosting on 30 trees we spent about 1 minute 36 seconds on cross-validation. The mean ROC-AUC we ended up with was ~0.6898.

Increasing the number of trees still yields some improvement in the score, although at a considerable computational cost. There are a number of ways to reduce that cost, such as training on subsets of the training set or implementing a simpler version of the trees in the leaves (see MatrixNet). Still, for this particular task we do not need any extreme measures; it is completely feasible on modern hardware.

# Logistic Regression

The first attempt is made without any additional preprocessing. The results are pretty bad and the mean score does not change with C: it stays at ~0.5134, which is much worse than what we got with gradient boosting. Looking at the features, we may hypothesize about the reasons. For example, the column with the unix timestamp looks like something that should break linear models. On the other hand, logistic regression trains faster than the gradient boosting based classifier.

In [14]:
from sklearn.linear_model import LogisticRegression

for c in [0.00001, 0.001, 0.1, 1, 1000]:
    print "C = ", c, "score = ", cross_validation.cross_val_score(LogisticRegression(C=c),
                                 features, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

C =  1e-05 score =  0.513439205165
C =  0.001 score =  0.513439205165
C =  0.1 score =  0.513439205165
C =  1 score =  0.513439205165
C =  1000 score =  0.513439205165


Now, let us scale features with `StandardScaler`.

In [15]:
from sklearn.preprocessing import StandardScaler

features_scaled = StandardScaler().fit_transform(features)

for c in [0.00001, 0.001, 0.007, 0.009, 0.01, 0.02, 0.1, 1, 1000]:
    print "C = ", c, "score = ", cross_validation.cross_val_score(LogisticRegression(C=c),
                                 features_scaled, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

C =  1e-05 score =  0.695047248974
C =  0.001 score =  0.716120783765
C =  0.007 score =  0.716334855591
C =  0.009 score =  0.716331564204
C =  0.01 score =  0.716329402709
C =  0.02 score =  0.716320903531
C =  0.1 score =  0.716310159799
C =  1 score =  0.716307674132
C =  1000 score =  0.716307319379


Looks much more interesting. Max score: 0.716335 for C = 0.007. Let us remove some features altogether. We start with `start_time` just as an experiment.

In [16]:
filtered_features = features.drop(['start_time'], axis=1)

features_scaled = StandardScaler().fit_transform(filtered_features)

import numpy as np

for c in np.arange(0.0045, 0.0055, 0.0001):
    print "C = ", c, "score = ", cross_validation.cross_val_score(LogisticRegression(C=c),
                                 features_scaled, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

C =  0.0045 score =  0.716221581312
C =  0.0046 score =  0.716221695173
C =  0.0047 score =  0.716222170967
C =  0.0048 score =  0.716222291128
C =  0.0049 score =  0.716222912598
C =  0.005 score =  0.716222405203
C =  0.0051 score =  0.716222607243
C =  0.0052 score =  0.716222842359
C =  0.0053 score =  0.716222067245
C =  0.0054 score =  0.71622200561


Now, we remove all features that are really just categories.

In [17]:
# How many matches fall into each `lobby_type` code (position i = count of code i).
lobby_type_codes = features['lobby_type']
np.bincount(lobby_type_codes)

array([12718, 55962,     0,     0,     0,     0,     0, 28550])

In [18]:
filtered_features = features.drop(['lobby_type',
                                   'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                                   'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1)

features_scaled = StandardScaler().fit_transform(filtered_features)

for c in np.arange(0.0044, 0.0055, 0.0002):
    print "C = ", c, "score = ", cross_validation.cross_val_score(LogisticRegression(C=c),
                                 features_scaled, Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

C =  0.0044 score =  0.716384925404
C =  0.0046 score =  0.716384292141
C =  0.0048 score =  0.716385057196
C =  0.005 score =  0.716385242938
C =  0.0052 score =  0.716385190283
C =  0.0054 score =  0.716385514808


The maximum score we reach here is 0.7163855 (at C = 0.0054), which looks fine in comparison. Linear models usually have problems deriving coefficients for features that are not really quantitative.

Let us try to still take heroes into account because they should actually matter.

In [127]:
heroes = features[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                   'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]
n_heroes = np.unique(heroes.values).size
print "Unique heroes in the set: ", n_heroes
print "Max hero num: ", np.max(heroes.values)

Unique heroes in the set:  108
Max hero num:  112


We now create artificial features for hero types.

In [104]:
# Hero pick matrix: one row per match, one column per hero id. Hero ids are
# 1-based, so hero `h` lives in column `h - 1`; 112 is the largest hero id
# found in the data. A cell is +1 if a radiant player picked the hero,
# -1 if a dire player did, and 0 otherwise.
X_pick = np.zeros((features.shape[0], 112))
row_positions = np.arange(features.shape[0])

# One vectorised assignment per player slot replaces ~1M scalar `.ix` lookups
# (slow, and `.ix` is deprecated in pandas). The r/d write order within a
# slot and across slots is the same as in a row-by-row loop, so the resulting
# matrix is identical.
for p in range(1, 6):
    X_pick[row_positions, features['r%d_hero' % p].values.astype(int) - 1] = 1
    X_pick[row_positions, features['d%d_hero' % p].values.astype(int) - 1] = -1

X_pick

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.]])

In [98]:
filtered_features = features.drop(['start_time', 'lobby_type',
                                   'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                                   'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1)

features_scaled = StandardScaler().fit_transform(filtered_features)

for c in np.arange(0.01, 0.2, 0.01):
    print "C = ", c, "score = ", cross_validation.cross_val_score(LogisticRegression(C=c),
                                 np.hstack((features_scaled, X_pick)), Y, scoring='roc_auc', n_jobs=-1, cv=kf).mean()

 C =  0.01 score =  0.751439124899
C =  0.02 score =  0.751615948993
C =  0.03 score =  0.75164870469
C =  0.04 score =  0.751656588786
C =  0.05 score =  0.751657995652
C =  0.06 score =  0.751658637959
C =  0.07 score =  0.751656647084
C =  0.08 score =  0.751656202837
C =  0.09 score =  0.751655450562
C =  0.1 score =  0.751653122734
C =  0.11 score =  0.751652312884
C =  0.12 score =  0.751651420426
C =  0.13 score =  0.751650816038
C =  0.14 score =  0.751650187279
C =  0.15 score =  0.751648859203
C =  0.16 score =  0.751647857439
C =  0.17 score =  0.751647887274
C =  0.18 score =  0.75164721156
C =  0.19 score =  0.751647077181


Adding a huge number of features that are derived from the hero identifiers using the "bag of words" method greatly increases the quality. Let's settle on the `C=0.06` (score: 0.7516586) and generate the results for Kaggle submission!

In [105]:
# Final model: logistic regression with C=0.06, trained on the complete
# training matrix (scaled numeric features + hero pick columns).
final_train_matrix = np.hstack((features_scaled, X_pick))
best_clf = LogisticRegression(C=0.06)
best_clf.fit(final_train_matrix, Y)

LogisticRegression(C=0.06, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

To summarize, the logistic regression based classifier seems to perform better than the naive version of gradient boosting, and its training is also much faster.

# Processing the Test Set

In [99]:
# Load the test table, indexed by `match_id` like the training table.
test_csv_path = './features_test.csv'
raw_test = pandas.read_csv(test_csv_path, index_col='match_id')

raw_test.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12,247,-86,272.0,3,4,2,0,118
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29,168,-54,,3,2,2,1,16
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22,46,-87,186.0,1,3,3,0,-34
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49,30,-89,210.0,3,4,2,1,-26
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36,180,-86,180.0,1,3,2,1,-33


In [108]:
test = raw_test.fillna(0)

test_pick = np.zeros((test.shape[0], 112))

for i, match_id in enumerate(test.index):
    for p in xrange(5):
        test_pick[i, test.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        test_pick[i, test.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

filtered_test = test.drop(['start_time', 'lobby_type',
                                   'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                                   'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1)

test_scaled = StandardScaler().fit_transform(filtered_test)
        
radiant_probs = best_clf.predict_proba(np.hstack((test_scaled, test_pick)))[:, 1]
print np.max(radiant_probs), np.min(radiant_probs)

0.996580326628 0.0084814893426


In [123]:
# One predicted radiant-win probability per test match, keyed by match_id.
result = pandas.DataFrame({'radiant_win': radiant_probs}, index=test.index)

result.head()

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
6,0.835269
7,0.776758
10,0.20232
13,0.871738
16,0.261614


In [124]:
# Save the predictions (index column included) as the submission file.
submission_path = 'to_kaggle.csv'
result.to_csv(submission_path)

This got me instantly into the top 15% in the Kaggle Leaderboard which means I am on the right track. 