In [45]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
import operator
import time
import datetime

### Load data

In [46]:
# Read the feature table; rows are indexed by the match_id column.
df = pd.read_csv(
    './features.csv',
    index_col='match_id',
)

### Missing data

In [47]:
# Share of non-missing values per column (count() ignores NaN entries).
table = df.count()/df.shape[0]
# List only the columns in which at least one value is missing.
for column_name, filled_share in zip(df.columns.values, table):
    if filled_share < 1.0:
        print(column_name + " \t-\t " + str(filled_share))

first_blood_time 	-	 0.79889951661
first_blood_team 	-	 0.79889951661
first_blood_player1 	-	 0.79889951661
first_blood_player2 	-	 0.547598477836
radiant_bottle_time 	-	 0.838619767561
radiant_courier_time 	-	 0.992882855086
radiant_flying_courier_time 	-	 0.717381466626
radiant_first_ward_time 	-	 0.981116939216
dire_bottle_time 	-	 0.833970996606
dire_courier_time 	-	 0.99304741335
dire_flying_courier_time 	-	 0.731584901779
dire_first_ward_time 	-	 0.981219788131


### Step 1: Gradient Boosting
#### Gradient Boosting preprocessing:

In [63]:
# Target vector, taken from the radiant_win column.
y = df['radiant_win'].values[:]
# Missing values are replaced by zero directly in df (later cells see the filled frame).
df.fillna(0, inplace=True)
# Columns removed from the feature matrix: the target itself plus five more
# (presumably end-of-match information -- confirm against the dataset description).
dropped_columns = [
    'duration',
    'radiant_win',
    'tower_status_radiant',
    'tower_status_dire',
    'barracks_status_radiant',
    'barracks_status_dire',
]
df_GB = df.drop(dropped_columns, axis=1)
X = df_GB.values[:, :]

#### Gradient Boosting Classifier train:

In [49]:
score = {}
kf = KFold(df_GB.shape[0], n_folds=5, shuffle=True, random_state=42)
for N in [10, 20, 30, 40, 50]:
    start_time = datetime.datetime.now()
    score_tmp = []
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = GradientBoostingClassifier(n_estimators=N, random_state=42)
        clf.fit(X_train, y_train.ravel())
        score_tmp.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    score[N] = np.array(score_tmp).mean()
    print "n_estimators - ", N, ', train time: ', datetime.datetime.now() - start_time

n_estimators -  10 , train time:  0:01:04.922873
n_estimators -  20 , train time:  0:02:17.809401
n_estimators -  30 , train time:  0:04:02.644742
n_estimators -  40 , train time:  0:05:29.961105
n_estimators -  50 , train time:  0:06:58.649620


#### Gradient Boosting Classifier results:

In [50]:
score_sorted = sorted(score.items(), key=operator.itemgetter(1), reverse=True)
for x in score_sorted:
    print "n_estimators - ", x[0], ', MRA: ', x[1]

n_estimators -  50 , MRA:  0.697494352472
n_estimators -  40 , MRA:  0.694038714979
n_estimators -  30 , MRA:  0.690006465743
n_estimators -  20 , MRA:  0.682461871507
n_estimators -  10 , MRA:  0.664850689035


### Step 2: Logistic Regression with same data
#### Logistic Regression preprocessing

In [51]:
# Standardise the same feature table that was used for gradient boosting.
raw_features = df_GB.values[:, :]
scaler = StandardScaler()
X = scaler.fit_transform(raw_features)

#### Logistic Regression Classifier train:

In [52]:
start_time = datetime.datetime.now()
grid = {'C': np.power(10.0, np.arange(-5, 6))}
kf = KFold(df_GB.shape[0], n_folds=5, shuffle=True, random_state=42)
clf = LogisticRegression(random_state=42)
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
gs.fit(X, y)
print 'Train time: ', datetime.datetime.now() - start_time
res = {}
for a in gs.grid_scores_:
    res[a.parameters['C']] = a.mean_validation_score

Train time:  0:01:47.071483


#### Logistic Regression Classifier results (with same data):

In [53]:
# (C, mean validation score) pairs, best score first.
res_sorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
for c_and_score in res_sorted:
    print(c_and_score)

(0.01, 0.71655027078545253)
(0.10000000000000001, 0.71652715290247604)
(1.0, 0.71652260565844972)
(100000.0, 0.71652232809180305)
(10000.0, 0.71652232809180305)
(1000.0, 0.71652232279685468)
(100.0, 0.71652229525749866)
(10.0, 0.71652228783665373)
(0.001, 0.7163635388802202)
(0.0001, 0.71135774820313558)
(1.0000000000000001e-05, 0.69516168122680189)


### Step 3: Logistic Regression without factor data
#### Logistic Regression preprocessing:

In [54]:
# Columns treated as categorical ("factor") data in this step; they are
# removed before scaling.
factor_columns = [
    'lobby_type',
    'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
    'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero',
]
df_LC = df_GB.drop(factor_columns, axis=1)
scaler = StandardScaler()
X = scaler.fit_transform(df_LC.values[:, :])

#### Logistic Regression Classifier train:

In [55]:
start_time = datetime.datetime.now()
grid = {'C': np.power(10.0, np.arange(-5, 6))}
kf = KFold(df_LC.shape[0], n_folds=5, shuffle=True, random_state=42)
clf = LogisticRegression(random_state=42)
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
gs.fit(X, y)
print 'Train time: ', datetime.datetime.now() - start_time
res = {}
for a in gs.grid_scores_:
    res[a.parameters['C']] = a.mean_validation_score

Train time:  0:01:37.354599


#### Logistic Regression Classifier (without factor data) results:

In [56]:
# (C, mean validation score) pairs, best score first.
res_sorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
for c_and_score in res_sorted:
    print(c_and_score)

(0.01, 0.71655938644491313)
(0.10000000000000001, 0.71653423928740845)
(1.0, 0.71653036557015803)
(10.0, 0.71652997150291353)
(100.0, 0.71652988994720035)
(1000.0, 0.71652988888483093)
(100000.0, 0.71652988358996439)
(10000.0, 0.71652988358996439)
(0.001, 0.71637579990811728)
(0.0001, 0.7113391458629873)
(1.0000000000000001e-05, 0.69510018710653076)


### Step 4: Logistic Regression Classifier with new heroes data
#### Heroes number:

In [57]:
print "Number of playing heroes - ",\
    len(np.unique(df[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']].values))
print "Number of unique playing heroes - ",\
    np.max(np.unique(df[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']].values))

Number of playing heroes -  108
Number of unique playing heroes -  112


#### DF transformation function:

In [62]:
def dataPick(data, numberOfHeroes=None):
    """Build a feature matrix with a signed indicator encoding of hero picks.

    ``lobby_type`` and the ten ``*_hero`` columns are removed from ``data`` and
    ``numberOfHeroes`` indicator columns are appended on the right. For every
    row, indicator column ``hero_id - 1`` is set to 1 for each id found in
    ``r1_hero``..``r5_hero`` and to -1 for each id found in
    ``d1_hero``..``d5_hero``; all other indicators stay 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``lobby_type``, ``r1_hero``..``r5_hero`` and
        ``d1_hero``..``d5_hero``. Hero ids are used as 1-based column
        positions, so they must not exceed ``numberOfHeroes``.
    numberOfHeroes : int, optional
        Number of indicator columns. Pass the value computed on the training
        data when encoding any other data set, so that both matrices have the
        same width. When omitted, the largest hero id present in ``data`` is
        used (0 for an empty frame).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), remaining_columns + numberOfHeroes)``.
    """
    radiant_cols = ['r%d_hero' % slot for slot in range(1, 6)]
    dire_cols = ['d%d_hero' % slot for slot in range(1, 6)]

    if numberOfHeroes is None:
        all_ids = data[radiant_cols + dire_cols].values
        numberOfHeroes = int(all_ids.max()) if all_ids.size else 0

    n_matches = data.shape[0]
    row_idx = np.arange(n_matches)
    X_pick = np.zeros((n_matches, numberOfHeroes))
    # Fill one slot pair at a time for all rows at once; the r-then-d order
    # per slot is kept so that a later assignment overrides an earlier one
    # exactly as a per-row loop would.
    for r_col, d_col in zip(radiant_cols, dire_cols):
        X_pick[row_idx, data[r_col].values.astype(int) - 1] = 1
        X_pick[row_idx, data[d_col].values.astype(int) - 1] = -1

    res = data.drop(['lobby_type'] + radiant_cols + dire_cols, axis=1)

    return np.hstack((res.values[:, :], X_pick))

#### Logistic Regression Classifier preprocessing:

In [70]:
# Largest hero id in the training frame; it fixes the width of the hero
# indicator block produced by dataPick.
hero_slot_columns = [
    'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
    'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero',
]
numberOfHeroes = np.max(df[hero_slot_columns].values)
# Standardise the encoded matrix; this scaler is fitted on the training data.
scaler = StandardScaler()
X = scaler.fit_transform(dataPick(df_GB, numberOfHeroes))

#### Logistic Regression Classifier train:

In [71]:
start_time = datetime.datetime.now()
grid = {'C': np.power(10.0, np.arange(-5, 6))}
kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=42)
clf = LogisticRegression(random_state=42)
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=kf)
gs.fit(X, y)
print 'Train time: ', datetime.datetime.now() - start_time
res = {}
for a in gs.grid_scores_:
    res[a.parameters['C']] = a.mean_validation_score

Train time:  0:02:58.324178


#### Logistic Regression Classifier (with new heroes data) results:

In [72]:
# (C, mean validation score) pairs, best score first.
res_sorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
for c_and_score in res_sorted:
    print(c_and_score)

(0.01, 0.75196404210253798)
(0.10000000000000001, 0.75193003331953845)
(1.0, 0.75192510895506792)
(10.0, 0.75192475304500173)
(100.0, 0.75192460156835972)
(1000.0, 0.75192458674232154)
(10000.0, 0.75192458568265552)
(100000.0, 0.75192458144557917)
(0.001, 0.75161270452605367)
(0.0001, 0.74273874553935604)
(1.0000000000000001e-05, 0.71479182916265049)


### Step 5: Best classifier with best data

In [73]:
# Refit on the whole training matrix with C fixed at 0.01, the top-ranked
# value in the grid-search results printed in the previous step.
clf = LogisticRegression(C=0.01, random_state=42).fit(X, y)
clf

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Test data preprocessing:

In [74]:
df_test = pd.read_csv('./features_test.csv', index_col='match_id')
df_test.fillna(0, inplace=True)
# Apply the scaler that was fitted on the training matrix instead of fitting
# a new one here: the classifier's coefficients are only meaningful for
# features standardised with the training means and variances.
X_test = scaler.transform(dataPick(df_test, numberOfHeroes))

#### Test data predictions:

In [75]:
y_pred = clf.predict_proba(X_test)
# predict_proba columns follow clf.classes_; the submission needs the
# probability of class 1, not column 0 (the probability of class 0).
positive_col = list(clf.classes_).index(1)
df_test['radiant_win'] = y_pred[:, positive_col]

#### Write answer for Kaggle:

In [76]:
# Only the predicted column is exported; the frame's index is written with it.
df_to_write = df_test.loc[:, 'radiant_win']
df_to_write.to_csv(
    'results.csv',
    sep=',',
    encoding='utf-8',
    header=True,
)