In [1]:
import pandas as pd 
import numpy as np
import datetime
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv('features.csv')
train.head()


Unnamed: 0,match_id,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
0,0,1430198770,7,11,5,2098,1489,20,0,0,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1,1430220345,0,42,4,1188,1033,9,0,1,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,2,1430227081,7,33,4,1319,1270,22,0,0,...,4,3,1,13.0,2130,0,0,1830,0,63
3,3,1430263531,1,29,4,1779,1056,14,0,0,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,4,1430282290,7,13,4,1431,1090,8,1,0,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [3]:
train.drop(['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'], axis = 1, inplace = True)

In [4]:
train.count()

match_id                    97230
start_time                  97230
lobby_type                  97230
r1_hero                     97230
r1_level                    97230
                            ...  
dire_tpscroll_count         97230
dire_boots_count            97230
dire_ward_observer_count    97230
dire_ward_sentry_count      97230
dire_first_ward_time        95404
Length: 103, dtype: int64

In [5]:
len(train)

97230

In [6]:
count_train = len(train) - train.count()
count_train

match_id                       0
start_time                     0
lobby_type                     0
r1_hero                        0
r1_level                       0
                            ... 
dire_tpscroll_count            0
dire_boots_count               0
dire_ward_observer_count       0
dire_ward_sentry_count         0
dire_first_ward_time        1826
Length: 103, dtype: int64

In [7]:
count_train[count_train > 0]

first_blood_time               19553
first_blood_team               19553
first_blood_player1            19553
first_blood_player2            43987
radiant_bottle_time            15691
radiant_courier_time             692
radiant_flying_courier_time    27479
radiant_first_ward_time         1836
dire_bottle_time               16143
dire_courier_time                676
dire_flying_courier_time       26098
dire_first_ward_time            1826
dtype: int64

In [8]:
count_train[count_train > 0] / len(train)

first_blood_time               0.201100
first_blood_team               0.201100
first_blood_player1            0.201100
first_blood_player2            0.452402
radiant_bottle_time            0.161380
radiant_courier_time           0.007117
radiant_flying_courier_time    0.282619
radiant_first_ward_time        0.018883
dire_bottle_time               0.166029
dire_courier_time              0.006953
dire_flying_courier_time       0.268415
dire_first_ward_time           0.018780
dtype: float64

In [9]:
train.fillna(0, inplace = True)

In [10]:
df = pd.read_csv('features.csv')

df.drop([
    "duration",
    "tower_status_radiant",
    "tower_status_dire",
    "barracks_status_radiant",
    "barracks_status_dire",
], axis=1, inplace=True)
df.fillna(0, inplace = True)
y_train = df['radiant_win']
X_train = df.drop('radiant_win', axis = 1)

In [11]:
cv = KFold(n_splits = 5, shuffle = True, random_state = 41)

In [12]:
def score_gradient_boosting(X, y):
    scores = {}
    for n_estimators in [10, 20, 30, 40, 50]:
        print('n_estimators:', n_estimators)
        model = GradientBoostingClassifier(n_estimators = n_estimators, random_state = 41)
        time = datetime.datetime.now()
        score = cross_val_score(model, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1 ).mean()
        print('score:', score)
        print('time:', datetime.datetime.now() - time)
        scores[n_estimators] = score
        print()
    return scores

In [13]:
scores = score_gradient_boosting(X_train, y_train)

n_estimators: 10
score: 0.664753367020111
time: 0:00:17.617309

n_estimators: 20
score: 0.6815665500912005
time: 0:00:35.833966

n_estimators: 30
score: 0.6893728084471435
time: 0:00:51.859868

n_estimators: 40
score: 0.6943741491980353
time: 0:01:17.615588

n_estimators: 50
score: 0.6976973385388637
time: 0:01:34.049048



In [14]:
scaler = StandardScaler()

In [15]:
X_train = pd.DataFrame(scaler.fit_transform(X_train), index = X_train.index, columns = X_train.columns)

In [16]:
def score_log_reg(X: pd.DataFrame, y: pd.Series) -> pd.Series:
    scores = {}
    for i in range(-5, 6):
        C = 10 ** i
        print('C:', C)
        model = LogisticRegression(C = C, random_state = 41, max_iter = 12000000)
        time = datetime.datetime.now()
        score = cross_val_score(model, X, y, cv=cv, scoring="roc_auc", n_jobs=-1).mean()
        print('Score:', score)
        print('Time:',datetime.datetime.now() - time)
        scores[i] = score
        print()
        
    return pd.Series(scores)


In [17]:
scores = score_log_reg(X_train, y_train)

C: 1e-05
Score: 0.695204978290372
Time: 0:00:01.640261

C: 0.0001
Score: 0.7114332189003822
Time: 0:00:01.863140

C: 0.001
Score: 0.7164606345373652
Time: 0:00:02.533501

C: 0.01
Score: 0.7166587562067755
Time: 0:00:03.446247

C: 0.1
Score: 0.7166428458493955
Time: 0:00:04.323219

C: 1
Score: 0.7166400844206592
Time: 0:00:04.177305

C: 10
Score: 0.7166396733632802
Time: 0:00:03.872286

C: 100
Score: 0.7166395843880846
Time: 0:00:03.754729

C: 1000
Score: 0.7166395907499072
Time: 0:00:03.705259

C: 10000
Score: 0.716639592867741
Time: 0:00:03.796684

C: 100000
Score: 0.7166395949872569
Time: 0:00:03.802282



In [18]:
def best_log_reg_score(scores: pd.Series):
    best_iteration = scores.sort_values(ascending = False).head(1)
    best_C = 10 ** best_iteration
    best_score = best_iteration.values[0]
    print('best C:', best_C)
    print('best score:', best_score)

In [19]:
best_log_reg_score(scores)

best C: -2    5.207853
dtype: float64
best score: 0.7166587562067755


In [20]:
X_train.drop(['lobby_type', 
              'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
              'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1, inplace = True)

In [21]:
scores_without_some_features = score_log_reg(X_train, y_train)

C: 1e-05
Score: 0.6951620778147735
Time: 0:00:01.409913

C: 0.0001
Score: 0.7114392994851133
Time: 0:00:01.441915

C: 0.001
Score: 0.716510049190685
Time: 0:00:02.471834

C: 0.01
Score: 0.7167134323617802
Time: 0:00:03.289135

C: 0.1
Score: 0.716692333805533
Time: 0:00:03.413103

C: 1
Score: 0.7166912363572898
Time: 0:00:03.853218

C: 10
Score: 0.7166903509939015
Time: 0:00:03.369991

C: 100
Score: 0.7166904485384038
Time: 0:00:03.325917

C: 1000
Score: 0.7166904400716984
Time: 0:00:03.376000

C: 10000
Score: 0.7166904485517039
Time: 0:00:03.416115

C: 100000
Score: 0.7166904485517039
Time: 0:00:03.495260



In [22]:
best_log_reg_score(scores_without_some_features)

best C: -2    5.208509
dtype: float64
best score: 0.7167134323617802


In [23]:
df_clean = pd.read_csv('features.csv')

In [24]:
hero_col = df_clean[['lobby_type', 
              'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
              'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]

In [25]:
unique_heroes = np.unique(hero_col).ravel()

In [26]:
N_max_heroes = np.max(unique_heroes)

In [27]:
def hero_info(data: pd.DataFrame) -> pd.DataFrame:
    X_all_zeros = np.zeros((data.shape[0], N_max_heroes))
    for i, match_id in enumerate(data.index):
        for h in range(1, 6):
            X_all_zeros[i, data.loc[match_id, f'r{h}_hero'] - 1] = 1
            X_all_zeros[i, data.loc[match_id, f'd{h}_hero'] - 1] = -1
            
    return pd.DataFrame(X_all_zeros, index = data.index, columns = [f'hero_{i}' for i in range(N_max_heroes)])

In [28]:
X_heroes = hero_info(df)

In [29]:
X_train = pd.concat([X_train, X_heroes], axis = 1)

In [30]:
score_log_reg(X_train, y_train)

C: 1e-05
Score: 0.6992470566318151
Time: 0:00:03.224237

C: 0.0001
Score: 0.7251521125111013
Time: 0:00:03.273240

C: 0.001
Score: 0.7464600386033509
Time: 0:00:05.740423

C: 0.01
Score: 0.751934724638014
Time: 0:00:10.679670

C: 0.1
Score: 0.7521748351575398
Time: 0:00:16.243205

C: 1
Score: 0.7521571739850611
Time: 0:00:18.571375

C: 10
Score: 0.7521541247744002
Time: 0:00:16.824251

C: 100
Score: 0.7521547349013306
Time: 0:00:17.571301

C: 1000
Score: 0.7521550523023296
Time: 0:00:17.887327

C: 10000
Score: 0.7521555144282781
Time: 0:00:16.312209

C: 100000
Score: 0.7521547366878061
Time: 0:00:16.408219



-5    0.699247
-4    0.725152
-3    0.746460
-2    0.751935
-1    0.752175
 0    0.752157
 1    0.752154
 2    0.752155
 3    0.752155
 4    0.752156
 5    0.752155
dtype: float64

In [31]:
scores = score_log_reg(X_train, y_train)

C: 1e-05
Score: 0.6992470566318151
Time: 0:00:03.021224

C: 0.0001
Score: 0.7251521125111013
Time: 0:00:04.285317

C: 0.001
Score: 0.7464600386033509
Time: 0:00:06.378471

C: 0.01
Score: 0.751934724638014
Time: 0:00:09.740722

C: 0.1
Score: 0.7521748351575398
Time: 0:00:13.809024

C: 1
Score: 0.7521571739850611
Time: 0:00:15.371141

C: 10
Score: 0.7521541247744002
Time: 0:00:15.223129

C: 100
Score: 0.7521547349013306
Time: 0:00:18.457371

C: 1000
Score: 0.7521550523023296
Time: 0:00:17.101268

C: 10000
Score: 0.7521555144282781
Time: 0:00:18.920404

C: 100000
Score: 0.7521547366878061
Time: 0:00:18.043339



In [32]:
best_log_reg_score(scores)

best C: -1    5.651644
dtype: float64
best score: 0.7521748351575398


In [33]:
model = LogisticRegression(C = 0.1, random_state = 41,  max_iter=1200000)
model.fit(X_train, y_train)

LogisticRegression(C=0.1, max_iter=1200000, random_state=41)

In [40]:

test = pd.read_csv("features_test.csv")
test.fillna(0, inplace=True)
test.index
test.columns


RangeIndex(start=0, stop=17177, step=1)

Index(['match_id', 'start_time', 'lobby_type', 'r1_hero', 'r1_level', 'r1_xp',
       'r1_gold', 'r1_lh', 'r1_kills', 'r1_deaths',
       ...
       'radiant_ward_sentry_count', 'radiant_first_ward_time',
       'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time',
       'dire_tpscroll_count', 'dire_boots_count', 'dire_ward_observer_count',
       'dire_ward_sentry_count', 'dire_first_ward_time'],
      dtype='object', length=103)

In [41]:
X_test = pd.DataFrame(scaler.transform(test), index=test.index, columns=test.columns) 

In [43]:
X_test.drop(['lobby_type', 
              'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 
              'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1, inplace = True)

In [48]:
X_test = pd.concat([X_test, hero_info(test)], axis = 1)

In [53]:
X_te

Unnamed: 0,match_id,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,...,hero_102,hero_103,hero_104,hero_105,hero_106,hero_107,hero_108,hero_109,hero_110,hero_111
0,-1.732345,-2.528200,0.501314,-0.230161,-0.126909,-0.357459,-0.537757,1.017574,0.291758,-0.332256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.732314,-2.527214,-1.297676,-1.195592,-1.245180,-1.131661,-0.537757,-0.578083,0.291758,0.578881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.732223,-2.525688,-1.297676,-0.851426,-0.732370,-1.131661,-0.537757,-0.578083,1.893320,-1.243393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.732132,-2.521671,-0.398181,-0.927319,-0.527677,-1.131661,0.968527,1.017574,1.092539,-1.243393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
4,-1.732042,-2.520369,0.501314,0.045173,-1.049106,-0.799860,-0.537757,-0.578083,0.291758,1.490017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17172,1.732471,1.084410,1.400808,1.448314,1.708865,1.743945,-0.537757,1.017574,-0.108632,0.578881,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17173,1.732713,1.086240,-0.398181,-0.856720,-1.169767,-1.131661,-0.537757,-0.578083,1.492930,-0.332256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
17174,1.732743,1.086370,-1.297676,-1.162058,-1.398161,-1.242261,-0.537757,-0.578083,-0.108632,-0.332256,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
17175,1.733198,1.090210,0.501314,1.077673,0.060547,-0.357459,0.968527,2.613231,-0.108632,-0.332256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0


In [55]:
prediction = pd.Series(model.predict_proba(X_test)[:, 1])

In [56]:
prediction

0        0.831572
1        0.759446
2        0.193933
3        0.867119
4        0.248067
           ...   
17172    0.731003
17173    0.643419
17174    0.246023
17175    0.636276
17176    0.439312
Length: 17177, dtype: float64

In [57]:
predic

count    17177.000000
mean         0.517105
std          0.220996
min          0.008320
25%          0.345808
50%          0.522786
75%          0.691744
max          0.996488
dtype: float64