In [1]:
import pandas as pd
import numpy as np
import time
import random
import datetime
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib.pyplot import plot
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time
import datetime

  from numpy.core.umath_tests import inner1d


In [2]:
features = pd.read_csv('./features.csv', index_col='match_id')
features.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


### Подход 1: градиентный бустинг

In [3]:
X = features.loc[:,'start_time':'dire_first_ward_time']
sc = X.count()
print(sc[sc < 97230])
#fill NA with 0
X = X.fillna(0)
y = features['radiant_win']  

first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
first_blood_player2            53243
radiant_bottle_time            81539
radiant_courier_time           96538
radiant_flying_courier_time    69751
radiant_first_ward_time        95394
dire_bottle_time               81087
dire_courier_time              96554
dire_flying_courier_time       71132
dire_first_ward_time           95404
dtype: int64


In [10]:
#KFOLD
#Random shape/n_rand sample, not to wait long 
n_rand = 10
X_sample = X.sample(int(X.shape[0]))
kf = KFold(n_splits=5, shuffle=True)
X_train = X_test = y_train = y_test = []
for train_index, test_index in kf.split(X_sample):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [6]:
scores = []
for n in range(10,220,10):
    clf = GradientBoostingClassifier(n_estimators=n)
    start_time = datetime.datetime.now()
    clf.fit(X_train, y_train)
    print('Time elapsed:', datetime.datetime.now() - start_time, 'n_estimators:',n)
    #ROC-AUC is increasing with n_estimators, 
    print(clf.predict_proba(X_test)[:, 1])
    cur = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    
    print(cur)
    scores.append(cur)
best_n = (scores.index(max(scores)) + 1) * 10
print('best estimators num: {}, score: {}'.format(best_n, max(scores)))

Time elapsed: 0:00:00.604203 n_estimators: 10
[0.42750858 0.46614396 0.43774122 ... 0.46188732 0.53077001 0.57348569]
0.6695247873742373
Time elapsed: 0:00:01.020989 n_estimators: 20
[0.38779436 0.44398116 0.44542041 ... 0.48267641 0.5519254  0.52852124]
0.6965450629012369
Time elapsed: 0:00:01.304083 n_estimators: 30
[0.39036791 0.38474791 0.4205773  ... 0.49551879 0.5363397  0.49366451]
0.7060403181844646
Time elapsed: 0:00:01.670655 n_estimators: 40
[0.39558711 0.34042    0.3518502  ... 0.455673   0.54941124 0.50586299]
0.7107135863496112
Time elapsed: 0:00:02.085486 n_estimators: 50
[0.42267243 0.30360556 0.35928786 ... 0.46212976 0.53405888 0.48595673]
0.7164321211768379


KeyboardInterrupt: 

### Подход 2: логистическая регрессия

1. Логистическая регрессия с исходными данными

In [11]:
C_param_range = [0.001,0.01,0.1,1,10,100]
scores_logistics = []
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)
for c in C_param_range:    
    clf2 = LogisticRegression(C=c)
    clf2.fit(X_train, y_train)
    cur = roc_auc_score(y_test, clf2.predict_proba(X_test)[:, 1])
    print(cur)
    scores_logistics.append(cur)
best_c = C_param_range[scores_logistics.index(max(scores_logistics))]
print('best c: {}, score: {}'.format(best_c, max(scores_logistics)))

0.7177931869829376
0.7181974007190093
0.7181854014021647
0.7181818079671733
0.7181818503675862
0.7181818397674828
best c: 0.01, score: 0.7181974007190093


2. Логистическая регрессия без категориальных признаков

In [11]:
drop_columns = ['lobby_type','r1_hero','r2_hero','r3_hero', 'r4_hero', 
                'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
X_train_category_out = X_train.drop(drop_columns, axis=1)
X_test_category_out = X_test.drop(drop_columns, axis=1)

In [12]:
scores_logistics = []
for c in C_param_range:
    clf3 = LogisticRegression(penalty='l2', C=c)
    clf3.fit(X_train_category_out, y_train)
    cur = roc_auc_score(y_test, clf3.predict_proba(X_test_category_out)[:, 1])
    print(cur)
    scores_logistics.append(cur)
best_c = C_param_range[scores_logistics.index(max(scores_logistics))]
print('best c: {}, score: {}'.format(best_c, max(scores_logistics)))

0.7428506406558453
0.7461954939745009
0.7445484712644287
0.7430242343090919
0.7426220051125448
0.742563787728834
best c: 0.01, score: 0.7461954939745009


3. Логистическая регрессия с приведенными категориальными признаками к числовым

In [13]:
hero = ['r1_hero','r2_hero','r3_hero', 'r4_hero','r5_hero', 
        'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
print(max(X_train[hero]['r4_hero']))

X_pick_train = np.zeros((X_train.shape[0], 112))

for i, match_id in enumerate(X_train.index):
    for p in range(5):
        X_pick_train[i, X_train.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_train[i, X_train.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

112


In [14]:
X_pick_test = np.zeros((X_test.shape[0], 112))

for i, match_id in enumerate(X_test.index):
    for p in range(5):
        X_pick_test[i, X_test.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, X_test.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [15]:
d_train = {'r1_hero':X_pick_train[:,0],'r2_hero':X_pick_train[:,1],'r3_hero':X_pick_train[:,2], 'r4_hero':X_pick_train[:,3],'r5_hero':X_pick_train[:,4], 
        'd1_hero':X_pick_train[:,5], 'd2_hero':X_pick_train[:,6], 'd3_hero':X_pick_train[:,7], 'd4_hero':X_pick_train[:,8], 'd5_hero':X_pick_train[:,9]}
d_test = {'r1_hero':X_pick_test[:,0],'r2_hero':X_pick_test[:,1],'r3_hero':X_pick_test[:,2], 'r4_hero':X_pick_test[:,3],'r5_hero':X_pick_test[:,4], 
        'd1_hero':X_pick_test[:,5], 'd2_hero':X_pick_test[:,6], 'd3_hero':X_pick_test[:,7], 'd4_hero':X_pick_test[:,8], 'd5_hero':X_pick_test[:,9]}
bag_to_concat_train = pd.DataFrame(data = d_train, index=X_train_category_out.index)
bag_to_concat_test = pd.DataFrame(data = d_test, index=X_test_category_out.index)

In [16]:
X_train_with_bag = pd.concat([X_train_category_out, bag_to_concat_train], axis=1)
X_test_with_bag = pd.concat([X_test_category_out, bag_to_concat_test], axis=1)
print(X_train_with_bag.isnull().values.any())
print(X_test_with_bag.isnull().values.any())

False
False


In [17]:
scores_logistics = []
for c in C_param_range:
    clf4 = LogisticRegression(penalty='l1', C=c)
    clf4.fit(X_train_with_bag, y_train)
    cur = roc_auc_score(y_test, clf4.predict_proba(X_test_with_bag)[:, 1])
    print(cur)
    scores_logistics.append(cur)
best_c = C_param_range[scores_logistics.index(max(scores_logistics))]
print('best c: {}, score: {}'.format(best_c, max(scores_logistics)))

0.7428051252467623
0.7466040741583618
0.753714004456276
0.7517092094608541
0.7510868127040915
0.7512879273023653
best c: 0.1, score: 0.753714004456276


In [18]:
# X_train_with_bag.shape
X_train_category_out.shape

(7779, 91)

### Итог: Предсказание победы команды Radiant с помощью лучшей из изученных моделей 

In [19]:
features_result = pd.read_csv('./features_test.csv', index_col='match_id')

In [20]:
test = features_result.fillna(0)
drop_columns = ['lobby_type','r1_hero','r2_hero','r3_hero', 'r4_hero', 
                'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
test_category_out = test.drop(drop_columns, axis=1)

In [21]:
pick_test = np.zeros((test.shape[0], 112))

# X_train.loc[match_id, 'r%d_hero' % (p+1)]
for i, match_id in enumerate(test.index):
    for p in range(5):
        pick_test[i, test.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        pick_test[i, test.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [22]:
d_test_1 = {'r1_hero':pick_test[:,0],'r2_hero':pick_test[:,1],'r3_hero':pick_test[:,2], 'r4_hero':pick_test[:,3],'r5_hero':pick_test[:,4], 
        'd1_hero':pick_test[:,5], 'd2_hero':pick_test[:,6], 'd3_hero': pick_test[:,7], 'd4_hero':pick_test[:,8], 'd5_hero':pick_test[:,9]}
bag_to_concat_test_1 = pd.DataFrame(data = d_test_1, index=test_category_out.index)

In [23]:
test_with_bag = pd.concat([test_category_out, bag_to_concat_test_1], axis=1)
print(test_with_bag.isnull().values.any())

False


In [24]:
#best algorithm
result = clf4.predict_proba(test_with_bag)[:, 1]

In [25]:
d_kaggle = {'match_id': test_with_bag.index, 'radiant_win': result}
kaggle = pd.DataFrame(data = d_kaggle)

In [29]:
kaggle.to_csv('dota_for_kaggle_2', index=None)
#result: 0.72273