# Проект: предсказания победителя в онлайн-игре

## Подход 1: градиентный бустинг "в лоб"

### Выполнение

#### 1. Считывание данных

In [1]:
import pandas as pd
df_train = pd.read_csv('task7.1_features.csv', index_col='match_id')
print(f'Shape = {df_train.values.shape}')
df_train.head()

Shape = (97230, 108)


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [2]:
X_train = df_train.loc[:,:'dire_first_ward_time'].copy()
Y_train = df_train.loc[:,'radiant_win'].copy()

#### 2. Проверка пропусков в столбцах

In [3]:
col_null = df_train.isnull().any()
col_null[col_null]

first_blood_time               True
first_blood_team               True
first_blood_player1            True
first_blood_player2            True
radiant_bottle_time            True
radiant_courier_time           True
radiant_flying_courier_time    True
radiant_first_ward_time        True
dire_bottle_time               True
dire_courier_time              True
dire_flying_courier_time       True
dire_first_ward_time           True
dtype: bool

#### 3. Замена пропусков на нули

In [4]:
X_train = X_train.fillna(0)

#### 4. Столбец с целевой переменной

In [5]:
df_train[['radiant_win']].head()

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
0,1
1,1
2,0
3,0
4,0


#### 5. Обучение и валидация

In [6]:
import datetime
import time

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import KFold, cross_validate

nums = [10, 20, 30, 40, 50, 60]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
clf_gb = dict()
for n in nums:
    clf = GradientBoostingClassifier(n_estimators=n, random_state=42)
    start_time = datetime.datetime.now()
    cv = cross_validate(clf, X_train, Y_train, cv=kf, n_jobs=-1, scoring=make_scorer(roc_auc_score))
    end_time = datetime.datetime.now() - start_time
    print(f"n={n}\ttime={end_time}\tscore={cv['test_score'].mean():.3f}")
    clf_gb[n] = (clf, cv, end_time)

n=10	time=0:00:41.474688	score=0.608
n=20	time=0:01:06.441149	score=0.625
n=30	time=0:01:34.078331	score=0.632
n=40	time=0:01:59.085068	score=0.636
n=50	time=0:02:27.790080	score=0.638
n=60	time=0:02:58.878615	score=0.640


### Отчёт

#### 1. Признаки с пропусками значений
Столбцы с пропусками:

first_blood_time, first_blood_team, first_blood_player1, first_blood_player2, radiant_bottle_time, radiant_courier_time, radiant_flying_courier_time, radiant_first_ward_time, dire_bottle_time, dire_courier_time, dire_flying_courier_time, dire_first_ward_time

first_blood_*: "первая кровь" фиксируется только в первые 5 минут матча. А бывает такое, что это случается позже.

radiant_first_ward_time: "светлые" не устанавливали "наблюдателя" в первые 5 минут некоторых матчей.

#### 2. Столбец с целевой переменной
radiant_win

#### 3. Кросс-валидация
С 30 деревьями кросс-валидация для градиентного бустинга проводилась 1 минута 34 секунды. Метрика AUC-ROC получилась 0.632.

#### 4. Вывод
Да, при увеличении количества деревьев качество растёт. Для скорости можно попробовать уменьшить размерность.

## Подход 2: логистическая регрессия

### Выполнение

#### 1. Обучение и валидация

In [7]:
import datetime
import time

import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [8]:
kf = kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid = {'C': np.power(10.0, np.arange(-5, 6))}

In [9]:
clf_l1 = LogisticRegressionCV(Cs=grid['C'], cv=kf, random_state=42, scoring='roc_auc', n_jobs=-1, penalty='l2')

start_time = datetime.datetime.now()
clf_l1.fit(X_train, Y_train)
end_time = datetime.datetime.now() - start_time

print(f'{end_time}')

0:00:12.919795


In [10]:
clf_l1.predict(X_train)[:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [11]:
clf_l1.scores_[1].mean(axis=0)

array([0.51344762, 0.51344762, 0.51344762, 0.51344762, 0.51344762,
       0.51344762, 0.51344762, 0.51344762, 0.51344762, 0.51344762,
       0.51344762])

#### 2. Удаление категориальных признаков

In [12]:
col_to_remove = ['lobby_type']
r_to_remove = [f'r{i+1}_hero' for i in range(5)]
d_to_remove = [f'd{i+1}_hero' for i in range(5)]

col_to_remove += r_to_remove
col_to_remove += d_to_remove

X_train_clear = X_train.drop(columns=col_to_remove)
scaler = StandardScaler()
X_train_scaled = X_train_clear.copy().astype(float)
X_train_scaled.loc[:,:] = scaler.fit_transform(X_train_scaled)
X_train_scaled.head()

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,...,-0.987486,1.066448,-0.041743,-0.262922,0.640648,0.018054,0.562864,-0.551154,1.846004,-1.121494
1,-2.540452,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,...,-0.987486,-0.338591,0.578946,-0.262922,0.379585,1.066668,0.562864,0.67817,0.437788,0.043947
2,-2.539231,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,...,0.391203,-0.823968,-0.824352,0.158654,0.640648,0.018054,0.562864,0.67817,0.437788,0.490286
3,-2.532622,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,...,-0.987486,-0.594053,0.241615,-0.022021,0.269135,-1.554868,0.562864,-0.551154,-0.970428,0.837439
4,-2.529221,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,...,-0.987486,1.347455,1.024223,-0.022021,0.680811,1.590976,-0.302485,0.67817,-0.970428,-0.228816


In [13]:
clf_l2 = LogisticRegressionCV(Cs=grid['C'], cv=kf, random_state=42, scoring='roc_auc', n_jobs=-1, penalty='l2')

start_time = datetime.datetime.now()
clf_l2.fit(X_train_scaled, Y_train)
end_time = datetime.datetime.now() - start_time

print(f'{end_time}')
print(f'{clf_l2.C_}')

0:00:30.034782
[0.01]


In [14]:
l_scores = clf_l2.scores_[1].mean(axis=0)
c_i_min, c_i_max = l_scores.argmin(), l_scores.argmax()
print(f'Minimum score={l_scores[c_i_min]:.3f}\tC={grid["C"][c_i_min]}')
print(f'Maximum score={l_scores[c_i_max]:.3f}\tC={grid["C"][c_i_max]}')

Minimum score=0.695	C=1e-05
Maximum score=0.717	C=0.01


#### 3. Количество уникальных героев

In [15]:
heroes = pd.Series()
for i in range(5):
    heroes = pd.Series(sorted(pd.unique(pd.concat([heroes, X_train[f'r{i+1}_hero'], X_train[f'd{i+1}_hero']]))))
print(f'Всего {len(heroes)} уникальных героев в тренировочной выборке')

Всего 108 уникальных героев в тренировочной выборке


#### 4. "Мешок слов"

In [16]:
for i in heroes:
    col_name_hero = f'hero_{i}'
    r_h = (X_train[r_to_remove] == i).T.any()
    d_h = (X_train[d_to_remove] == i).T.any()
    X_train_scaled[col_name_hero] = pd.Series(np.zeros(X_train.shape[0]), index=X_train.index, dtype=np.int64)
    X_train_scaled.loc[r_h,col_name_hero] = 1
    X_train_scaled.loc[d_h,col_name_hero] = -1
X_train_scaled.loc[:,'hero_1':].head()

Unnamed: 0_level_0,hero_1,hero_2,hero_3,hero_4,hero_5,hero_6,hero_7,hero_8,hero_9,hero_10,...,hero_100,hero_101,hero_102,hero_103,hero_104,hero_105,hero_106,hero_109,hero_110,hero_112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,-1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,-1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 5. Кросс-валидация на новой выборке

In [17]:
clf_l3 = LogisticRegressionCV(Cs=grid['C'], cv=kf, random_state=42, scoring='roc_auc', n_jobs=-1, penalty='l2')

start_time = datetime.datetime.now()
clf_l3.fit(X_train_scaled, Y_train)
end_time = datetime.datetime.now() - start_time

print(f'{end_time}')
print(f'{clf_l3.C_}')

0:01:27.058866
[0.1]


In [18]:
l_scores = clf_l3.scores_[1].mean(axis=0)
c_i_min, c_i_max = l_scores.argmin(), l_scores.argmax()
print(f'Minimum score={l_scores[c_i_min]:.3f}\tC={grid["C"][c_i_min]}')
print(f'Maximum score={l_scores[c_i_max]:.3f}\tC={grid["C"][c_i_max]}')

Minimum score=0.699	C=1e-05
Maximum score=0.752	C=0.1


In [19]:
best_c_scores = clf_l3.scores_[1][:,c_i_max]
print(f'Min score on best C: {best_c_scores.min():.3f}')
print(f'Max score on best C: {best_c_scores.max():.3f}')

Min score on best C: 0.749
Max score on best C: 0.756


#### 6. Предсказание на тестовой выборке

In [20]:
df_test = pd.read_csv('task7.1_features_test.csv', index_col='match_id')
print(f'Shape = {df_test.values.shape}')
df_test.head()

Shape = (17177, 102)


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1430287923,0,93,4,1103,1089,8,0,1,9,...,0,12.0,247.0,-86.0,272.0,3,4,2,0,118.0
7,1430293357,1,20,2,556,570,1,0,0,9,...,2,-29.0,168.0,-54.0,,3,2,2,1,16.0
10,1430301774,1,112,2,751,808,1,0,0,13,...,1,-22.0,46.0,-87.0,186.0,1,3,3,0,-34.0
13,1430323933,1,27,3,708,903,1,1,1,11,...,2,-49.0,30.0,-89.0,210.0,3,4,2,1,-26.0
16,1430331112,1,39,4,1259,661,4,0,0,9,...,0,36.0,180.0,-86.0,180.0,1,3,2,1,-33.0


In [21]:
X_test = df_test.copy()
X_test = X_test.fillna(0)

In [22]:
col_to_remove = ['lobby_type']
r_to_remove = [f'r{i+1}_hero' for i in range(5)]
d_to_remove = [f'd{i+1}_hero' for i in range(5)]

col_to_remove += r_to_remove
col_to_remove += d_to_remove

X_test_clear = X_test.drop(columns=col_to_remove)
scaler = StandardScaler()
X_test_scaled = X_test_clear.copy().astype(float)
X_test_scaled.loc[:,:] = scaler.fit_transform(X_test_scaled)
X_test_scaled.head()

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,-2.514875,0.474746,-0.237743,-0.137094,-0.370005,-0.532732,1.003884,0.305165,-0.368985,0.003979,...,-0.992369,0.450016,1.908352,-0.400363,1.149479,0.017743,0.55426,-0.571552,-0.983356,3.200079
7,-2.513895,-1.36563,-1.207913,-1.25566,-1.14509,-0.532732,-0.5932,0.305165,0.567766,0.023887,...,1.788142,-0.574649,0.836945,1.72308,-1.597294,0.017743,-1.177689,-0.571552,0.433564,0.600867
10,-2.512377,-1.36563,-0.862057,-0.742715,-1.14509,-0.532732,-0.5932,1.898546,-1.305736,-1.375081,...,0.397887,-0.399706,-0.817633,-0.466721,0.281014,-1.038655,-0.311714,0.668978,-0.983356,-0.673257
13,-2.508381,-0.445442,-0.938323,-0.537967,-1.14509,0.968865,1.003884,1.101855,-1.305736,-0.920823,...,1.788142,-1.074486,-1.034626,-0.599436,0.523376,0.017743,0.55426,-0.571552,0.433564,-0.469397
16,-2.507087,0.474746,0.038942,-1.059534,-0.812911,-0.532732,-0.5932,0.305165,1.504517,0.94507,...,-0.992369,1.04982,0.99969,-0.400363,0.220423,-1.038655,-0.311714,-0.571552,0.433564,-0.647774


In [23]:
heroes_test = pd.Series()
for i in range(5):
    heroes_test = pd.Series(sorted(pd.unique(pd.concat([heroes_test, X_test[f'r{i+1}_hero'], X_test[f'd{i+1}_hero']]))))
heroes_test.tail()

103    105
104    106
105    109
106    110
107    112
dtype: int64

In [24]:
for i in heroes_test:
    col_name_hero = f'hero_{i}'
    r_h = (X_test[r_to_remove] == i).T.any()
    d_h = (X_test[d_to_remove] == i).T.any()
    X_test_scaled[col_name_hero] = pd.Series(np.zeros(X_test.shape[0]), index=X_test.index, dtype=np.int64)
    X_test_scaled.loc[r_h,col_name_hero] = 1
    X_test_scaled.loc[d_h,col_name_hero] = -1
X_test_scaled.loc[:,'hero_1':].head()

Unnamed: 0_level_0,hero_1,hero_2,hero_3,hero_4,hero_5,hero_6,hero_7,hero_8,hero_9,hero_10,...,hero_100,hero_101,hero_102,hero_103,hero_104,hero_105,hero_106,hero_109,hero_110,hero_112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,-1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,1
13,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1,0
16,0,0,-1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [25]:
final_res = clf_l3.predict_proba(X_test_scaled)

In [26]:
final_res[:,1]

array([0.82454597, 0.75723033, 0.18788479, ..., 0.23400929, 0.62489449,
       0.42709456])

In [27]:
df_res = pd.DataFrame({'radiant_win':final_res[:,1]}, index=X_test.index, copy=True)
df_res.index.name = 'match_id'
df_res.to_csv('final_test.csv')
df_res.head()

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
6,0.824546
7,0.75723
10,0.187885
13,0.86251
16,0.239961


In [28]:
print(f'Min={df_res.radiant_win.min():.4f}')
print(f'Max={df_res.radiant_win.max():.4f}')

Min=0.0086
Max=0.9965


### Отчёт

#### 1. Обучение и валидация
С категориальными признаками и без скейлинга качество получилось 0.5 -- константный классификатор. Но работает быстрее.

#### 2. Удаление категориальных признаков
После их удаления качество стало 0.717. Это связано с тем, что линейные методы всё воспринимают как числа, что в данном случае не подходит, поэтому пришлось убрать "лишнее".

#### 3. Количество уникальных героев
108 уникальных идентификаторов героев.

#### 4. "Мешок слов"
После добавления "мешка слов" качество стало 0.756, что определённо лучше. Это связано с тем, что для обучения важно, потому что важно с какими героями команда чаще побеждает.

#### 5. Тестовая выборка
Максимальное значение прогноза 0.9965, минимальное -- 0.086. То есть классификатор адекватный и не константный.

## Послесловие
Процессор Intel® Core™ i5-4200U CPU @ 1.60GHz × 4

Интерпретатор Python 3.6.6