In [2]:
import numpy as np
import pandas as pd

### Композиции и ансамблирование. Введение.

### Задача.
Имеется 3 классификатора:
$$C_1, C_2, C_3$$
$$Accuracy_{(C_1)} = 0.7$$ 
$$Accuracy_{(C_2)} = 0.7$$ 
$$Accuracy_{(C_1)} = 0.7$$

Составим классификатор $C_4$ по принципу голосования (тот класс, который набрал больше голосов, идет в финальное предсказание)

Вопрос.

*Какой accuracy у $C_4$, если известно что классификаторы (базовые алгоритмы) не коррелируют?*

Событие когда классификатор $C_1$ угадал правильно обозначим за $P(C_1)$.

Тогда $P(C_1) = 0.7$. Аналогично $P(C_2) = 0.7, P(C_3) = 0.7$

Рассмотрим полную группу событий.

1. Все 3 классификатра угадали ответ правильно. - Событие X
2. 1 угадал правильно, 2 угадали неправильно. - Событие Y 
3. 2 угадали правильно, 1 угадал неправильно. -Событие Z 
4. Все 3 классификатора угадали неправильно. - Событие K

1. $P(X) = 0.7 * 0.7 * 0.7$
2. $P(Y) = 0.7 * 0.3 * 0.3$
3. $P(Z) = 0.7 * 0.7 * 0.3$
4. $P(K) = 0.3 * 0.3 * 0.3$

In [4]:
0.7*0.7*0.7 + 3*0.3*0.7*0.7

0.7839999999999998

Теперь возьмем 23 классификатора с такой же точностью $0.7$.

Чему будет равна итоговая точность $C_{24}$?

In [6]:
import itertools as it
def compute_ensemble_accuracy(n, accuracy):
    probability = 0
    edge = int(np.ceil(n / 2))
    clfs = [accuracy]*n
    for i in range(edge, n + 1):
        permuted_clfs = len(list(it.combinations(clfs, i)))
        probability_i = permuted_clfs * (accuracy ** i) * ( (1 - accuracy) ** (n - i))
        print ('Permutations: {}, Right: {}, Wrong: {}, Probability: {}'.format(permuted_clfs, i, n - i,  probability_i))
        probability += permuted_clfs * (accuracy ** i) * ( (1 - accuracy) ** (n - i) )
    return probability

In [7]:
compute_ensemble_accuracy(23, 0.7)

Permutations: 1352078, Right: 12, Wrong: 11, Probability: 0.03315217516646879
Permutations: 1144066, Right: 13, Wrong: 10, Probability: 0.06545429455943835
Permutations: 817190, Right: 14, Wrong: 9, Probability: 0.10909049093239724
Permutations: 490314, Right: 15, Wrong: 8, Probability: 0.15272668730535607
Permutations: 245157, Right: 16, Wrong: 7, Probability: 0.17818113518958206
Permutations: 100947, Right: 17, Wrong: 6, Probability: 0.1711936396919514
Permutations: 33649, Right: 18, Wrong: 5, Probability: 0.13315060864929545
Permutations: 8855, Right: 19, Wrong: 4, Probability: 0.0817591456618481
Permutations: 1771, Right: 20, Wrong: 3, Probability: 0.0381542679755291
Permutations: 253, Right: 21, Wrong: 2, Probability: 0.012718089325176363
Permutations: 23, Right: 22, Wrong: 1, Probability: 0.0026977765235222584
Permutations: 1, Right: 23, Wrong: 0, Probability: 0.0002736874734008088


0.978551998453966

Таким образом мы получаем Accuracy = 0.931.

Также этот эффект называется "мудростью толпы".

Теорема Кондорсе, парадокс выбора.

### Стэкинг своими руками

Стэкинг делается следующим образом: берется несколько алгоритмов(случайный лес, линейная регрессия, решающее дерево) и обучается на обучающей выборке. Далее прогнозы этих алгоритмов подаются на вход другого алгоритма(случайный лес, линейная регрессия, решающее дерево), делается предсказание и ответы этого другого алгоритма подаются на вход еще одного алгоритма(и так можно делать до бесконечности).
Однако есть очень сильный шанс переобучиться.


Воспользуемся статьей Александра Дьяконова: https://alexanderdyakonov.wordpress.com/2017/03/10/c%D1%82%D0%B5%D0%BA%D0%B8%D0%BD%D0%B3-stacking-%D0%B8-%D0%B1%D0%BB%D0%B5%D0%BD%D0%B4%D0%B8%D0%BD%D0%B3-blending/

<img src='stacking-2b.png'>

In [29]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import BaggingClassifier

In [30]:
train_matches = pd.read_csv('train.csv')
test_matches = pd.read_csv('test.csv')
train = train_matches.copy()
train = train.drop('radiant_won', axis=1)
test = test_matches.copy()

In [31]:
heroes = pd.read_csv('heroes.csv')
heroes_list = ['player_{}'.format(i) for i in range(10)]

train = pd.merge(train, heroes, on='mid', how='left')
test = pd.merge(test, heroes, on='mid', how='left')

In [32]:
train[heroes_list].apply(lambda x: x.unique().shape[0], axis = 0)

player_0    111
player_1    110
player_2    111
player_3    111
player_4    111
player_5    111
player_6    111
player_7    111
player_8    111
player_9    111
dtype: int64

In [33]:
test[heroes_list].apply(lambda x: x.unique().shape[0], axis = 0)

player_0    111
player_1    111
player_2    111
player_3    110
player_4    111
player_5    110
player_6    110
player_7    111
player_8    111
player_9    111
dtype: int64

In [34]:
keys = np.unique(train[heroes_list[1:]].values)
X_pick = np.zeros((train.shape[0], 111))
for i, match_id in enumerate(train.index):
    for p in range(5):
        key = train.ix[match_id, 'player_{}'.format(p)]
        X_pick[i, np.where(keys==key)[0][0]] = 1
        
        key = train.ix[match_id, 'player_{}'.format(p+5)]
        X_pick[i, np.where(keys==key)[0][0]] = -1

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


In [35]:
keys = np.unique(test[heroes_list[1:]].values)
X_pick_test = np.zeros((test.shape[0], 111))
for i, match_id in enumerate(test.index):
    for p in range(5):
        key = test.ix[match_id, 'player_{}'.format(p)]
        X_pick_test[i, np.where(keys==key)[0][0]] = 1
        
        key = test.ix[match_id, 'player_{}'.format(p+5)]
        X_pick_test[i, np.where(keys==key)[0][0]] = -1
        

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


In [36]:
train = train.drop(heroes_list, axis=1)
test = test.drop(heroes_list, axis=1)

In [37]:
def get_features_by_time(data, timestamp, name, filter_list=None):
    new_data = data[data.times==timestamp]
    new_data = new_data.drop('times', axis = 1)
    
    radiant_data = new_data[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].sum(axis=1)
    dire_data = new_data[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].sum(axis=1)
    
    radiant_data_mean = new_data[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].mean(axis=1)
    dire_data_mean = new_data[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].mean(axis=1)
    
    radiant_data_std = new_data[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].std(axis=1)
    dire_data_std = new_data[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].std(axis=1)
    
    new_data['radiant_data'] = radiant_data
    new_data['dire_data'] = dire_data
    
    new_data['radiant_data_mean'] = radiant_data_mean
    new_data['dire_data_mean'] = dire_data_mean
    
    new_data['radiant_data_std'] = radiant_data_std
    new_data['dire_data_std'] = dire_data_std
    
    new_data['diff_data'] = new_data['radiant_data'] - new_data['dire_data']
    new_data['ratio_data'] = new_data['radiant_data'] / new_data['dire_data']
    
    new_data['diff_data_mean'] = new_data['radiant_data_mean'] - new_data['dire_data_mean']
    new_data['diff_data_std'] = new_data['radiant_data_std'] - new_data['dire_data_std']
    
    new_data.rename(columns=lambda x: x + '_'+str(name), inplace=True)
    new_data.rename(columns={'mid'+'_'+str(name):'mid'}, inplace=True)
    
    if (filter_list==None):
        return new_data
    filter_list = [x + '_'+str(name) if x !='mid' else x for x in filter_list ]
    return new_data[filter_list]

def merge_features(data, features):
    return pd.merge(data, features, on='mid', how='left')

In [38]:
filterNone = None
filter1 = ['radiant_data','dire_data','diff_data','ratio_data','mid']
filter2 = ['radiant_data','mid']
filter3 = ['diff_data','ratio_data','mid']
filter4 = ['ratio_data','mid']

In [39]:
gold = pd.read_csv('gold.csv')
lh = pd.read_csv('lh.csv')
xp = pd.read_csv('xp.csv')

In [40]:
active_filter = filterNone
timesstamps = [600]#[60,120,180,240,300,360,420,480,540,600]
for t in timesstamps:    
    gold_features = get_features_by_time(gold, t, 'gold', active_filter)
    lh_features = get_features_by_time(lh, t, 'lh', active_filter)
    xp_features = get_features_by_time(xp, t, 'xp', active_filter)
    
    train = merge_features(train, gold_features)
    train = merge_features(train, lh_features)
    train = merge_features(train, xp_features)
    
    test = merge_features(test, gold_features)
    test = merge_features(test, lh_features)
    test = merge_features(test, xp_features)

In [41]:
timesstamps = [60,120,180,240,300,360,420,480,540,600]
for t in timesstamps:    
    gold_features = get_features_by_time(gold, t, 'gold', ['radiant_data','dire_data','mid'])
    
    gold_features.rename(columns=lambda x: x + '_'+str(t), inplace=True)
    gold_features.rename(columns={'mid'+'_'+str(t):'mid'}, inplace=True)
    
    train = merge_features(train, gold_features)
    test = merge_features(test, gold_features)

In [42]:
train.head()

Unnamed: 0,mid,player_0_gold,player_1_gold,player_2_gold,player_3_gold,player_4_gold,player_5_gold,player_6_gold,player_7_gold,player_8_gold,...,radiant_data_gold_360,dire_data_gold_360,radiant_data_gold_420,dire_data_gold_420,radiant_data_gold_480,dire_data_gold_480,radiant_data_gold_540,dire_data_gold_540,radiant_data_gold_600,dire_data_gold_600
0,0,3454,5206,2613,4426,5755,4072,3997,5917,1725,...,11627,13323,13499,15333,17209,17605,19653,19787,21454,22095
1,1,2477,5760,3816,4353,5759,7659,5066,2748,4440,...,12234,13640,14879,16494,17669,18691,20039,21734,22165,24536
2,2,3604,1948,8581,4390,2869,3096,2301,5130,2530,...,12418,9386,14482,10755,16601,12156,18428,14495,21392,15548
3,4,3675,4103,5154,3030,2076,3920,3494,3392,4458,...,9645,11517,11999,12680,13822,14971,15713,16665,18038,17484
4,5,4252,2412,2545,4264,2544,4752,5389,4954,3954,...,9513,11903,11434,14350,12643,16134,14361,19035,16017,22041


In [43]:
radiant_diff = ['radiant_data_gold_60','radiant_data_gold_120','radiant_data_gold_180','radiant_data_gold_240','radiant_data_gold_300','radiant_data_gold_360','radiant_data_gold_420','radiant_data_gold_480','radiant_data_gold_540','radiant_data_gold_600']
radiant_data = np.gradient(train[radiant_diff],axis=1)

In [44]:
dire_diff = ['dire_data_gold_60','dire_data_gold_120','dire_data_gold_180','dire_data_gold_240','dire_data_gold_300','dire_data_gold_360','dire_data_gold_420','dire_data_gold_480','dire_data_gold_540','dire_data_gold_600']
dire_data = np.gradient(train[dire_diff],axis=1)

In [45]:
train = train.drop(radiant_diff,axis=1)
train = train.drop(dire_diff,axis=1)

In [46]:
def sort_quantity_feature(data, ls1, ls2, drop=False):
    new_ls1 = [x+str('_sorted') for x in ls1]
    new_ls2 = [x+str('_sorted') for x in ls2]
    
    for match_id in data.index:
        key = data.loc[match_id]
        s = sorted(key[ls1])
        for i, el in enumerate(s):
            data.loc[match_id,new_ls1[i]] = s[i]
            
        s = sorted(key[ls2])
        for i, el in enumerate(s):
            data.loc[match_id,new_ls2[i]] = s[i]
    if drop:
        data = data.drop(ls1,axis=1)
        data = data.drop(ls2,axis=1)

In [49]:
train_full = pd.read_csv('train10.csv')
train_full = train_full.drop('Unnamed: 0',axis=1)

In [50]:
train_full.head()

Unnamed: 0,mid,radiant_data_gold,dire_data_gold,diff_data_gold,ratio_data_gold,radiant_data_lh,dire_data_lh,diff_data_lh,ratio_data_lh,radiant_data_xp,...,item_111,item_112,item_113,item_114,item_115,item_116,item_117,item_118,item_119,item_120
0,0,21454,22095,-641,0.970989,148,192,-44,0.770833,15856,...,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
1,1,22165,24536,-2371,0.903366,157,144,13,1.090278,15231,...,0.0,-1.0,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0
2,2,21392,15548,5844,1.375868,174,99,75,1.757576,18003,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,18038,17484,554,1.031686,143,101,42,1.415842,15334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,16017,22041,-6024,0.726691,96,145,-49,0.662069,11391,...,0.0,1.0,-1.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0


In [51]:
x_train = train_full.values
y_train = train_matches.radiant_won.values

In [52]:
clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
ls = cross_val_score(clf2, x_train, y_train, cv=5, scoring='roc_auc')
print(ls)
np.mean(ls)

[ 0.70540149  0.70617726  0.69778468  0.69231158  0.69818708]


0.69997241968521795

In [53]:
clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1),
        LogisticRegression(C=1000,class_weight=None,tol=1e-05, n_jobs=-1, penalty='l2'),
        LogisticRegression(C=1000,class_weight=None,tol=1e-05, n_jobs=-1, penalty='l1')]

In [54]:
dataset_blend_train = np.zeros((x_train.shape[0], len(clfs)))

In [55]:
for j, clf in enumerate(clfs):
    print (j, clf)
    clf.fit(x_train, y_train)
    y_submission = clf.predict_proba(x_train)[:, 1]
    dataset_blend_train[:, j] = y_submission

0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
1 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-05,
          verbose=0, warm_start=False)


  " = {}.".format(self.n_jobs))


2 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-05,
          verbose=0, warm_start=False)


  " = {}.".format(self.n_jobs))


In [56]:
dataset_blend_train

array([[ 0.79      ,  0.62541628,  0.63116392],
       [ 0.19      ,  0.41167685,  0.4299593 ],
       [ 0.92      ,  0.86243693,  0.8663499 ],
       ..., 
       [ 0.84      ,  0.29107294,  0.29296397],
       [ 0.2       ,  0.52584289,  0.52574765],
       [ 0.12      ,  0.17203933,  0.17056315]])

In [57]:
clf2 = LogisticRegression(C=100,class_weight=None,tol=1e-05, random_state=1234, n_jobs=-1)
ls = cross_val_score(clf2, dataset_blend_train, y_train, cv=5, scoring='roc_auc')
print(ls)
np.mean(ls)

[ 1.  1.  1.  1.  1.]


  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


1.0

In [58]:
kf = StratifiedKFold(n_splits=5,shuffle=False)
for j, clf in enumerate(clfs):
    print (j, clf)
    for i, (train_, test_) in enumerate(kf.split(x_train, y_train)):
        print ("Fold", i)
        X_tr = x_train[train_]
        y_tr = y_train[train_]
        X_te = x_train[test_]
        y_te = y_train[test_]
        clf.fit(X_tr, y_tr)
        y_submission = clf.predict_proba(X_te)[:, 1]
        dataset_blend_train[test_, j] = y_submission

0 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
1 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-05,
          verbose=0, warm_start=False)
Fold 0


  " = {}.".format(self.n_jobs))


Fold 1
Fold 2
Fold 3
Fold 4
2 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-05,
          verbose=0, warm_start=False)
Fold 0


  " = {}.".format(self.n_jobs))


Fold 1
Fold 2
Fold 3
Fold 4


In [59]:
dataset_blend_train

array([[ 0.54      ,  0.60015406,  0.61506819],
       [ 0.4       ,  0.45835682,  0.4848695 ],
       [ 0.57      ,  0.86152048,  0.86655162],
       ..., 
       [ 0.49      ,  0.26021496,  0.26707557],
       [ 0.5       ,  0.52159219,  0.5195483 ],
       [ 0.26      ,  0.16977267,  0.16895052]])

In [60]:
clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
ls = cross_val_score(clf2, x_train, y_train, cv=5, scoring='roc_auc')
print(ls)
np.mean(ls)

[ 0.70338126  0.70572479  0.69920753  0.69357057  0.70512031]


0.70140089238096515

In [61]:
clf2 = LogisticRegression(C=100,class_weight=None,tol=1e-05, random_state=1234, n_jobs=-1)
ls = cross_val_score(clf2, dataset_blend_train, y_train, cv=5, scoring='roc_auc')
print(ls)
np.mean(ls)

[ 0.71859377  0.71797816  0.71348507  0.70361561  0.71252682]


  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


0.71323988646047964

### Блэндинг своими руками

Блэндинг - это стекинг с количеством фолдов=2. Очень часто используется когда нет достаточно времени, чтобы делать полынй стекинг

<img src='stacking-2b.png'>

### Бэггинг