### Combine f5 with heroes id and delete old heores id

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from plotly import tools
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import warnings
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import eli5
from sklearn import decomposition
import lightgbm as lgb
import xgboost as xgb

import os
from IPython.display import display_html

import json
from tqdm import tqdm_notebook

In [2]:
df_train_pickle = pd.read_pickle("./train_features_5.pkl")
df_test_pickle = pd.read_pickle("./test_features_5.pkl")

In [3]:
X_heroes_train = pd.read_pickle("./train_heroes_id.pkl")
X_heroes_test = pd.read_pickle("./test_heroes_id.pkl")

In [4]:
df_train_pickle.head(3)

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,total_damage_received_ratio,r_std_damage_received,d_std_damage_received,std_damage_received_ratio,r_mean_damage_received,d_mean_damage_received,mean_damage_received_ratio,radiant_tower_kills,dire_tower_kills,diff_tower_kills
0,155.0,22.0,7.0,1.0,11.0,11.0,0.0,0.0,0.0,0.0,...,2.727974,375.188353,244.929582,1.531821,848.4,311.0,2.727974,0.0,0.0,0.0
1,658.0,4.0,0.0,3.0,10.0,15.0,7.0,2.0,0.0,7.0,...,0.866879,2286.08635,2169.544929,1.053717,4240.6,4891.8,0.866879,2.0,0.0,2.0
2,21.0,23.0,0.0,0.0,0.0,101.0,0.0,0.0,0.0,0.0,...,0.971429,30.410524,31.304952,0.971429,13.6,14.0,0.971429,0.0,0.0,0.0


In [5]:
df_train_pickle.columns

Index(['game_time', 'game_mode', 'lobby_type', 'objectives_len', 'chat_len',
       'r1_hero_id', 'r1_kills', 'r1_deaths', 'r1_assists', 'r1_denies',
       ...
       'total_damage_received_ratio', 'r_std_damage_received',
       'd_std_damage_received', 'std_damage_received_ratio',
       'r_mean_damage_received', 'd_mean_damage_received',
       'mean_damage_received_ratio', 'radiant_tower_kills', 'dire_tower_kills',
       'diff_tower_kills'],
      dtype='object', length=569)

In [6]:
X_heroes_train.head(3)

Unnamed: 0_level_0,id001,id002,id003,id004,id005,id006,id007,id008,id009,id010,...,id107,id108,id109,id110,id111,id112,id113,id114,id119,id120
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6db558535151ea18ca70a6892197db41,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X_heroes_train.index = df_train_pickle.index

In [8]:
X_heroes_train.head(3)

Unnamed: 0,id001,id002,id003,id004,id005,id006,id007,id008,id009,id010,...,id107,id108,id109,id110,id111,id112,id113,id114,id119,id120
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X_heroes_test.index = df_test_pickle.index

In [10]:
heroes_ids = [f'{t}{i}_hero_id' for t in ['r', 'd'] for i in range(1, 6)]
heroes_ids

['r1_hero_id',
 'r2_hero_id',
 'r3_hero_id',
 'r4_hero_id',
 'r5_hero_id',
 'd1_hero_id',
 'd2_hero_id',
 'd3_hero_id',
 'd4_hero_id',
 'd5_hero_id']

In [11]:
X_train = df_train_pickle.copy()
X_train.drop(columns = heroes_ids, inplace = True)
X_train.head(3)

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_kills,r1_deaths,r1_assists,r1_denies,r1_gold,...,total_damage_received_ratio,r_std_damage_received,d_std_damage_received,std_damage_received_ratio,r_mean_damage_received,d_mean_damage_received,mean_damage_received_ratio,radiant_tower_kills,dire_tower_kills,diff_tower_kills
0,155.0,22.0,7.0,1.0,11.0,0.0,0.0,0.0,0.0,543.0,...,2.727974,375.188353,244.929582,1.531821,848.4,311.0,2.727974,0.0,0.0,0.0
1,658.0,4.0,0.0,3.0,10.0,7.0,2.0,0.0,7.0,5257.0,...,0.866879,2286.08635,2169.544929,1.053717,4240.6,4891.8,0.866879,2.0,0.0,2.0
2,21.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,176.0,...,0.971429,30.410524,31.304952,0.971429,13.6,14.0,0.971429,0.0,0.0,0.0


In [12]:
X_test = df_test_pickle.copy()
X_test.drop(columns = heroes_ids, inplace = True)
X_test.head(3)

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_kills,r1_deaths,r1_assists,r1_denies,r1_gold,...,total_damage_received_ratio,r_std_damage_received,d_std_damage_received,std_damage_received_ratio,r_mean_damage_received,d_mean_damage_received,mean_damage_received_ratio,radiant_tower_kills,dire_tower_kills,diff_tower_kills
0,23.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115.0,...,0.390821,50.415275,194.104869,0.259732,56.2,143.8,0.390821,0.0,0.0,0.0
1,1044.0,22.0,7.0,12.0,6.0,3.0,5.0,7.0,1.0,5864.0,...,1.045873,2383.98448,2904.398389,0.820819,10328.0,9875.0,1.045873,7.0,2.0,5.0
2,1091.0,22.0,7.0,6.0,1.0,3.0,1.0,7.0,1.0,4351.0,...,0.809324,3028.941284,2068.178716,1.464545,7374.4,9111.8,0.809324,4.0,1.0,3.0


In [13]:
X_train_full = pd.concat([X_train, X_heroes_train], axis = 1)
X_train_full.head(3)

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_kills,r1_deaths,r1_assists,r1_denies,r1_gold,...,id107,id108,id109,id110,id111,id112,id113,id114,id119,id120
0,155.0,22.0,7.0,1.0,11.0,0.0,0.0,0.0,0.0,543.0,...,0,0,0,0,0,0,0,0,0,0
1,658.0,4.0,0.0,3.0,10.0,7.0,2.0,0.0,7.0,5257.0,...,0,0,0,0,0,0,0,0,0,0
2,21.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,176.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_test_full = pd.concat([X_test, X_heroes_test], axis = 1)
X_test_full.head(3)

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_kills,r1_deaths,r1_assists,r1_denies,r1_gold,...,id107,id108,id109,id110,id111,id112,id113,id114,id119,id120
0,23.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115.0,...,0,0,0,0,0,0,0,0,0,-1
1,1044.0,22.0,7.0,12.0,6.0,3.0,5.0,7.0,1.0,5864.0,...,0,0,0,0,0,0,0,0,-1,0
2,1091.0,22.0,7.0,6.0,1.0,3.0,1.0,7.0,1.0,4351.0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
columns = list(X_train_full.columns)
columns

['game_time',
 'game_mode',
 'lobby_type',
 'objectives_len',
 'chat_len',
 'r1_kills',
 'r1_deaths',
 'r1_assists',
 'r1_denies',
 'r1_gold',
 'r1_lh',
 'r1_xp',
 'r1_health',
 'r1_max_health',
 'r1_max_mana',
 'r1_level',
 'r1_x',
 'r1_y',
 'r1_stuns',
 'r1_creeps_stacked',
 'r1_camps_stacked',
 'r1_rune_pickups',
 'r1_firstblood_claimed',
 'r1_teamfight_participation',
 'r1_towers_killed',
 'r1_roshans_killed',
 'r1_obs_placed',
 'r1_sen_placed',
 'r1_ability_level',
 'r1_max_hero_hit',
 'r1_purchase_count',
 'r1_count_ability_use',
 'r1_damage_dealt',
 'r1_damage_received',
 'r2_kills',
 'r2_deaths',
 'r2_assists',
 'r2_denies',
 'r2_gold',
 'r2_lh',
 'r2_xp',
 'r2_health',
 'r2_max_health',
 'r2_max_mana',
 'r2_level',
 'r2_x',
 'r2_y',
 'r2_stuns',
 'r2_creeps_stacked',
 'r2_camps_stacked',
 'r2_rune_pickups',
 'r2_firstblood_claimed',
 'r2_teamfight_participation',
 'r2_towers_killed',
 'r2_roshans_killed',
 'r2_obs_placed',
 'r2_sen_placed',
 'r2_ability_level',
 'r2_max_hero_h

In [24]:
columns.index('d5_damage_received')

294

In [27]:
columns_to_drop = columns[5:295]
columns_to_drop

['r1_kills',
 'r1_deaths',
 'r1_assists',
 'r1_denies',
 'r1_gold',
 'r1_lh',
 'r1_xp',
 'r1_health',
 'r1_max_health',
 'r1_max_mana',
 'r1_level',
 'r1_x',
 'r1_y',
 'r1_stuns',
 'r1_creeps_stacked',
 'r1_camps_stacked',
 'r1_rune_pickups',
 'r1_firstblood_claimed',
 'r1_teamfight_participation',
 'r1_towers_killed',
 'r1_roshans_killed',
 'r1_obs_placed',
 'r1_sen_placed',
 'r1_ability_level',
 'r1_max_hero_hit',
 'r1_purchase_count',
 'r1_count_ability_use',
 'r1_damage_dealt',
 'r1_damage_received',
 'r2_kills',
 'r2_deaths',
 'r2_assists',
 'r2_denies',
 'r2_gold',
 'r2_lh',
 'r2_xp',
 'r2_health',
 'r2_max_health',
 'r2_max_mana',
 'r2_level',
 'r2_x',
 'r2_y',
 'r2_stuns',
 'r2_creeps_stacked',
 'r2_camps_stacked',
 'r2_rune_pickups',
 'r2_firstblood_claimed',
 'r2_teamfight_participation',
 'r2_towers_killed',
 'r2_roshans_killed',
 'r2_obs_placed',
 'r2_sen_placed',
 'r2_ability_level',
 'r2_max_hero_hit',
 'r2_purchase_count',
 'r2_count_ability_use',
 'r2_damage_dealt',
 'r

In [28]:
X_train_reduced = X_train_full.copy()
X_train_reduced.drop(columns = columns_to_drop, inplace = True)
X_train_reduced.head(3)

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r_total_kills,d_total_kills,total_kills_ratio,r_std_kills,d_std_kills,...,id107,id108,id109,id110,id111,id112,id113,id114,id119,id120
0,155.0,22.0,7.0,1.0,11.0,0.0,1.0,0.0,0.0,0.447214,...,0,0,0,0,0,0,0,0,0,0
1,658.0,4.0,0.0,3.0,10.0,16.0,3.0,5.333333,2.48998,0.547723,...,0,0,0,0,0,0,0,0,0,0
2,21.0,23.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
X_test_reduced = X_test_full.copy()
X_test_reduced.drop(columns = columns_to_drop, inplace = True)
X_test_reduced.head(3)

Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r_total_kills,d_total_kills,total_kills_ratio,r_std_kills,d_std_kills,...,id107,id108,id109,id110,id111,id112,id113,id114,id119,id120
0,23.0,4.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,0,0,0,0,0,0,0,0,0,-1
1,1044.0,22.0,7.0,12.0,6.0,31.0,11.0,2.818182,2.387467,2.04939,...,0,0,0,0,0,0,0,0,-1,0
2,1091.0,22.0,7.0,6.0,1.0,22.0,13.0,1.692308,1.516575,0.894427,...,0,0,0,0,0,0,0,0,0,0


In [31]:
X_test_reduced.fillna(0)
X_train_reduced.fillna(0);

In [32]:
columns_to_cat = columns[-115:]
columns_to_cat

['id001',
 'id002',
 'id003',
 'id004',
 'id005',
 'id006',
 'id007',
 'id008',
 'id009',
 'id010',
 'id011',
 'id012',
 'id013',
 'id014',
 'id015',
 'id016',
 'id017',
 'id018',
 'id019',
 'id020',
 'id021',
 'id022',
 'id023',
 'id025',
 'id026',
 'id027',
 'id028',
 'id029',
 'id030',
 'id031',
 'id032',
 'id033',
 'id034',
 'id035',
 'id036',
 'id037',
 'id038',
 'id039',
 'id040',
 'id041',
 'id042',
 'id043',
 'id044',
 'id045',
 'id046',
 'id047',
 'id048',
 'id049',
 'id050',
 'id051',
 'id052',
 'id053',
 'id054',
 'id055',
 'id056',
 'id057',
 'id058',
 'id059',
 'id060',
 'id061',
 'id062',
 'id063',
 'id064',
 'id065',
 'id066',
 'id067',
 'id068',
 'id069',
 'id070',
 'id071',
 'id072',
 'id073',
 'id074',
 'id075',
 'id076',
 'id077',
 'id078',
 'id079',
 'id080',
 'id081',
 'id082',
 'id083',
 'id084',
 'id085',
 'id086',
 'id087',
 'id088',
 'id089',
 'id090',
 'id091',
 'id092',
 'id093',
 'id094',
 'id095',
 'id096',
 'id097',
 'id098',
 'id099',
 'id100',
 'id101',


In [35]:
X_train_reduced[columns_to_cat].astype('category')
X_test_reduced[columns_to_cat].astype('category');

In [45]:
pd.to_pickle(X_train_reduced, "./train_features_6.pkl")
pd.to_pickle(X_test_reduced, "./test_features_6.pkl")

In [None]:
np.isnan(X_train_reduced)

## Train the model !

In [38]:
def train_model(X, X_test, y, params, folds, model_type='lgb', plot_feature_importance=False, averaging='usual', model=None):
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.loc[train_index], X.loc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        
        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)
            
            model = lgb.train(params,
                    train_data,
                    num_boost_round=20000,
                    valid_sets = [train_data, valid_data],
                    verbose_eval=1000,
                    early_stopping_rounds = 200)
            
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)
            
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X_train.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X_train.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X_train.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_train.columns), ntree_limit=model.best_ntree_limit)
        
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict_proba(X_valid).reshape(-1,)
            score = roc_auc_score(y_valid, y_pred_valid)
            # print(f'Fold {fold_n}. AUC: {score:.4f}.')
            # print('')
            
            y_pred = model.predict_proba(X_test)[:, 1]
            
        if model_type == 'glm':
            model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            model_results = model.fit()
            model_results.predict(X_test)
            y_pred_valid = model_results.predict(X_valid).reshape(-1,)
            score = roc_auc_score(y_valid, y_pred_valid)
            
            y_pred = model_results.predict(X_test)
            
        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000, learning_rate=0.05, loss_function='Logloss',  eval_metric='AUC', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]
            
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values  
        
        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_fold
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
        
            return oof, prediction, feature_importance
        return oof, prediction, scores
    
    else:
        return oof, prediction, scores

In [39]:
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

params = {'boost': 'gbdt',
          'feature_fraction': 0.05,
          'learning_rate': 0.01,
          'max_depth': -1,  
          'metric':'auc',
          'min_data_in_leaf': 50,
          'num_leaves': 32,
          'num_threads': -1,
          'verbosity': 1,
          'objective': 'binary'
         }

In [40]:
PATH_TO_DATA = '../../data/dota_2/'

df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            'train_targets.csv'), 
                                   index_col='match_id_hash')

y = df_train_targets['radiant_win'].values

In [42]:
import time
oof_lgb, prediction_lgb, scores = train_model(X_train_reduced, X_test_reduced, y, params=params, 
                                              folds=folds, model_type='lgb', 
                                              plot_feature_importance=True)

Fold 0 started at Sun Nov 17 18:44:45 2019


KeyboardInterrupt: 

In [44]:
from sklearn.model_selection import ShuffleSplit, KFold
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=17)

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=17)

#cv_scores_base = cross_val_score(model, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
cv_scores = cross_val_score(model, X_train_reduced.values, y, 
                                     cv=cv, scoring='roc_auc', n_jobs=-1)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').