In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from plotly import tools
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import warnings
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import eli5
from sklearn import decomposition
import lightgbm as lgb
import xgboost as xgb

import os
from IPython.display import display_html

import json
from tqdm import tqdm_notebook

In [2]:
# Load the pre-computed train/test feature tables from their pickle files.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
X_train, X_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("./train_features_7.pkl", "./test_features_7.pkl")
)

In [6]:
# Standardise the features: the scaler is fitted on the training table only
# and the same fitted transformation is then applied to the test table.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Wrap the scaled arrays in DataFrames. No column names are passed, so the
# frames get default integer labels.
X_train_df, X_test_df = map(pd.DataFrame, (X_train_scaled, X_test_scaled))

In [10]:
# Preview the first three rows of the scaled training frame.
X_train_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,378,379,380,381,382,383,384,385,386,387
0,-1.291823,0.383071,0.683215,-0.851023,0.273937,-1.188395,-1.090415,-0.999064,-1.238755,-0.980373,...,0.001402,0.00339,-0.01492,-0.002408,-0.001563,-0.005492,0.000677,-0.002193,-0.003059,-0.007542
1,-0.63619,-2.471853,-1.463668,-0.542953,0.199121,0.031248,-0.942096,3.770391,0.144414,-0.925769,...,0.001402,0.00339,-0.01492,-0.002408,-0.001563,-0.005492,0.000677,-0.002193,-0.003059,-0.007542
2,-1.466485,0.541678,-1.463668,-1.005058,-0.549034,-1.188395,-1.164575,-0.999064,-1.238755,-1.223333,...,0.001402,0.00339,-0.01492,-0.002408,-0.001563,-0.005492,0.000677,-0.002193,-0.003059,-0.007542


In [20]:
# (rows, columns) of the scaled training frame.
X_train_df.shape

(39675, 388)

In [21]:
# (rows, columns) of the scaled test frame; the column count should match the training frame.
X_test_df.shape

(10000, 388)

In [11]:
# Baseline classifier: logistic regression with the liblinear solver and a fixed seed.
model = LogisticRegression(
    solver='liblinear',
    random_state=42,
)


In [16]:
def train_model(X, X_test, y, params, folds, model_type='lgb', plot_feature_importance=False, averaging='usual', model=None):
    """Train one model per cross-validation fold and aggregate its predictions.

    For every (train, validation) split produced by ``folds`` a model of the
    requested type is fitted, the validation rows are scored (out-of-fold
    predictions) and the test set is scored; test predictions are averaged
    over the folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Rows are selected positionally with the indices
        yielded by ``folds.split``.
    X_test : pandas.DataFrame
        Test features, scored by each fold's model.
    y : array-like
        Target values aligned row-by-row with ``X``.
    params : dict
        Passed to ``lgb.train`` / ``xgb.train`` and unpacked as keyword
        arguments into ``CatBoostClassifier``. Unused for 'sklearn' and 'glm'.
    folds : object
        Splitter exposing ``split(X, y)`` that yields
        ``(train_index, valid_index)`` pairs.
    model_type : {'lgb', 'xgb', 'sklearn', 'glm', 'cat'}
        Which training branch to run.
    plot_feature_importance : bool
        Only used for 'lgb': draw a bar plot of the top 50 features and return
        the importance table instead of the fold scores.
    averaging : {'usual', 'rank'}
        'usual' accumulates raw test predictions, 'rank' accumulates their ranks.
    model : estimator, optional
        Required for ``model_type='sklearn'``; must provide ``fit`` and
        ``predict_proba``. The same object is refitted on every fold.

    Returns
    -------
    tuple
        ``(oof, prediction, feature_importance)`` when ``model_type == 'lgb'``
        and ``plot_feature_importance`` is true, otherwise
        ``(oof, prediction, scores)`` where ``scores`` is the list of per-fold
        validation AUCs.

    Raises
    ------
    ValueError
        If ``model_type`` or ``averaging`` is not one of the supported values,
        if ``model_type='sklearn'`` is requested without ``model``, or if
        ``folds`` yields no splits.
    """
    # Local import: the notebook's import cell never brings `time` into scope.
    import time

    supported_types = ('lgb', 'xgb', 'sklearn', 'glm', 'cat')
    if model_type not in supported_types:
        raise ValueError(f'Unknown model_type {model_type!r}; expected one of {supported_types}.')
    if averaging not in ('usual', 'rank'):
        # Previously an unknown value silently left `prediction` at all zeros.
        raise ValueError(f"Unknown averaging {averaging!r}; expected 'usual' or 'rank'.")
    if model_type == 'sklearn' and model is None:
        raise ValueError("model_type='sklearn' requires an estimator passed via `model`.")

    # Fold indices are positional, so index the target as a plain array.
    y = np.asarray(y)

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    fold_importances = []
    n_folds_run = 0

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        n_folds_run += 1
        # .iloc (not .loc): the splitter yields positions, not index labels.
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)

            model = lgb.train(params,
                    train_data,
                    num_boost_round=20000,
                    valid_sets = [train_data, valid_data],
                    verbose_eval=1000,
                    early_stopping_rounds = 200)

            # Score validation and test rows with the same (best) iteration.
            y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X_train.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X_train.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X_train.columns), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_train.columns), ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            model.fit(X_train, y_train)
            # Keep only the positive-class column; flattening the whole
            # (n, 2) probability matrix would yield 2*n values.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        elif model_type == 'glm':
            # NOTE(review): `sm` (presumably statsmodels.api) is not imported in
            # the visible notebook; import it before using this branch.
            model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            model_results = model.fit()
            # np.asarray: the prediction may come back as a pandas object,
            # which has no .reshape.
            y_pred_valid = np.asarray(model_results.predict(X_valid)).reshape(-1,)
            y_pred = np.asarray(model_results.predict(X_test)).reshape(-1,)

        else:  # model_type == 'cat'
            # NOTE(review): `CatBoostClassifier` is not imported in the visible
            # notebook; import it before using this branch.
            model = CatBoostClassifier(iterations=20000, learning_rate=0.05, loss_function='Logloss',  eval_metric='AUC', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        y_pred_valid = np.asarray(y_pred_valid).reshape(-1,)
        oof[valid_index] = y_pred_valid
        fold_score = roc_auc_score(y_valid, y_pred_valid)
        scores.append(fold_score)

        if model_type == 'sklearn':
            print(f'Fold {fold_n}. AUC: {fold_score:.4f}.')
            print('')

        if averaging == 'usual':
            prediction += y_pred
        else:  # averaging == 'rank'
            prediction += pd.Series(y_pred).rank().values

        if model_type == 'lgb':
            # Per-fold feature importance; concatenated once after the loop.
            fold_importances.append(pd.DataFrame({
                "feature": X.columns,
                "importance": model.feature_importance(),
                "fold": fold_n + 1,
            }))

    if n_folds_run == 0:
        raise ValueError('`folds` produced no splits.')

    # Average over the folds that actually ran instead of a notebook-level global.
    prediction /= n_folds_run

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance = pd.concat(fold_importances, axis=0)
        # NOTE(review): kept from the original -- every per-fold importance is
        # scaled by the fold count before any across-fold aggregation.
        feature_importance["importance"] /= n_folds_run
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');

            return oof, prediction, feature_importance

    return oof, prediction, scores

In [17]:
# Cross-validation setup: 5 shuffled, stratified folds with a fixed seed.
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

# Model hyper-parameters for train_model. This must be a mapping (it is
# unpacked with ** and handed to lgb.train / xgb.train), so start from an
# empty dict rather than a list.
params = {}

In [18]:
# Directory that holds the competition CSV files.
PATH_TO_DATA = '../../data/dota_2/'

# Target table indexed by match hash.
df_train_targets = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'train_targets.csv'),
    index_col='match_id_hash',
)

# Target vector: the `radiant_win` column as a plain array.
y = df_train_targets.loc[:, 'radiant_win'].values

In [22]:
from sklearn.model_selection import KFold, ShuffleSplit

# Five random 70/30 train/validation splits with a fixed seed.
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=17,
)

In [23]:
# ROC AUC of the baseline logistic regression on each shuffle split (all cores).
cv_scores_extended = cross_val_score(
    estimator=model,
    X=X_train_scaled,
    y=y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

In [24]:
# Report the mean AUC followed by the individual split scores.
print(
    'Extended features: mean={} scores={}'.format(
        np.mean(cv_scores_extended),
        cv_scores_extended,
    )
)

Extended features: mean=0.8428043499914086 scores=[0.84120501 0.84489773 0.84182356 0.84318956 0.84290589]


In [26]:
# Raw test feature table; only its match-hash index is used below, to label the submission rows.
df_test_features = pd.read_csv(
    filepath_or_buffer=os.path.join(PATH_TO_DATA, 'test_features.csv'),
    index_col='match_id_hash',
)

In [28]:
# Refit the baseline model on the full training set and write the predicted
# probability of the positive class for every test row.
# NOTE(review): assumes the rows of X_test_scaled are in the same order as df_test_features -- confirm.
model.fit(X_train_scaled, y)
df_submission = pd.DataFrame(
    data=model.predict_proba(X_test_scaled)[:, 1],
    index=df_test_features.index,
    columns=['radiant_win_prob'],
)
df_submission.to_csv('submission_lr0_f7.csv')

In [30]:
# Random seed used as the default `random_state` of logit_cv below.
SEED = 17

In [31]:
def logit_cv(X_heroes_train, y_train, cv=5, random_state=SEED):
    """Grid-search the regularisation strength C of a logistic regression.

    Twenty values of C, log-spaced between 1e-2 and 1e1, are evaluated by
    cross-validated ROC AUC. The per-split scores of the best C are printed
    together with their mean and standard deviation.

    Parameters
    ----------
    X_heroes_train : array-like
        Training feature matrix.
    y_train : array-like
        Training targets.
    cv : int or cross-validation splitter
        Forwarded to ``GridSearchCV`` as ``cv``.
    random_state : int
        Seed for the logistic regression estimator.

    Returns
    -------
    tuple
        ``(best_estimator, cv_scores)``: the grid search's ``best_estimator_``
        and an array with the per-split test scores of the best C.
    """
    # Local import: the notebook only imports GridSearchCV in a later cell.
    from sklearn.model_selection import GridSearchCV

    # Honour the caller's seed (previously the global SEED was always used).
    logit = LogisticRegression(random_state=random_state, solver='liblinear')

    c_values = np.logspace(-2, 1, 20)

    logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                       scoring='roc_auc', return_train_score=False, cv=cv,
                                       n_jobs=-1, verbose=0)
    logit_grid_searcher.fit(X_heroes_train, y_train)

    # Collect the test score of the winning parameter set on every split.
    best_index = logit_grid_searcher.best_index_
    cv_scores = [
        logit_grid_searcher.cv_results_[f'split{i}_test_score'][best_index]
        for i in range(logit_grid_searcher.n_splits_)
    ]
    print(f'CV scores: {cv_scores}')
    print(f'Mean: {np.mean(cv_scores)}, std: {np.std(cv_scores)}\n')

    return logit_grid_searcher.best_estimator_, np.array(cv_scores)

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
# Tune C on the scaled training data using the stratified folds defined earlier.
logit_1, cv_scores_1 = logit_cv(
    X_heroes_train=X_train_scaled,
    y_train=y,
    cv=folds,
    random_state=SEED,
)

CV scores: [0.846664484885221, 0.8342081447963801, 0.8470962708690903, 0.8481021267923615, 0.8414891560232478]
Mean: 0.8435120366732601, std: 0.005186676045059978



In [36]:
# Fit the tuned model on the full training set and write its positive-class
# probabilities for the test rows.
# NOTE(review): assumes the rows of X_test_scaled are in the same order as df_test_features -- confirm.
logit_1.fit(X_train_scaled, y)
df_submission = pd.DataFrame(
    data=logit_1.predict_proba(X_test_scaled)[:, 1],
    index=df_test_features.index,
    columns=['radiant_win_prob'],
)
df_submission.to_csv('submission_lr1_f7.csv')

In [37]:
# Export the unscaled feature tables as CSV (file names suggest they feed a CatBoost run).
X_train.to_csv(path_or_buf='X_train_for_cat.csv')
X_test.to_csv(path_or_buf='X_test_for_cat.csv')