In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier

from get_metrics1 import get_metrics_classification
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from SupFunctions import check_overfitting_classification
import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)

RAND = 10
N_FOLD = 5


In [2]:
df_dota = pd.read_csv('df_dota.csv')

In [3]:
df_dota.head()

Unnamed: 0,match_id,radiant_win,duration,Support,Initiator,Pusher,Escape,Nuker,Carry,Durable,Disabler,early_game_gold_adv,early_game_xp_adv,radiant_bottom_tier_1,radiant_middle_tier_1,radiant_top_tier_1,dire_bottom_tier_1,dire_middle_tier_1,dire_top_tier_1
0,7967435715,1,35,3,2,1,3,4,3,2,5,3844,1721,0,0,0,0,0,0
1,7967438032,1,41,2,4,2,1,5,3,3,5,1238,484,0,0,0,0,0,0
2,7967459521,0,28,2,2,2,3,2,4,1,3,-380,29,0,0,1,1,0,0
3,7967484383,0,26,0,3,1,4,4,1,2,3,-2797,-2048,0,0,0,0,0,0
4,7967484981,0,27,2,3,2,2,5,3,4,5,-1454,-2202,0,0,0,0,1,0


In [4]:
df_dota.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10516 entries, 0 to 10515
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   match_id               10516 non-null  int64
 1   radiant_win            10516 non-null  int64
 2   duration               10516 non-null  int64
 3   Support                10516 non-null  int64
 4   Initiator              10516 non-null  int64
 5   Pusher                 10516 non-null  int64
 6   Escape                 10516 non-null  int64
 7   Nuker                  10516 non-null  int64
 8   Carry                  10516 non-null  int64
 9   Durable                10516 non-null  int64
 10  Disabler               10516 non-null  int64
 11  early_game_gold_adv    10516 non-null  int64
 12  early_game_xp_adv      10516 non-null  int64
 13  radiant_bottom_tier_1  10516 non-null  int64
 14  radiant_middle_tier_1  10516 non-null  int64
 15  radiant_top_tier_1     10516 non-nul

# Split data

Как было отмечено в EDA анализе, дизбаланс классов отсутствует.

In [5]:
X = df_dota.drop(['radiant_win', 'match_id'], axis=1)
y = df_dota['radiant_win']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y, 
                                                    random_state=RAND)

In [6]:
meta_X = pd.DataFrame(index=X_train.index)  # Создаем DataFrame с теми же индексами, что и X_train
meta_X_test = pd.DataFrame(index=X_test.index)

Для исследования были выбраны модели LightGBM, Catboost и RandomForest, подбор параметров будет реализован для моделей LightGBM, Catboost так как они лучше всего справляются с категориальными данными. RandomForestClassifier был выбран в качестве доп модели для стэкинга.

## Random Forest Baseline

1. Обучим baseline Random Forest на HoldOut:

In [7]:
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []

skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = RandomForestClassifier(random_state=RAND) 

    model.fit(X_train_, y_train_)

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'rf_01'] = model.predict_proba(X_val)[:, 1] 

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['rf_01'] = np.mean(finish_test_preds_proba, axis=0)

Fold: 1 F1-SCORE Validation 0.715
---
Fold: 2 F1-SCORE Validation 0.700
---
Fold: 3 F1-SCORE Validation 0.707
---
Fold: 4 F1-SCORE Validation 0.712
---
Fold: 5 F1-SCORE Validation 0.715
---
F1-SCORE mean oof: 0.710, std: 0.006


2. Получим **предсказанные значения на Holdout** данных при помощи усреднения значений (для вероятностй) и моды (для меток классов), полученных при обучении на фолдах и выведем основные метрики.

In [8]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.710, std: 0.006
F1-macro HOLDOUT: 0.723


In [9]:
df_metrics = get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "Random Forest baseline Holdout")
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Random Forest baseline Holdout,0.723859,0.819073,0.499654,0.739712,0.686724,0.712234


In [10]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.989
ROC-AUC test: 0.811
delta = 21.9 %


(0.9887220941686229, 0.8113296628923112, 21.864408438355984)

## CatBoost Classifier Baseline

1. Обучим baseline Catboost на HoldOut:

In [11]:
skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = CatBoostClassifier(random_state=RAND, verbose=0) 

    model.fit(X_train_, y_train_, eval_set=(X_val, y_val), early_stopping_rounds=10)

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'catboost_01'] = model.predict_proba(X_val)[:, 1]  

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['catboost_01'] = np.mean(finish_test_preds_proba, axis=0) 


Fold: 1 F1-SCORE Validation 0.733
---
Fold: 2 F1-SCORE Validation 0.725
---
Fold: 3 F1-SCORE Validation 0.714
---
Fold: 4 F1-SCORE Validation 0.718
---
Fold: 5 F1-SCORE Validation 0.730
---
F1-SCORE mean oof: 0.724, std: 0.007


2. Получим **предсказанные значения на Holdout** данных при помощи усреднения значений (для вероятностй) и моды (для меток классов), полученных при обучении на фолдах и выведем основные метрики.

In [12]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.724, std: 0.007
F1-macro HOLDOUT: 0.721


In [13]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "Catboost baseline Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Random Forest baseline Holdout,0.723859,0.819073,0.499654,0.739712,0.686724,0.712234
0,Catboost baseline Holdout,0.721008,0.823945,0.494296,0.737603,0.681948,0.708685


In [14]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.858
ROC-AUC test: 0.824
delta = 4.2 %


(0.8584798538163347, 0.8236661217932211, 4.2266800954882)

## LGBM Classifier Baseline

In [15]:
skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = LGBMClassifier(objective='binary',
                           random_state=RAND,
                           verbose=0)

    model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], eval_metric='logloss') 

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'lgbm_01'] = model.predict_proba(X_val)[:, 1] 

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['lgbm_01'] = np.mean(finish_test_preds_proba, axis=0)

Fold: 1 F1-SCORE Validation 0.721
---
Fold: 2 F1-SCORE Validation 0.703
---
Fold: 3 F1-SCORE Validation 0.707
---
Fold: 4 F1-SCORE Validation 0.714
---
Fold: 5 F1-SCORE Validation 0.718
---
F1-SCORE mean oof: 0.713, std: 0.007


In [16]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.713, std: 0.007
F1-macro HOLDOUT: 0.713


In [17]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "LightGBM baseline Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Random Forest baseline Holdout,0.723859,0.819073,0.499654,0.739712,0.686724,0.712234
0,Catboost baseline Holdout,0.721008,0.823945,0.494296,0.737603,0.681948,0.708685
0,LightGBM baseline Holdout,0.712928,0.820036,0.497216,0.72579,0.680038,0.70217


In [18]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.915
ROC-AUC test: 0.817
delta = 12.0 %


(0.9152861986170211, 0.8173020360917663, 11.988733442266032)

In [19]:
meta_X

Unnamed: 0,rf_01,catboost_01,lgbm_01
6309,0.94,0.960736,0.974764
218,0.20,0.283706,0.156880
3270,0.63,0.616160,0.588529
2101,0.31,0.360419,0.370514
2918,0.56,0.506338,0.427501
...,...,...,...
1323,0.42,0.564079,0.490108
6055,0.89,0.777810,0.904476
1898,0.63,0.528694,0.650589
7595,0.67,0.647198,0.543239


In [20]:
meta_X_test

Unnamed: 0,rf_01,catboost_01,lgbm_01
2004,0.316,0.397284,0.380385
5616,0.044,0.065483,0.048569
2635,0.214,0.338947,0.179918
7907,0.020,0.039754,0.023750
1667,0.946,0.947815,0.994234
...,...,...,...
8801,0.434,0.393050,0.436554
9389,0.454,0.459614,0.451510
4600,0.102,0.078093,0.073371
7297,0.072,0.094033,0.118452


In [21]:
meta_X['index'] = meta_X.index

In [22]:
meta_X_test['index'] = meta_X_test.index

In [23]:
meta_X.to_csv('meta_X.csv', index=False)

In [24]:
meta_X_test.to_csv('meta_X_test.csv', index=False)

In [25]:
df_metrics.to_csv('models_metrics.csv', index=False)

In [26]:
meta_X

Unnamed: 0,rf_01,catboost_01,lgbm_01,index
6309,0.94,0.960736,0.974764,6309
218,0.20,0.283706,0.156880,218
3270,0.63,0.616160,0.588529,3270
2101,0.31,0.360419,0.370514,2101
2918,0.56,0.506338,0.427501,2918
...,...,...,...,...
1323,0.42,0.564079,0.490108,1323
6055,0.89,0.777810,0.904476,6055
1898,0.63,0.528694,0.650589,1898
7595,0.67,0.647198,0.543239,7595
