In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier, Dataset
from catboost import CatBoostClassifier
from get_metrics1 import get_metrics_classification
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import optuna
import shap
from sklearn.inspection import permutation_importance
from SupFunctions import barplot_group, barplot_balance, check_overfitting_classification, scale_pos_weight_calc
import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)

RAND = 10
N_FOLD = 5

In [2]:
df_dota = pd.read_csv('df_dota.csv')

In [3]:
df_metrics = pd.read_csv('models_metrics.csv')

In [4]:
meta_X = pd.read_csv('meta_X.csv')

In [5]:
meta_X_test = pd.read_csv('meta_X_test.csv')

In [6]:
df_dota.head()

Unnamed: 0,match_id,radiant_win,duration,Support,Initiator,Pusher,Escape,Nuker,Carry,Durable,Disabler,early_game_gold_adv,early_game_xp_adv,radiant_bottom_tier_1,radiant_middle_tier_1,radiant_top_tier_1,dire_bottom_tier_1,dire_middle_tier_1,dire_top_tier_1
0,7967435715,1,35,3,2,1,3,4,3,2,5,3844,1721,0,0,0,0,0,0
1,7967438032,1,41,2,4,2,1,5,3,3,5,1238,484,0,0,0,0,0,0
2,7967459521,0,28,2,2,2,3,2,4,1,3,-380,29,0,0,1,1,0,0
3,7967484383,0,26,0,3,1,4,4,1,2,3,-2797,-2048,0,0,0,0,0,0
4,7967484981,0,27,2,3,2,2,5,3,4,5,-1454,-2202,0,0,0,0,1,0


In [7]:
df_dota.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10516 entries, 0 to 10515
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   match_id               10516 non-null  int64
 1   radiant_win            10516 non-null  int64
 2   duration               10516 non-null  int64
 3   Support                10516 non-null  int64
 4   Initiator              10516 non-null  int64
 5   Pusher                 10516 non-null  int64
 6   Escape                 10516 non-null  int64
 7   Nuker                  10516 non-null  int64
 8   Carry                  10516 non-null  int64
 9   Durable                10516 non-null  int64
 10  Disabler               10516 non-null  int64
 11  early_game_gold_adv    10516 non-null  int64
 12  early_game_xp_adv      10516 non-null  int64
 13  radiant_bottom_tier_1  10516 non-null  int64
 14  radiant_middle_tier_1  10516 non-null  int64
 15  radiant_top_tier_1     10516 non-nul

In [8]:
meta_X.head()

Unnamed: 0,rf_01,catboost_01,lgbm_01,index
0,0.94,0.960736,0.974764,6309
1,0.2,0.283706,0.15688,218
2,0.63,0.61616,0.588529,3270
3,0.31,0.360419,0.370514,2101
4,0.56,0.506338,0.427501,2918


In [9]:
meta_X = meta_X.set_index('index')

In [10]:
meta_X_test.head()

Unnamed: 0,rf_01,catboost_01,lgbm_01,index
0,0.316,0.397284,0.380385,2004
1,0.044,0.065483,0.048569,5616
2,0.214,0.338947,0.179918,2635
3,0.02,0.039754,0.02375,7907
4,0.946,0.947815,0.994234,1667


In [11]:
meta_X_test = meta_X_test.set_index('index')

In [12]:
X = df_dota.drop(['radiant_win', 'match_id'], axis=1)
y = df_dota['radiant_win']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y, 
                                                    random_state=RAND)

# Tuning

Теперь построим модели с подобранными параметрами. Для этого, для каждой модели baseline подберём параметры при помощи RandomizedSearchCV, а затем обучим на подобранных параметрах.

## Catboost Random Search

In [13]:
cat_features = X_test.select_dtypes('category').columns.tolist()

1. Сначала выберем оптимальное количество деревьев.

In [14]:
grid = {
    "n_estimators": [i for i in range(100, 3101, 500)]
}
clf_grid = CatBoostClassifier(allow_writing_files=False,
                            random_state=RAND,
                            eval_metric="AUC",
                            cat_features=cat_features,
                            scale_pos_weight=scale_pos_weight_calc(y_train),
                            verbose=0)
grid_search_result = clf_grid.randomized_search(grid,
                                             X=X_train,
                                             y=y_train,
                                             plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.8277415
bestIteration = 99

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.8277415	best: 0.8277415 (0)	total: 316ms	remaining: 1.89s

bestTest = 0.8280663055
bestIteration = 103

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.8280663	best: 0.8280663 (1)	total: 1.82s	remaining: 4.55s

bestTest = 0.8280663055
bestIteration = 103

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.8280663	best: 0.8280663 (1)	total: 4.48s	remaining: 5.97s

bestTest = 0.8280663055
bestIteration = 103

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
3:	loss: 0.8280663	best: 0.8280663 (1)	total: 8.63s	remaining: 6.47s

bestTes

In [15]:
grid_search_result['params']

{'iterations': 600}

2. Затем под это количество выберем скорость обучения.

In [16]:
grid = {
    "n_estimators": [600],
     "learning_rate": np.logspace(-4, -1, 4)
}
clf_grid = CatBoostClassifier(allow_writing_files=False,
                            random_state=RAND,
                            eval_metric="AUC",
                            cat_features=cat_features,
                            scale_pos_weight=scale_pos_weight_calc(y_train),
                            verbose=0)
grid_search_result = clf_grid.randomized_search(grid,
                                             X=X_train,
                                             y=y_train,
                                             plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.8235966988
bestIteration = 118

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.8235967	best: 0.8235967 (0)	total: 1.44s	remaining: 4.3s

bestTest = 0.8245188641
bestIteration = 599

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.8245189	best: 0.8245189 (1)	total: 2.83s	remaining: 2.83s

bestTest = 0.8279123759
bestIteration = 586

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.8279124	best: 0.8279124 (2)	total: 4.28s	remaining: 1.43s

bestTest = 0.8255314101
bestIteration = 32

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
3:	loss: 0.8255314	best: 0.8279124 (2)	total: 5.91s	remaining: 0us
Estimati

In [17]:
grid_search_result['params']

{'learning_rate': 0.01, 'iterations': 600}

3. Подберем остальные параметры.

In [18]:
grid = {
     "n_estimators": [600],
     "learning_rate": [0.01],
     "boosting_type" : ['Ordered', 'Plain'],
     "max_depth": list(range(3, 15)),
     "l2_leaf_reg": np.logspace(-7, 2, 7),
     "bootstrap_type": ["Bayesian", "Bernoulli", "MVS", "No"],
     'border_count': [128, 254],
     'grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"]
}
clf_grid = CatBoostClassifier(allow_writing_files=False,
                            random_state=RAND,
                            eval_metric="AUC",
                            cat_features=cat_features,
                            scale_pos_weight=scale_pos_weight_calc(y_train),
                            verbose=0)

grid_search_result = clf_grid.randomized_search(grid,
                                                X=X_train,
                                                y=y_train,
                                                plot=False)



bestTest = 0.8262826994
bestIteration = 454

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.8262827	best: 0.8262827 (0)	total: 3.08s	remaining: 27.7s

bestTest = 0.8277683317
bestIteration = 134

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.8277683	best: 0.8277683 (1)	total: 5.33s	remaining: 21.3s

bestTest = 0.8272938332
bestIteration = 269

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.8272938	best: 0.8277683 (1)	total: 7.78s	remaining: 18.2s

bestTest = 0.8285012625
bestIteration = 271

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
3:	loss: 0.8285013	best: 0.8285013 (3)	total: 40.3s	remaining: 1m

bestTe

In [19]:
best_params = grid_search_result['params']

4. Обучим Catboost с подобранными параметрами на HoldOut:

In [20]:
skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = CatBoostClassifier(**best_params, random_state=RAND, verbose=0) 

    model.fit(X_train_, y_train_, eval_set=(X_val, y_val))

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'catboost_best_01'] = model.predict_proba(X_val)[:, 1]  

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['catboost_best_01'] = np.mean(finish_test_preds_proba, axis=0) 

Fold: 1 F1-SCORE Validation 0.735
---
Fold: 2 F1-SCORE Validation 0.726
---
Fold: 3 F1-SCORE Validation 0.717
---
Fold: 4 F1-SCORE Validation 0.721
---
Fold: 5 F1-SCORE Validation 0.735
---
F1-SCORE mean oof: 0.727, std: 0.007


5. Получим **предсказанные значения на Holdout** данных при помощи усреднения значений (для вероятностй) и моды (для меток классов), полученных при обучении на фолдах и выведем основные метрики.

In [21]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.727, std: 0.007
F1-macro HOLDOUT: 0.716


In [22]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "Catboost best Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Random Forest baseline Holdout,0.723859,0.819073,0.499654,0.739712,0.686724,0.712234
1,Catboost baseline Holdout,0.721008,0.823945,0.494296,0.737603,0.681948,0.708685
2,LightGBM baseline Holdout,0.712928,0.820036,0.497216,0.72579,0.680038,0.70217
0,Catboost best Holdout,0.716255,0.824472,0.492006,0.732919,0.676218,0.703428


6. Проверим переобучение.

In [23]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.864
ROC-AUC test: 0.825
delta = 4.7 %


(0.8638906844657244, 0.8250630941763599, 4.706014674929202)

Не смотря на то, что метрики слегка упали по сравнению с baseline, переобучение уменьшилось на 1,3%.

## Random forest Random Search

In [24]:
rf = RandomForestClassifier(random_state=RAND)
rf.fit(X_train, y_train)

1. Сначала выберем оптимальное количество деревьев.

In [25]:
grid = {
    'n_estimators': [i for i in range(100, 3101, 500)]
}

cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)

grid_cv = RandomizedSearchCV(rf,
                       param_distributions=grid,
                       scoring='roc_auc',
                       cv=cv,
                       verbose=0,
                       n_jobs=-1)
grid_cv.fit(X_train, y_train)

In [26]:
grid_cv.best_params_

{'n_estimators': 3100}

2. Подберем остальные параметры.

In [27]:
grid = {
    'n_estimators': [3100],
    'max_depth': [None] + list(range(10, 21, 5)),  
    'min_samples_split': [2, 5, 8,10],            
    'min_samples_leaf': [4, 5, 8, 10],       
    'max_features': ['sqrt', 'log2']
}

cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)

grid_cv = RandomizedSearchCV(rf,
                       param_distributions=grid,
                       scoring='roc_auc',
                       cv=cv,
                       verbose=0,
                       n_jobs=-1)
grid_cv.fit(X_train, y_train)

In [28]:
best_params = grid_cv.best_params_

3. Обучим RandomForest с подобранными параметрами на HoldOut:

In [29]:
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []

skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = RandomForestClassifier(**best_params, random_state=RAND) 

    model.fit(X_train_, y_train_)

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'rf_best'] = model.predict_proba(X_val)[:, 1] 

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['rf_best'] = np.mean(finish_test_preds_proba, axis=0)

Fold: 1 F1-SCORE Validation 0.722
---
Fold: 2 F1-SCORE Validation 0.716
---
Fold: 3 F1-SCORE Validation 0.716
---
Fold: 4 F1-SCORE Validation 0.720
---
Fold: 5 F1-SCORE Validation 0.731
---
F1-SCORE mean oof: 0.721, std: 0.005


4. Получим **предсказанные значения на Holdout** данных при помощи усреднения значений (для вероятностй) и моды (для меток классов), полученных при обучении на фолдах и выведем основные метрики.

In [30]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.721, std: 0.005
F1-macro HOLDOUT: 0.718


In [31]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "Catboost best Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Random Forest baseline Holdout,0.723859,0.819073,0.499654,0.739712,0.686724,0.712234
1,Catboost baseline Holdout,0.721008,0.823945,0.494296,0.737603,0.681948,0.708685
2,LightGBM baseline Holdout,0.712928,0.820036,0.497216,0.72579,0.680038,0.70217
0,Catboost best Holdout,0.716255,0.824472,0.492006,0.732919,0.676218,0.703428
0,Catboost best Holdout,0.718156,0.821629,0.498345,0.732582,0.682904,0.706871


In [32]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.897
ROC-AUC test: 0.821
delta = 9.3 %


(0.8972430586077222, 0.820987838388548, 9.288227748762944)

## LightGBM Random Search

In [33]:
lgbm = LGBMClassifier(objective='binary',
                       random_state=RAND,
                       verbose=0)

lgbm.fit(X_train, y_train, eval_metric='logloss')

1. Сначала выберем оптимальное количество деревьев.

In [34]:
grid = {
    'n_estimators': [i for i in range(100, 3101, 500)]
}

cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)

grid_cv = RandomizedSearchCV(lgbm,
                       param_distributions=grid,
                       scoring='neg_log_loss',
                       cv=cv,
                       verbose=0,
                       n_jobs=-1)
grid_cv.fit(X_train, y_train)

In [35]:
grid_cv.best_params_

{'n_estimators': 100}

2. Подберем скорость обучения.

In [36]:
grid = {
    'n_estimators': [100],
    "learning_rate": np.logspace(-4, -1, 10)
}

cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)

grid_cv = RandomizedSearchCV(lgbm,
                       param_distributions=grid,
                       scoring='neg_log_loss',
                       cv=cv,
                       verbose=0,
                       n_jobs=-1)
grid_cv.fit(X_train, y_train)

In [37]:
grid_cv.best_params_

{'n_estimators': 100, 'learning_rate': 0.046415888336127774}

3. Подберем остальные параметры.

In [38]:
grid = {
    'n_estimators': [100],
    'learning_rate': [0.046], 
    'max_depth': range(3, 12), 
    'num_leaves': range(5, 150, 5), 
    'min_child_samples': range(5, 100, 5), 
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1]
}

cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)

grid_cv = RandomizedSearchCV(lgbm,
                       param_distributions=grid,
                       scoring='neg_log_loss',
                       cv=cv,
                       verbose=0,
                       n_jobs=-1)
grid_cv.fit(X_train, y_train)



In [39]:
grid_cv.best_params_

{'subsample': 0.8,
 'num_leaves': 70,
 'n_estimators': 100,
 'min_child_samples': 60,
 'max_depth': 7,
 'learning_rate': 0.046,
 'colsample_bytree': 0.9}

In [40]:
best_params = grid_cv.best_params_

4. Обучим LightGBM с подобранными параметрами на HoldOut:

In [41]:
skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = LGBMClassifier(**best_params, objective='binary',
                           random_state=RAND,
                           verbose=0)

    model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], eval_metric='logloss') 

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'lgbm_best'] = model.predict_proba(X_val)[:, 1] 

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['lgbm_best'] = np.mean(finish_test_preds_proba, axis=0)

Fold: 1 F1-SCORE Validation 0.724
---
Fold: 2 F1-SCORE Validation 0.722
---
Fold: 3 F1-SCORE Validation 0.703
---
Fold: 4 F1-SCORE Validation 0.723
---
Fold: 5 F1-SCORE Validation 0.728
---
F1-SCORE mean oof: 0.720, std: 0.009


5. Получим **предсказанные значения на Holdout** данных при помощи усреднения значений (для вероятностй) и моды (для меток классов), полученных при обучении на фолдах и выведем основные метрики.

In [42]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.720, std: 0.009
F1-macro HOLDOUT: 0.714


In [43]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "LightGBM best Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Random Forest baseline Holdout,0.723859,0.819073,0.499654,0.739712,0.686724,0.712234
1,Catboost baseline Holdout,0.721008,0.823945,0.494296,0.737603,0.681948,0.708685
2,LightGBM baseline Holdout,0.712928,0.820036,0.497216,0.72579,0.680038,0.70217
0,Catboost best Holdout,0.716255,0.824472,0.492006,0.732919,0.676218,0.703428
0,Catboost best Holdout,0.718156,0.821629,0.498345,0.732582,0.682904,0.706871
0,LightGBM best Holdout,0.714829,0.821686,0.49406,0.729231,0.679083,0.703264


6. Проверим на переобучение

In [44]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.866
ROC-AUC test: 0.816
delta = 6.1 %


(0.8656171610352132, 0.8162082229806475, 6.0534722223372155)

In [51]:
meta_X_test

Unnamed: 0_level_0,rf_01,catboost_01,lgbm_01,catboost_best_01,rf_best,lgbm_best,index
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004,0.316,0.397284,0.380385,0.402477,0.378933,0.382387,2004
5616,0.044,0.065483,0.048569,0.061619,0.051262,0.056927,5616
2635,0.214,0.338947,0.179918,0.321341,0.312210,0.334825,2635
7907,0.020,0.039754,0.023750,0.030149,0.039381,0.039585,7907
1667,0.946,0.947815,0.994234,0.968427,0.919239,0.984545,1667
...,...,...,...,...,...,...,...
8801,0.434,0.393050,0.436554,0.389904,0.437460,0.439529,8801
9389,0.454,0.459614,0.451510,0.462763,0.486185,0.467125,9389
4600,0.102,0.078093,0.073371,0.076879,0.080716,0.074021,4600
7297,0.072,0.094033,0.118452,0.069409,0.104479,0.122315,7297


In [46]:
meta_X['index'] = meta_X.index

In [47]:
meta_X_test['index'] = meta_X_test.index

In [48]:
meta_X.to_csv('meta_X_final.csv', index=False)

In [49]:
meta_X_test.to_csv('meta_X_test_final.csv', index=False)

In [50]:
df_metrics.to_csv('models_metrics_final.csv', index=False)