In [47]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier, Dataset, early_stopping
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from get_metrics1 import get_metrics_classification
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import optuna
from SupFunctions import check_overfitting_classification, scale_pos_weight_calc
import warnings
from warnings import simplefilter, filterwarnings
warnings.filterwarnings("ignore")
simplefilter("ignore", category=RuntimeWarning)
filterwarnings("ignore", category=UserWarning) 
RAND = 10
N_FOLD = 5


In [2]:
df_credit = pd.read_csv('df_credit.csv')

In [67]:
df_metrics = pd.read_csv('models_metrics.csv')

In [68]:
meta_X = pd.read_csv('meta_X_baseline.csv')

In [69]:
meta_X_test = pd.read_csv('meta_X_test_baseline.csv')

In [70]:
meta_X = meta_X.set_index('index')

In [71]:
meta_X_test = meta_X_test.set_index('index')

In [11]:
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44961 entries, 0 to 44960
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   person_age                      44961 non-null  float64 
 1   person_gender                   44961 non-null  category
 2   person_education                44961 non-null  category
 3   person_income                   44961 non-null  float64 
 4   person_emp_exp                  44961 non-null  int64   
 5   person_home_ownership           44961 non-null  category
 6   loan_amnt                       44961 non-null  float64 
 7   loan_intent                     44961 non-null  category
 8   loan_int_rate                   44961 non-null  float64 
 9   loan_percent_income             44961 non-null  float64 
 10  cb_person_cred_hist_length      44961 non-null  float64 
 11  credit_score                    44961 non-null  int64   
 12  previous_loan_defa

In [9]:
df_credit = df_credit.astype({col: 'category' for col in df_credit.select_dtypes(include='object').columns})

In [12]:
X = df_credit.drop(['loan_status'], axis=1)
y = df_credit['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y, 
                                                    random_state=RAND)

In [13]:
cat_features = list(df_credit.select_dtypes(include=['category']).columns)

1. Сначала выберем оптимальное количество деревьев.

In [14]:
lgbm = LGBMClassifier(objective='binary',
                       random_state=RAND,
                       verbose=0)

lgbm.fit(X_train, y_train, eval_metric='logloss')

In [19]:
grid = {
    'n_estimators': [i for i in range(100, 800, 50)]
}

cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)

grid_cv = RandomizedSearchCV(lgbm,
                       param_distributions=grid,
                       scoring='neg_log_loss',
                       cv=cv,
                       verbose=0,
                       n_jobs=-1)
grid_cv.fit(X_train, y_train)

In [20]:
grid_cv.best_params_

{'n_estimators': 250}

2. Подберем скорость обучения.

In [41]:
def objective_lgb(trial, X, y, N_FOLDS, random_state):
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state)
    cv_predicts = np.empty(N_FOLDS)
    
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        lgb_params = {
            "random_state": random_state,
            "verbosity": -1,
            "boosting_type": "gbdt",
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
            "num_leaves": trial.suggest_int("num_leaves", 20, 1000, step=20),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_child_samples": trial.suggest_int("min_child_samples", 100, 70000, step=100),
            "reg_alpha": trial.suggest_int("reg_alpha", 0, 100),
            "reg_lambda": trial.suggest_int("reg_lambda", 0, 100),
            "min_split_gain": trial.suggest_int("min_split_gain", 0, 20),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "subsample_freq": trial.suggest_categorical("subsample_freq", [1]),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
            "random_state": trial.suggest_categorical("random_state", [random_state]),
            "is_unbalance": trial.suggest_categorical("is_unbalance", [True])
    }
        
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
        model = LGBMClassifier(verbose=-1, **lgb_params)
        
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            callbacks=[pruning_callback, early_stopping(stopping_rounds=100)]
        )
        
        preds = model.predict_proba(X_test)[:, 1]
        cv_predicts[idx] = roc_auc_score(y_test, preds)
    
    return np.mean(cv_predicts)

In [42]:
study_lgb = optuna.create_study(direction="maximize", study_name="LGB_01")
func = lambda trial: objective_lgb(
    trial, X_train, y_train, N_FOLDS=N_FOLD, random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.ERROR)
study_lgb.optimize(func, n_trials=20, show_progress_bar=True)

  0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.529764
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.529764
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.529938
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.529803
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.529803
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.529764
Training until validation scores don't improve for 100 rounds
Early st

In [43]:
study_lgb.best_params

{'n_estimators': 312,
 'learning_rate': 0.030152305383741834,
 'num_leaves': 580,
 'max_depth': 12,
 'min_child_samples': 500,
 'reg_alpha': 42,
 'reg_lambda': 53,
 'min_split_gain': 14,
 'subsample': 0.8533106326801126,
 'subsample_freq': 1,
 'colsample_bytree': 0.8000931922087815,
 'random_state': 10,
 'is_unbalance': True}

In [72]:
skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
    
    lgbm_base = LGBMClassifier(**study_lgb.best_params, objective='binary', verbose=-1)
    model = CalibratedClassifierCV(estimator=lgbm_base, method='isotonic', cv=3)

    model.fit(X_train_, y_train_, 
              eval_set=[(X_val, y_val)], 
              eval_metric='logloss'
              ) 

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'lgbm_optuna'] = model.predict_proba(X_val)[:, 1] 

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['lgbm_optuna'] = np.mean(finish_test_preds_proba, axis=0)

Fold: 1 F1-SCORE Validation 0.798
---
Fold: 2 F1-SCORE Validation 0.796
---
Fold: 3 F1-SCORE Validation 0.794
---
Fold: 4 F1-SCORE Validation 0.794
---
Fold: 5 F1-SCORE Validation 0.789
---
F1-SCORE mean oof: 0.794, std: 0.003


In [73]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.794, std: 0.003
F1-macro HOLDOUT: 0.880


In [74]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "LightGBM best_params optuna Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Catboost baseline Holdout,0.93039,0.977312,0.153096,0.889835,0.783892,0.833511
1,LightGBM baseline Holdout,0.932948,0.978504,0.149401,0.903468,0.781891,0.838294
0,LightGBM best_params optuna Holdout,0.921383,0.969897,0.177029,0.881346,0.746873,0.808557


In [75]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.967
ROC-AUC test: 0.970
delta = 0.3 %


(0.9671885224036298, 0.9699187240174276, 0.2814876696564032)

Метрики немного упали, при подобранных параметрах, но переобучение снизилось с 0,9% до 0,3%

1. Сначала выберем оптимальное количество деревьев и скорость обучения.

In [85]:
base_lgb = LGBMClassifier(objective='binary', random_state=10, verbose=-1)
cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)
calibrated_lgb = CalibratedClassifierCV(base_lgb, cv=cv)

In [87]:
param_grid = {
    'estimator__n_estimators': range(100, 500, 50),
    'estimator__learning_rate': np.logspace(-4, -1, 10),
    'method': ['isotonic']
}

grid_cv = RandomizedSearchCV(
    calibrated_lgb,
    param_distributions=param_grid,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=-1,
    verbose=0
)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_cv.fit(X_train, y_train)

In [88]:
grid_cv.best_params_

{'method': 'isotonic',
 'estimator__n_estimators': 450,
 'estimator__learning_rate': 0.1}

3. Подберем остальные параметры.

In [91]:
param_grid = {
    'estimator__n_estimators': [450],
    'estimator__learning_rate': [0.1], 
    'estimator__max_depth': range(3, 12), 
    'estimator__num_leaves': range(5, 150, 5), 
    'estimator__min_child_samples': range(5, 100, 5), 
    'estimator__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
    'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'estimator__reg_alpha': np.logspace(-3, 1, 5),
    'estimator__reg_lambda': np.logspace(-3, 1, 5)
}

cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=RAND)

calibrated_lgb = CalibratedClassifierCV(base_lgb, method='isotonic')

grid_cv = RandomizedSearchCV(
    calibrated_lgb,
    param_distributions=param_grid,
    scoring='neg_log_loss',
    cv=cv,
    verbose=0,
    n_jobs=-1
)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_cv.fit(X_train, y_train)

In [92]:
grid_cv.best_params_

{'estimator__subsample': 0.9,
 'estimator__reg_lambda': 0.001,
 'estimator__reg_alpha': 1.0,
 'estimator__num_leaves': 50,
 'estimator__n_estimators': 450,
 'estimator__min_child_samples': 60,
 'estimator__max_depth': 5,
 'estimator__learning_rate': 0.1,
 'estimator__colsample_bytree': 0.6}

In [94]:
skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []

for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    # Разделение на train/val внутри обучающей выборки
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Создаем и обучаем модель
    lgbm = LGBMClassifier(**grid_cv.best_params_,
                          objective='binary',
                          verbosity=-1)    
    model = CalibratedClassifierCV(estimator=lgbm, method='isotonic', cv=skf)

    model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], eval_metric='logloss')
    
    # Предсказания на валидации
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))
    
    print(f"Fold: {fold + 1} F1-SCORE Validation {f1_score(y_val, preds_val):.3f}")
    print("---")
    
    # Предсказания на тесте
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]
    
    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)
    
    meta_X.loc[X_val.index, 'lgbm_RS'] = model.predict_proba(X_val)[:, 1]

print(f'F1-SCORE mean oof: {np.mean(score_oof):.3f}, std: {np.std(score_oof):.3f}')
meta_X_test['lgbm_RS'] = np.mean(finish_test_preds_proba, axis=0) 

Fold: 1 F1-SCORE Validation 0.838
---
Fold: 2 F1-SCORE Validation 0.834
---
Fold: 3 F1-SCORE Validation 0.835
---
Fold: 4 F1-SCORE Validation 0.828
---
Fold: 5 F1-SCORE Validation 0.837
---
F1-SCORE mean oof: 0.834, std: 0.003


In [95]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.834, std: 0.003
F1-macro HOLDOUT: 0.900


In [96]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "LightGBM best_params RS Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Catboost baseline Holdout,0.93039,0.977312,0.153096,0.889835,0.783892,0.833511
1,LightGBM baseline Holdout,0.932948,0.978504,0.149401,0.903468,0.781891,0.838294
0,LightGBM best_params optuna Holdout,0.921383,0.969897,0.177029,0.881346,0.746873,0.808557
0,LightGBM best_params RS Holdout,0.934282,0.979919,0.144769,0.906936,0.784892,0.841512


In [98]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.987
ROC-AUC test: 0.980
delta = 0.8 %


(0.9871923149655766, 0.9797189486936777, 0.7628071583042865)

## Catboost Random Search

1. Сначала выберем оптимальное количество деревьев.

In [101]:
grid = {
    "n_estimators": [i for i in range(100, 1500, 50)]
}
clf_grid = CatBoostClassifier(allow_writing_files=False,
                            random_state=RAND,
                            eval_metric="AUC",
                            cat_features=cat_features,
                            scale_pos_weight=scale_pos_weight_calc(y_train),
                            verbose=0)
grid_search_result = clf_grid.randomized_search(grid,
                                             X=X_train,
                                             y=y_train,
                                             plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.9775653251
bestIteration = 1417

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.9775653	best: 0.9775653 (0)	total: 29.9s	remaining: 4m 28s

bestTest = 0.9698189515
bestIteration = 149

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.9698190	best: 0.9775653 (0)	total: 32.3s	remaining: 2m 9s

bestTest = 0.9707248734
bestIteration = 199

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.9707249	best: 0.9775653 (0)	total: 36.6s	remaining: 1m 25s

bestTest = 0.9738579428
bestIteration = 449

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
3:	loss: 0.9738579	best: 0.9775653 (0)	total: 45.4s	remaining: 1m 8s



In [102]:
grid_search_result['params']

{'iterations': 1450}

2. Затем под это количество выберем скорость обучения.

In [103]:
grid = {
    "n_estimators": [1450],
     "learning_rate": np.logspace(-4, -1, 4)
}
clf_grid = CatBoostClassifier(allow_writing_files=False,
                            random_state=RAND,
                            eval_metric="AUC",
                            cat_features=cat_features,
                            scale_pos_weight=scale_pos_weight_calc(y_train),
                            verbose=0)
grid_search_result = clf_grid.randomized_search(grid,
                                             X=X_train,
                                             y=y_train,
                                             plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.9579734486
bestIteration = 1449

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.9579734	best: 0.9579734 (0)	total: 29.4s	remaining: 1m 28s

bestTest = 0.9665378587
bestIteration = 1449

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.9665379	best: 0.9665379 (1)	total: 59.5s	remaining: 59.5s

bestTest = 0.9743990722
bestIteration = 1449

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.9743991	best: 0.9743991 (2)	total: 1m 28s	remaining: 29.4s

bestTest = 0.9778861748
bestIteration = 1129

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
3:	loss: 0.9778862	best: 0.9778862 (3)	total: 1m 58s	remaining: 0us

In [104]:
grid_search_result['params']

{'learning_rate': 0.1, 'iterations': 1450}

3. Подберем остальные параметры.

In [105]:
grid = {
     "n_estimators": [1450],
     "learning_rate": [0.1],
     "boosting_type" : ['Ordered', 'Plain'],
     "max_depth": list(range(3, 15)),
     "l2_leaf_reg": np.logspace(-7, 2, 7),
     "bootstrap_type": ["Bayesian", "Bernoulli", "MVS", "No"],
     'border_count': [128, 254],
     'grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"]
}
clf_grid = CatBoostClassifier(allow_writing_files=False,
                            random_state=RAND,
                            eval_metric="AUC",
                            cat_features=cat_features,
                            scale_pos_weight=scale_pos_weight_calc(y_train),
                            verbose=0)

grid_search_result = clf_grid.randomized_search(grid,
                                                X=X_train,
                                                y=y_train,
                                                plot=False)



bestTest = 0.9734352774
bestIteration = 1434

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.9734353	best: 0.9734353 (0)	total: 36.4s	remaining: 5m 27s

bestTest = 0.9703680088
bestIteration = 812

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.9703680	best: 0.9734353 (0)	total: 1m 26s	remaining: 5m 45s

bestTest = 0.9757667322
bestIteration = 1036

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.9757667	best: 0.9757667 (2)	total: 2m 48s	remaining: 6m 32s

bestTest = 0.9761473802
bestIteration = 1142

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
3:	loss: 0.9761474	best: 0.9761474 (3)	total: 14m 23s	remaining: 

In [106]:
best_params = grid_search_result['params']

In [107]:
best_params

{'border_count': 254,
 'bootstrap_type': 'No',
 'boosting_type': 'Plain',
 'iterations': 1450,
 'l2_leaf_reg': 100,
 'grow_policy': 'Depthwise',
 'depth': 8,
 'learning_rate': 0.1}

4. Обучим Catboost с подобранными параметрами на HoldOut:

In [110]:
skf = StratifiedKFold(n_splits=N_FOLD, random_state=RAND, shuffle=True)
score_oof = []
finish_test_preds = []
finish_test_preds_proba = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

    model = CatBoostClassifier(**best_params, cat_features=cat_features, random_state=RAND, verbose=0) 

    model.fit(X_train_, y_train_, eval_set=(X_val, y_val))

    # oof
    preds_val = model.predict(X_val)
    score_oof.append(f1_score(y_val, preds_val))

    print("Fold:", fold + 1, "F1-SCORE Validation %.3f" % f1_score(y_val, preds_val))
    print("---")

    # holdout
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)[:, 1]

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    meta_X.loc[X_val.index, 'catboost_RS'] = model.predict_proba(X_val)[:, 1]  

print('F1-SCORE mean oof: %.3f, std: %.3f' % (np.mean(score_oof), np.std(score_oof)))

meta_X_test['catboost_RS'] = np.mean(finish_test_preds_proba, axis=0) 

Fold: 1 F1-SCORE Validation 0.846
---
Fold: 2 F1-SCORE Validation 0.846
---
Fold: 3 F1-SCORE Validation 0.846
---
Fold: 4 F1-SCORE Validation 0.836
---
Fold: 5 F1-SCORE Validation 0.845
---
F1-SCORE mean oof: 0.844, std: 0.004


5. Получим **предсказанные значения на Holdout** данных при помощи усреднения значений (для вероятностй) и моды (для меток классов), полученных при обучении на фолдах и выведем основные метрики.

In [111]:
# Находим моду по классам
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Находим среднее по вероятностям
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

# Вычисляем F1-макро для валидации (OOF)
print('F1-macro mean OOF: %.3f, std: %.3f' %
      (np.mean(score_oof), np.std(score_oof)))

# Вычисляем F1-макро для тестового набора
f1_test = f1_score(y_test, test_pred, average='macro')
print('F1-macro HOLDOUT: %.3f' % f1_test)

F1-macro mean OOF: 0.844, std: 0.004
F1-macro HOLDOUT: 0.909


In [112]:
df_metrics = pd.concat([df_metrics, get_metrics_classification(y_test, 
                                        test_pred, 
                                        test_pred_proba,
                                        "Catboost RS Holdout")], axis=0)
df_metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Logloss,Precision,Recall,f1
0,Catboost baseline Holdout,0.93039,0.977312,0.153096,0.889835,0.783892,0.833511
1,LightGBM baseline Holdout,0.932948,0.978504,0.149401,0.903468,0.781891,0.838294
0,LightGBM best_params optuna Holdout,0.921383,0.969897,0.177029,0.881346,0.746873,0.808557
0,LightGBM best_params RS Holdout,0.934282,0.979919,0.144769,0.906936,0.784892,0.841512
0,Catboost RS Holdout,0.939175,0.980601,0.142082,0.902886,0.813907,0.856091


6. Проверим переобучение.

In [113]:
check_overfitting_classification(model,
                  X_train,
                  y_train,
                  X_test,
                  y_test)

ROC-AUC train: 0.990
ROC-AUC test: 0.979
delta = 1.1 %


(0.9897135933904319, 0.9789214023654664, 1.1024573575454748)

In [117]:
meta_X['index'] = meta_X.index

In [120]:
meta_X_test['index'] = meta_X_test.index

In [121]:
meta_X.to_csv('meta_X_final.csv', index=False)

In [122]:
meta_X_test.to_csv('meta_X_test_final.csv', index=False)

In [123]:
df_metrics.to_csv('models_metrics_final.csv', index=False)