In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import time

import catboost as cb
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# funсtion

In [2]:
def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    if categorical != None:
        X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [3]:
# CatBoost
def catboost_base(x, x_val, y, y_val, cat_feature = None):
    cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.001,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": False,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
    }
    if cat_feature == None:
        model = cb.CatBoostClassifier(**cb_params)
        model.fit(x, y,
            )
    else:
        model = cb.CatBoostClassifier(**cb_params)
        model.fit(x, y,cat_features=cat_feature
            )
    y_pred = model.predict(x_val)
    print("roc_auc_score: ",roc_auc_score(y_val, y_pred))
    print("confusion_matrix: ",confusion_matrix(y_val, y_pred))
    print("accuracy_score: ",accuracy_score(y_val, y_pred))
    print("classification_report: ", classification_report(y_val, y_pred))
    return y_pred

In [4]:
def train_split(x):
    x_train, x_valid, y_train, y_valid = train_test_split(x.drop( "target", axis=1), x["target"], test_size = 0.2)
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.2)
    print(x_train.shape, x_valid.shape, y_train.shape, y_valid.shape, x_test.shape, y_test.shape)
    return x_train, x_valid, y_train, y_valid, x_test, y_test

In [5]:
def train_split_dis(x):
    for i in range(10000000):
        x_train, x_valid, y_train, y_valid = train_test_split(x.drop( "target", axis=1), x["target"], test_size = 0.3,
                                                             random_state = i)
        x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.2)
        if (y_train == 1).sum()/181 < 0.7:
            print("random_state: {}".format(i))
            print(x_train.shape, x_valid.shape, y_train.shape, y_valid.shape, x_test.shape, y_test.shape)
            return x_train, x_valid, y_train, y_valid, x_test, y_test


# train

In [6]:
os.listdir("data")

['data_kmeans.csv',
 'data_kmeans_replace.csv',
 'data_n.csv',
 'test',
 'test.txt',
 'UnlabeledWiDS2021.csv']

In [7]:
data = pd.read_csv("data/data_kmeans_replace.csv")

In [8]:
data.shape

(10234, 190)

In [9]:
for i in data:
    if data[i].isna().sum() != 0:
        print(i)
        print(data[i].isna().sum())

In [14]:
x_train, x_valid, y_train, y_valid, x_test, y_test = train_split_dis(data)

random_state: 0
(5730, 189) (3071, 189) (5730,) (3071,) (1433, 189) (1433,)


In [15]:
# class_weights

# CatBoost
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.001,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": False,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42,
    "class_weights":(1,56)
    }



In [16]:
model_cb_w = cb.CatBoostClassifier(**cb_params)
model_cb_w.fit(x_train, 
               y_train,
#                cat_features=categorical, 
               eval_set = [(x_train, y_train), (x_valid, y_valid)], 
               plot= True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x24da7151248>

In [17]:
pred = model_cb_w.predict(x_test)
print(classification_report(y_test, pred, labels=np.unique(pred)))
print("roc_auc_score: ",roc_auc_score(y_test, pred))
print("confusion_matrix: ",confusion_matrix(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1404
           1       0.35      0.45      0.39        29

    accuracy                           0.97      1433
   macro avg       0.67      0.72      0.69      1433
weighted avg       0.98      0.97      0.97      1433

roc_auc_score:  0.7155909224874742
confusion_matrix:  [[1380   24]
 [  16   13]]


# kfold

In [18]:
X = data.drop("target", axis=1)
y= data["target"]

In [19]:
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42,
    "class_weights":(1,56)
}

cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=X, y=y, cv=cv
)

Tue Sep  5 21:41:01 2023, Cross-Validation, 10234 rows, 189 cols
0:	test: 0.8039343	test1: 0.6994688	best: 0.6994688 (0)	total: 12ms	remaining: 23.9s
10:	test: 0.9453013	test1: 0.7792179	best: 0.7812900 (2)	total: 126ms	remaining: 22.7s
20:	test: 0.9601377	test1: 0.8061956	best: 0.8061956 (20)	total: 231ms	remaining: 21.8s
30:	test: 0.9645266	test1: 0.8189964	best: 0.8189964 (30)	total: 333ms	remaining: 21.1s
40:	test: 0.9644179	test1: 0.8350454	best: 0.8350454 (40)	total: 438ms	remaining: 20.9s
50:	test: 0.9675144	test1: 0.8407578	best: 0.8407578 (50)	total: 533ms	remaining: 20.4s
60:	test: 0.9690042	test1: 0.8436540	best: 0.8436540 (60)	total: 630ms	remaining: 20s
70:	test: 0.9733201	test1: 0.8501184	best: 0.8501184 (70)	total: 730ms	remaining: 19.8s
80:	test: 0.9764945	test1: 0.8538946	best: 0.8538946 (80)	total: 827ms	remaining: 19.6s
90:	test: 0.9797047	test1: 0.8498144	best: 0.8550947 (81)	total: 925ms	remaining: 19.4s
100:	test: 0.9814442	test1: 0.8453821	best: 0.8550947 (81)	to

In [20]:
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",roc_auc_score(y_test, pred))
    print("confusion_matrix: ",confusion_matrix(y_test, pred))

0
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      1404
           1       0.25      0.79      0.38        29

    accuracy                           0.95      1433
   macro avg       0.62      0.87      0.68      1433
weighted avg       0.98      0.95      0.96      1433

roc_auc_score:  0.8719790745652815
confusion_matrix:  [[1335   69]
 [   6   23]]
1
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1404
           1       0.31      0.79      0.44        29

    accuracy                           0.96      1433
   macro avg       0.65      0.88      0.71      1433
weighted avg       0.98      0.96      0.97      1433

roc_auc_score:  0.8780332056194127
confusion_matrix:  [[1352   52]
 [   6   23]]
2
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1404
           1       0.24      0.76      0.36        29

    accuracy  

# Байесовская оптимизация

In [22]:
from skopt import BayesSearchCV

In [23]:
import skopt
skopt.__version__

'0.9.0'

In [24]:
bayes_cv_tuner = BayesSearchCV(
    estimator = cb.CatBoostClassifier(
    silent=True
    ),
    search_spaces = {
    "max_depth": (3, 15),
    "subsample" : (0.3, 0.7),
    "leaf_estimation_iterations": (5, 50),
    "colsample_bylevel": (0.3, 0.9),
    "l2_leaf_reg": (2, 500),
    "learning_rate": (0.0001, 0.01),
    "iterations": (100, 500)
    },
    cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    refit = True,
    random_state = 72
    )

In [27]:
def status_print(optim_result):
#     print(optim_result)
    """Status callback durring bayesian hyperparameter search"""

    # Get all the models tested so far in DataFrame format
#     all_models = pd.DataFrame(optim_result.specs["base_estimator"])    

    # Get current parameters and the best parameters    
#     best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Best ROC-AUC: {}'.format(
#         all_models,
        np.round(bayes_cv_tuner.best_score_, 4),
#         bayes_cv_tuner.best_params_
    ))

In [None]:
resultCAT = bayes_cv_tuner.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
