In [49]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, classification_report

import catboost as cb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import time

In [50]:
os.listdir("data")

['medical_examination.csv']

In [51]:
data = pd.read_csv('data/medical_examination.csv')

In [52]:
data["age"] = data["age"].apply(lambda x: round(x/365, 1))

In [53]:
data["sex"].loc[data["sex"] == 3] = data["sex"].median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [54]:
data["sex"].nunique()

2

In [55]:
"""делю данные на 80% обучени и 20% тест, такое деление нужно для KFold"""
gr = data.shape[0] * 0.8
X = data[: int(gr)]
x_test = data[int(gr) :]
y = X["cardio"]
X = X.drop("cardio", axis=1)
y_test = x_test["cardio"]
x_test = x_test.drop("cardio", axis=1)
X.shape,y.shape, x_test.shape, y_test.shape

((56000, 12), (56000,), (14000, 12), (14000,))

In [56]:
def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    if categorical != None:
        X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [57]:
cb_params = {
    "n_estimators": 2000,
    'subsample': 0.7,
    'max_depth': 6,
    'leaf_estimation_iterations': 50,
    'l2_leaf_reg': 2,
    "learning_rate": 0.008826816129789937,
    'colsample_bylevel': 0.3,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": False,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42,
}

cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=X, y=y, cv=cv
)

Mon Sep 11 03:10:58 2023, Cross-Validation, 56000 rows, 12 cols
Fold 1, Valid score = 0.80575
Fold 2, Valid score = 0.81345
Fold 3, Valid score = 0.80165
Fold 4, Valid score = 0.79714
Fold 5, Valid score = 0.80218
Score by each fold: [0.80575, 0.81345, 0.80165, 0.79714, 0.80218]


In [58]:
result = pd.DataFrame(columns=["model","rocauc","accuracy"])
oof_best_model = pd.DataFrame()

In [59]:
best_rocauc = 0
for num,i in enumerate(estimators):
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    res = {"model": f"catboost_{num}_0","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    if best_rocauc < rocauc:
        best_pred = num
        best_rocauc = rocauc
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))

best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column[-1:])
    print("*" * 20)
result[-6:]

              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6992
           1       0.75      0.70      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.732469364857946
accuracy:  0.7324285714285714
confusion_matrix:  [[5371 1621]
 [2125 4883]]
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6992
           1       0.75      0.70      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7319665070582949
accuracy:  0.7319285714285715
confusion_matrix:  [[5350 1642]
 [2111 4897]]
              precision    recall  f1-score   support

           0       0.72      0.76      0.74      6992
           1    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
0,catboost_0_0,0.732469,0.732429
1,catboost_1_0,0.731967,0.731929
2,catboost_2_0,0.73325,0.733214
3,catboost_3_0,0.733037,0.733
4,catboost_4_0,0.732689,0.732643
5,best_catboost_2_0,0.73325,0.733214


In [60]:
oof_best_model

Unnamed: 0,best_best_catboost_2_0
0,0.160621
1,0.869001
2,0.748102
3,0.891870
4,0.110501
...,...
69995,0.242193
69996,0.828334
69997,0.922852
69998,0.682755


In [61]:
data_test = X.copy()

In [62]:
data_test = data_test.drop("sex", axis=1)

In [63]:
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=data_test, y=y, cv=cv
)

Mon Sep 11 03:13:14 2023, Cross-Validation, 56000 rows, 11 cols
Fold 1, Valid score = 0.80504
Fold 2, Valid score = 0.81305
Fold 3, Valid score = 0.80092
Fold 4, Valid score = 0.79731
Fold 5, Valid score = 0.8021
Score by each fold: [0.80504, 0.81305, 0.80092, 0.79731, 0.8021]


In [64]:
best_rocauc = 0
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print(type(accuracy), accuracy)
    print("best_rocauc", best_rocauc)
    res = {"model": f"catboost_{num}_sex","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))
best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
if "best_" + best_res.iloc[0][0] in oof_best_model.columns:
    pass
else:
    oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column[-1:])
    print("*" * 20)
result[-6:]

0
<class 'numpy.float64'> 0.7322142857142857
best_rocauc 0
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6992
           1       0.75      0.69      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7322584258069235
accuracy:  0.7322142857142857
confusion_matrix:  [[5390 1602]
 [2147 4861]]
1
<class 'numpy.float64'> 0.7329285714285714
best_rocauc 0
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6992
           1       0.75      0.70      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7329684675506515
accuracy:  0.7329285714285714
confusion_matrix:  [[5369 1623]
 [2116 4892]]
2
<cla

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
7,catboost_1_sex,0.732968,0.732929
8,catboost_2_sex,0.731465,0.731429
9,catboost_3_sex,0.731959,0.731929
10,catboost_4_sex,0.732399,0.732357
11,best_catboost_2_0,0.73325,0.733214
12,best_catboost_2_0,0.73325,0.733214


In [65]:
oof_best_model

Unnamed: 0,best_best_catboost_2_0
0,0.160621
1,0.869001
2,0.748102
3,0.891870
4,0.110501
...,...
69995,0.242193
69996,0.828334
69997,0.922852
69998,0.682755


In [66]:
# data_test = X.copy()

In [67]:
data_test = data_test.drop("alco", axis=1)

In [68]:
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=data_test, y=y, cv=cv
)

Mon Sep 11 03:15:37 2023, Cross-Validation, 56000 rows, 10 cols
Fold 1, Valid score = 0.80554
Fold 2, Valid score = 0.8127
Fold 3, Valid score = 0.80094
Fold 4, Valid score = 0.79579
Fold 5, Valid score = 0.80219
Score by each fold: [0.80554, 0.8127, 0.80094, 0.79579, 0.80219]


In [69]:
best_rocauc = 0
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print(type(accuracy), accuracy)
    print("best_rocauc", best_rocauc)
    res = {"model": f"catboost_{num}_alco","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))
best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
if "best_" + best_res.iloc[0][0] in oof_best_model.columns:
    pass
else:
    oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column[-1:])
    print("*" * 20)
result[-6:]

0
<class 'numpy.float64'> 0.7313571428571428
best_rocauc 0
              precision    recall  f1-score   support

           0       0.72      0.76      0.74      6992
           1       0.74      0.71      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7313862614024639
accuracy:  0.7313571428571428
confusion_matrix:  [[5292 1700]
 [2061 4947]]
1
<class 'numpy.float64'> 0.7321428571428571
best_rocauc 0
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6992
           1       0.75      0.70      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7321823440748984
accuracy:  0.7321428571428571
confusion_matrix:  [[5361 1631]
 [2119 4889]]
2
<cla

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
13,catboost_0_alco,0.731386,0.731357
14,catboost_1_alco,0.732182,0.732143
15,catboost_2_alco,0.732176,0.732143
16,catboost_3_alco,0.733614,0.733571
17,catboost_4_alco,0.733467,0.733429
18,best_catboost_3_alco,0.733614,0.733571


In [70]:
for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column)
    print("*" * 20)

num:  0
    Feature Id  Importances
0        ap_hi    36.850917
1          age    16.104872
2  cholesterol    10.648523
3        ap_lo     9.890168
4       weight     7.450286
5       height     4.889265
6           id     4.301468
7         gluc     4.039662
8       active     3.234839
9        smoke     2.590001
********************
num:  1
    Feature Id  Importances
0        ap_hi    44.258152
1          age    15.776666
2  cholesterol    12.017481
3        ap_lo     9.781091
4       weight     5.689121
5         gluc     3.328583
6        smoke     2.546164
7       active     2.525830
8       height     2.084763
9           id     1.992149
********************
num:  2
    Feature Id  Importances
0        ap_hi    35.728282
1          age    15.984658
2        ap_lo    11.587500
3  cholesterol    11.042029
4       weight     7.252533
5       height     4.447806
6           id     4.276229
7         gluc     4.019274
8       active     2.858963
9        smoke     2.802726
**********

In [73]:
data_test = data_test.drop(["smoke", "id"], axis=1)

In [75]:
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=data_test, y=y, cv=cv
)

Mon Sep 11 03:21:14 2023, Cross-Validation, 56000 rows, 8 cols
Fold 1, Valid score = 0.80413
Fold 2, Valid score = 0.81265
Fold 3, Valid score = 0.80001
Fold 4, Valid score = 0.79619
Fold 5, Valid score = 0.80169
Score by each fold: [0.80413, 0.81265, 0.80001, 0.79619, 0.80169]


In [76]:
best_rocauc = 0
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print(type(accuracy), accuracy)
    print("best_rocauc", best_rocauc)
    res = {"model": f"catboost_{num}_smoke","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))
best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
if "best_" + best_res.iloc[0][0] in oof_best_model.columns:
    pass
else:
    oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column[-1:])
    print("*" * 20)
result[-6:]

0
<class 'numpy.float64'> 0.7328571428571429
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.76      0.69      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.732906426653292
accuracy:  0.7328571428571429
confusion_matrix:  [[5426 1566]
 [2174 4834]]
1
<class 'numpy.float64'> 0.7330714285714286
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.76      0.69      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7331222636698954
accuracy:  0.7330714285714286
confusion_matrix:  [[5437 1555]
 [2182 4826]]
2
<clas

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
19,catboost_0_smoke,0.732906,0.732857
20,catboost_1_smoke,0.733122,0.733071
21,catboost_2_smoke,0.733263,0.733214
22,catboost_3_smoke,0.734332,0.734286
23,catboost_4_smoke,0.732906,0.732857
24,best_catboost_3_smoke,0.734332,0.734286


In [77]:
data_test = data_test.drop(["active"], axis=1)

In [78]:
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=data_test, y=y, cv=cv
)

Mon Sep 11 03:25:05 2023, Cross-Validation, 56000 rows, 7 cols
Fold 1, Valid score = 0.80399
Fold 2, Valid score = 0.811
Fold 3, Valid score = 0.79923
Fold 4, Valid score = 0.79522
Fold 5, Valid score = 0.79986
Score by each fold: [0.80399, 0.811, 0.79923, 0.79522, 0.79986]


In [79]:
best_rocauc = 0
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print(type(accuracy), accuracy)
    print("best_rocauc", best_rocauc)
    res = {"model": f"catboost_{num}_active","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))
best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
if "best_" + best_res.iloc[0][0] in oof_best_model.columns:
    pass
else:
    oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column)
    print("*" * 20)
result[-6:]

0
<class 'numpy.float64'> 0.7314285714285714
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.75      0.69      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7314794860140225
accuracy:  0.7314285714285714
confusion_matrix:  [[5426 1566]
 [2194 4814]]
1
<class 'numpy.float64'> 0.7316428571428572
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.77      0.74      6992
           1       0.75      0.69      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7316922209857581
accuracy:  0.7316428571428572
confusion_matrix:  [[5418 1574]
 [2183 4825]]
2
<cla

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
26,catboost_1_active,0.731692,0.731643
27,catboost_2_active,0.732481,0.732429
28,catboost_3_active,0.730835,0.730786
29,catboost_4_active,0.731267,0.731214
30,best_catboost_3_smoke,0.734332,0.734286
31,best_catboost_3_smoke,0.734332,0.734286


In [80]:
data_test = data_test.drop(["height"], axis=1)

In [81]:
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=data_test, y=y, cv=cv
)

Mon Sep 11 03:33:35 2023, Cross-Validation, 56000 rows, 6 cols
Fold 1, Valid score = 0.80391
Fold 2, Valid score = 0.81094
Fold 3, Valid score = 0.79182
Fold 4, Valid score = 0.79515
Fold 5, Valid score = 0.79343
Score by each fold: [0.80391, 0.81094, 0.79182, 0.79515, 0.79343]


In [82]:
best_rocauc = 0
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print(type(accuracy), accuracy)
    print("best_rocauc", best_rocauc)
    res = {"model": f"catboost_{num}_height","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))
best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
if "best_" + best_res.iloc[0][0] in oof_best_model.columns:
    pass
else:
    oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column)
    print("*" * 20)
result[-6:]

0
<class 'numpy.float64'> 0.7308571428571429
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.76      0.68      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7309126281307796
accuracy:  0.7308571428571429
confusion_matrix:  [[5450 1542]
 [2226 4782]]
1
<class 'numpy.float64'> 0.7319285714285715
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.76      0.69      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7319808744239993
accuracy:  0.7319285714285715
confusion_matrix:  [[5438 1554]
 [2199 4809]]
2
<cla

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
35,catboost_3_height,0.730404,0.730357
36,catboost_4_height,0.725369,0.725286
37,best_catboost_3_smoke,0.734332,0.734286
38,best_catboost_3_smoke,0.734332,0.734286
39,best_catboost_3_smoke,0.734332,0.734286
40,best_catboost_3_smoke,0.734332,0.734286


In [84]:
data_test = data_test.drop(["gluc"], axis=1)

In [85]:
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=data_test, y=y, cv=cv
)

Mon Sep 11 03:39:29 2023, Cross-Validation, 56000 rows, 5 cols
Fold 1, Valid score = 0.79793
Fold 2, Valid score = 0.80946
Fold 3, Valid score = 0.7894
Fold 4, Valid score = 0.78733
Fold 5, Valid score = 0.79359
Score by each fold: [0.79793, 0.80946, 0.7894, 0.78733, 0.79359]


In [86]:
best_rocauc = 0
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print(type(accuracy), accuracy)
    print("best_rocauc", best_rocauc)
    res = {"model": f"catboost_{num}_gluc","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))
best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
if "best_" + best_res.iloc[0][0] in oof_best_model.columns:
    pass
else:
    oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column)
    print("*" * 20)
result[-6:]

0
<class 'numpy.float64'> 0.7240714285714286
best_rocauc 0
              precision    recall  f1-score   support

           0       0.70      0.79      0.74      6992
           1       0.76      0.66      0.70      7008

    accuracy                           0.72     14000
   macro avg       0.73      0.72      0.72     14000
weighted avg       0.73      0.72      0.72     14000

roc_auc_score:  0.7241500070530704
accuracy:  0.7240714285714286
confusion_matrix:  [[5544 1448]
 [2415 4593]]
1
<class 'numpy.float64'> 0.7311428571428571
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.76      0.68      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7311958938068817
accuracy:  0.7311428571428571
confusion_matrix:  [[5437 1555]
 [2209 4799]]
2
<cla

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
48,best_catboost_3_smoke,0.734332,0.734286
49,best_catboost_3_smoke,0.734332,0.734286
50,best_catboost_3_smoke,0.734332,0.734286
51,best_catboost_3_smoke,0.734332,0.734286
52,best_catboost_3_smoke,0.734332,0.734286
53,best_catboost_3_smoke,0.734332,0.734286


In [87]:
data_test = data_test.drop(["weight"], axis=1)

In [88]:
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=data_test, y=y, cv=cv
)

Mon Sep 11 03:40:53 2023, Cross-Validation, 56000 rows, 4 cols
Fold 1, Valid score = 0.8016
Fold 2, Valid score = 0.80687
Fold 3, Valid score = 0.79411
Fold 4, Valid score = 0.7856
Fold 5, Valid score = 0.79727
Score by each fold: [0.8016, 0.80687, 0.79411, 0.7856, 0.79727]


In [89]:
best_rocauc = 0
for num,i in enumerate(estimators):
    print(num)
    pred = i.predict(x_test)
    rocauc =  roc_auc_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    print(type(accuracy), accuracy)
    print("best_rocauc", best_rocauc)
    res = {"model": f"catboost_{num}_weight","accuracy": accuracy, "rocauc":rocauc}
    result= result.append(res,ignore_index=True)
    print(classification_report(y_test, pred, labels=np.unique(pred)))
    print("roc_auc_score: ",rocauc)
    print("accuracy: ", accuracy)
    print("confusion_matrix: ",confusion_matrix(y_test, pred))
best_res = result.loc[result["rocauc"] == result["rocauc"].max()]
best_res['model'].replace(best_res.iloc[0][0], "best_" + best_res.iloc[0][0], inplace=True)
result= result.append(best_res, ignore_index=True)
print("best_pred: ", best_pred)
if "best_" + best_res.iloc[0][0] in oof_best_model.columns:
    pass
else:
    oof_best_model["best_" + best_res.iloc[0][0]] = estimators[best_pred].predict_proba(data.drop("cardio", axis=1))[:,1]

for num, i in enumerate(estimators):
    print("num: ",num)
    importance_column =i.get_feature_importance(prettified=True)
    print(importance_column)
    print("*" * 20)
result[-6:]

0
<class 'numpy.float64'> 0.7295714285714285
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.76      0.68      0.72      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.729630504007189
accuracy:  0.7295714285714285
confusion_matrix:  [[5463 1529]
 [2257 4751]]
1
<class 'numpy.float64'> 0.7280714285714286
best_rocauc 0
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      6992
           1       0.75      0.68      0.71      7008

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000

roc_auc_score:  0.7281294408221267
accuracy:  0.7280714285714286
confusion_matrix:  [[5446 1546]
 [2261 4747]]
2
<clas

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,model,rocauc,accuracy
69,best_catboost_3_smoke,0.734332,0.734286
70,best_catboost_3_smoke,0.734332,0.734286
71,best_catboost_3_smoke,0.734332,0.734286
72,best_catboost_3_smoke,0.734332,0.734286
73,best_catboost_3_smoke,0.734332,0.734286
74,best_catboost_3_smoke,0.734332,0.734286


In [93]:
result[::]

Unnamed: 0,model,rocauc,accuracy
0,catboost_0_0,0.732469,0.732429
1,catboost_1_0,0.731967,0.731929
2,catboost_2_0,0.733250,0.733214
3,catboost_3_0,0.733037,0.733000
4,catboost_4_0,0.732689,0.732643
...,...,...,...
70,best_catboost_3_smoke,0.734332,0.734286
71,best_catboost_3_smoke,0.734332,0.734286
72,best_catboost_3_smoke,0.734332,0.734286
73,best_catboost_3_smoke,0.734332,0.734286


In [96]:
oof_best_model.to_csv("data/oof.csv", index=False)