In [5]:
import pandas as pd
import numpy as np
from phik.report import plot_correlation_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, \
    classification_report, roc_auc_score
import matplotlib.pyplot as plt
plt.style.use('dark_background')

In [6]:
df_raw = pd.read_csv('/kaggle/input/leopard-challenge-classification/train.csv')
test = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/test.csv')
df_raw.shape, test.shape

((13863, 26), (5942, 25))

In [7]:
test_cv = test.drop(['oral', 'ID'], axis = 1).copy()
test_cv['tartar'] = test_cv['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data = df_raw.drop(['oral', 'ID'], axis=1).copy()
merged_data['tartar'] = merged_data['tartar'].apply(lambda x: 1 if x == 'Y' else 0)

In [8]:
# Создание бинарного признака на основе BMI
merged_data['BMI'] = merged_data['weight(kg)'] / ((merged_data['height(cm)'] / 100) ** 2)
merged_data['BMI_status'] = (merged_data['BMI'] < 18.5) | (merged_data['BMI'] >= 24.9)
merged_data['BMI_status'] = merged_data['BMI_status'].astype(int)
merged_data['log_tr'] = merged_data['triglyceride'].apply(np.log)
merged_data['log_gtp'] = merged_data['Gtp'].apply(np.log)
merged_data['log_log_alt'] = np.log(np.log(merged_data['ALT']))
merged_data['BMI_log_gtp'] = merged_data['BMI'].apply(np.log)
merged_data = merged_data.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

test_cv['BMI'] = test_cv['weight(kg)'] / ((test_cv['height(cm)'] / 100) ** 2)
test_cv['BMI_status'] = (test_cv['BMI'] < 18.5) | (test_cv['BMI'] >= 24.9)
test_cv['BMI_status'] = test_cv['BMI_status'].astype(int)
test_cv['log_tr'] = test_cv['triglyceride'].apply(np.log)
test_cv['log_gtp'] = test_cv['Gtp'].apply(np.log)
test_cv['log_log_alt'] = np.log(np.log(test_cv['ALT']))
test_cv['BMI_log_gtp'] = test_cv['BMI'].apply(np.log)
test_cv = test_cv.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

In [9]:
X = merged_data.drop('smoking', axis=1)
y = merged_data['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
target_variable = 'smoking'

cat_features = [ 'hearing(left)', 'hearing(right)', 'Urine protein', 'dental caries', 'tartar', 'BMI_status' ]
# cat_features = [ 'hearing(left)', 'hearing(right)', 'Urine protein', 'dental caries', 'tartar' ]
merged_data[cat_features] = merged_data[cat_features].astype(str)
non_cat_features = [col for col in merged_data.columns if col not in cat_features and col != target_variable]

In [11]:
from sklearn.model_selection import train_test_split

X = merged_data.drop(['smoking'], axis=1) 
y = merged_data['smoking']

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape

((11090, 25), (2773, 25))

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Масштабирование признаков
scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
X_train[non_cat_features] = scaler.fit_transform(X_train[non_cat_features])
X_test[non_cat_features] = scaler.transform(X_test[non_cat_features])

X_train.shape, X_test.shape

((11090, 25), (2773, 25))

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((8872, 25), (2218, 25), (8872,), (2218,))

In [12]:
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 700, 2500),
        'depth': trial.suggest_int('depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 5, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'od_type': 'Iter', # тип ранней остановки
        'od_wait': 50, # число итераций для ранней остановки
        'eval_metric': 'AUC',
        'logging_level': 'Silent',
#         'task_type': 'GPU',
        'auto_class_weights': 'Balanced'
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], use_best_model=True, cat_features=cat_features)
    
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    avg_prec = average_precision_score(y_test, y_pred_proba)

    return roc_auc, avg_prec


study = optuna.create_study(directions=['maximize', 'maximize'])
study.optimize(objective, n_trials=150)

best_trial = study.best_trials[0]

roc_auc_best = best_trial.values[0]
avg_prec_best = best_trial.values[1]
print(f"Лучшее значение ROC AUC: {roc_auc_best:.4f}")
print(f"Лучшее значение Average Precision: {avg_prec_best:.4f}")
79/66

[I 2023-10-12 15:56:17,639] A new study created in memory with name: no-name-9c1c368c-73de-4446-bd96-29cf73fd4ee1
[I 2023-10-12 15:56:22,827] Trial 0 finished with values: [0.7275127493383255, 0.4152271426870075] and parameters: {'iterations': 2123, 'depth': 10, 'learning_rate': 0.03619856034293263, 'l2_leaf_reg': 7.374274616028394, 'border_count': 114}. 
[I 2023-10-12 15:56:25,331] Trial 1 finished with values: [0.7210541604802789, 0.4063398760710588] and parameters: {'iterations': 1800, 'depth': 7, 'learning_rate': 0.08528260059645565, 'l2_leaf_reg': 6.899403331592762, 'border_count': 133}. 
[I 2023-10-12 15:56:27,498] Trial 2 finished with values: [0.7237863920986379, 0.40814129845539704] and parameters: {'iterations': 1165, 'depth': 6, 'learning_rate': 0.06108139041240849, 'l2_leaf_reg': 9.571467073599845, 'border_count': 147}. 
[I 2023-10-12 15:56:29,613] Trial 3 finished with values: [0.7237533083726034, 0.4090949712829676] and parameters: {'iterations': 2132, 'depth': 5, 'learni

Лучшее значение ROC AUC: 0.7444
Лучшее значение Average Precision: 0.4468


1.196969696969697

In [14]:
from sklearn.model_selection import StratifiedKFold
import optuna
from sklearn.metrics import roc_auc_score, average_precision_score


def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 700, 2500),
        'depth': trial.suggest_int('depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 5, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'od_type': 'Iter', 
        'od_wait': 50, 
        'eval_metric': 'AUC',
        'logging_level': 'Silent',
        'auto_class_weights': 'Balanced'
    }
    
    roc_aucs = []
    avg_precs = []
    
    # Создание 5 фолдов
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold = X_train.iloc[train_idx]
        y_train_fold = y_train.iloc[train_idx]
        X_val_fold = X_train.iloc[val_idx]
        y_val_fold = y_train.iloc[val_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], use_best_model=True, early_stopping_rounds=params['od_wait'], cat_features=cat_features)
        
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        avg_prec = average_precision_score(y_test, y_pred_proba)
        
        roc_aucs.append(roc_auc)
        avg_precs.append(avg_prec)
    
    # Возвращаем средние значения метрик по всем фолдам
    return np.mean(roc_aucs), np.mean(avg_precs)


In [15]:
study = optuna.create_study(directions=['maximize', 'maximize'])
study.optimize(objective, n_trials=150)

best_trial = study.best_trials[0]

roc_auc_best = best_trial.values[0]
avg_prec_best = best_trial.values[1]
print(f"Лучшее значение ROC AUC: {roc_auc_best:.4f}")
print(f"Лучшее значение Average Precision: {avg_prec_best:.4f}")

[I 2023-10-12 17:08:33,856] A new study created in memory with name: no-name-470daf75-d8a5-410c-b268-5d2a3859e763
[I 2023-10-12 17:10:36,621] Trial 0 finished with values: [0.7231376283003035, 0.4167514395255785] and parameters: {'iterations': 2119, 'depth': 12, 'learning_rate': 0.02027143285302816, 'l2_leaf_reg': 6.64810673265717, 'border_count': 234}. 
[I 2023-10-12 17:10:44,092] Trial 1 finished with values: [0.7169772771286553, 0.39854869495579426] and parameters: {'iterations': 727, 'depth': 5, 'learning_rate': 0.0878824038208824, 'l2_leaf_reg': 8.229032450357412, 'border_count': 255}. 
[I 2023-10-12 17:11:13,747] Trial 2 finished with values: [0.7097650248531404, 0.39075768217318907] and parameters: {'iterations': 1079, 'depth': 4, 'learning_rate': 0.010756285569876843, 'l2_leaf_reg': 5.60791613257778, 'border_count': 251}. 
[I 2023-10-12 17:11:25,713] Trial 3 finished with values: [0.719601381447292, 0.4026917374113027] and parameters: {'iterations': 2115, 'depth': 5, 'learning_

Лучшее значение ROC AUC: 0.7304
Лучшее значение Average Precision: 0.4224


In [16]:
from sklearn.metrics import f1_score

# Возьмите параметры из лучшего trial на основе первой метрики (ROC AUC в данном случае)
best_params = study.best_trials[0].params

model = CatBoostClassifier(**best_params, auto_class_weights='Balanced', logging_level='Silent') #  task_type='GPU' auto_class_weights='Balanced' , 
model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=100, cat_features=cat_features)

y_pred_proba = model.predict_proba(X_test)[:, 1]
# y_pred_proba = model.predict_proba(df[X.columns.to_list()])[:, 1]

thresholds = np.linspace(0.01, 1, 300)
f1_scores = [f1_score(y_test, y_pred_proba > thresh) for thresh in thresholds]
optimal_threshold = thresholds[np.argmax(f1_scores)]

print(f"Оптимальный порог: {optimal_threshold:.5f}")
print(f"Наивысший F1: {max(f1_scores):.5f}")
0.44375/0.45766

Оптимальный порог: 0.44375
Наивысший F1: 0.45766


0.9366147774610831

In [17]:
model.save_model("catboost_model_best_params.cbm")


In [18]:
import json

with open("best_parameters.json", "w") as f:
    json.dump(best_params, f)
