In [51]:
import pandas as pd
import numpy as np
from phik.report import plot_correlation_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, \
    classification_report, roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
plt.style.use('dark_background')

In [52]:
df_raw = pd.read_csv('/kaggle/input/leopard-challenge-classification/train.csv')
test = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/test.csv')
df_raw.shape, test.shape

((13863, 26), (5942, 25))

In [53]:
test_cv = test.drop(['oral', 'ID'], axis = 1).copy()
test_cv['tartar'] = test_cv['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data = df_raw.drop(['oral', 'ID'], axis=1).copy()
merged_data['tartar'] = merged_data['tartar'].apply(lambda x: 1 if x == 'Y' else 0)

In [54]:
# Создание бинарного признака на основе BMI
merged_data['BMI'] = merged_data['weight(kg)'] / ((merged_data['height(cm)'] / 100) ** 2)
merged_data['BMI_status'] = (merged_data['BMI'] < 18.5) | (merged_data['BMI'] >= 24.9)
merged_data['BMI_status'] = merged_data['BMI_status'].astype(int)
merged_data['log_tr'] = merged_data['triglyceride'].apply(np.log)
merged_data['log_gtp'] = merged_data['Gtp'].apply(np.log)
merged_data['log_log_alt'] = np.log(np.log(merged_data['ALT']))
merged_data['BMI_log_gtp'] = merged_data['BMI'].apply(np.log)
merged_data = merged_data.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

test_cv['BMI'] = test_cv['weight(kg)'] / ((test_cv['height(cm)'] / 100) ** 2)
test_cv['BMI_status'] = (test_cv['BMI'] < 18.5) | (test_cv['BMI'] >= 24.9)
test_cv['BMI_status'] = test_cv['BMI_status'].astype(int)
test_cv['log_tr'] = test_cv['triglyceride'].apply(np.log)
test_cv['log_gtp'] = test_cv['Gtp'].apply(np.log)
test_cv['log_log_alt'] = np.log(np.log(test_cv['ALT']))
test_cv['BMI_log_gtp'] = test_cv['BMI'].apply(np.log)
test_cv = test_cv.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

In [55]:
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_recall_curve

In [56]:
from sklearn.model_selection import train_test_split


X = merged_data.drop(['smoking'], axis=1) 
Y = merged_data['smoking']

X.shape, Y.shape

((13863, 25), (13863,))

In [57]:
from sklearn.preprocessing import MinMaxScaler

# Масштабирование признаков
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
test_cv = scaler.transform(test_cv)
X.shape,  test_cv.shape

((13863, 25), (5942, 25))

In [58]:
# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

# Инициализация StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((11090, 25), (2773, 25), (11090,), (2773,))

In [59]:
X_train = pd.DataFrame(X_train)

In [60]:
def objective(trial):
    # Гиперпараметры для оптимизации
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.15),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 3, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'class_weight': 'balanced'
    }
    
    oof_preds = np.zeros(X_train.shape[0])

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_fold_train, y_fold_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_fold_val, y_fold_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # Создание и обучение модели с ранней остановкой
        lgbm_model = LGBMClassifier(**params)
        lgbm_model.fit(
            X_fold_train, y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            early_stopping_rounds=70,
            verbose=False
        )

        # Сохранение предсказаний
        oof_preds[val_idx] = lgbm_model.predict_proba(X_fold_val)[:, 1]

    # Вычисление ROC AUC на out-of-fold предсказаниях
    oof_roc_auc = roc_auc_score(y_train, oof_preds)

    return oof_roc_auc

In [61]:
%%time
# Запуск оптимизации
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

# Вывод лучших гиперпараметров
best_params = study.best_params
print(f"Best Params: {best_params}")

[I 2023-10-12 10:05:56,775] A new study created in memory with name: no-name-0dc1615d-9622-482e-939d-5e3ef79bb5ee




















[I 2023-10-12 10:06:10,855] Trial 0 finished with value: 0.6873629943502825 and parameters: {'n_estimators': 1958, 'learning_rate': 0.055710914759984555, 'max_depth': 10, 'num_leaves': 19, 'min_child_samples': 40, 'feature_fraction': 0.9817654855116916, 'bagging_fraction': 0.3823664555431716, 'bagging_freq': 4, 'lambda_l1': 0.6625695939911336, 'lambda_l2': 2.4955249143227185}. Best is trial 0 with value: 0.6873629943502825.




















[I 2023-10-12 10:06:57,971] Trial 1 finished with value: 0.6995163438256659 and parameters: {'n_estimators': 1157, 'learning_rate': 0.013336746778146988, 'max_depth': 12, 'num_leaves': 28, 'min_child_samples': 82, 'feature_fraction': 0.26131090364626386, 'bagging_fraction': 0.962419770476602, 'bagging_freq': 3, 'lambda_l1': 9.57716340947824, 'lambda_l2': 7.259314584204529}. Best is trial 1 with value: 0.6995163438256659.




















[I 2023-10-12 10:09:25,245] Trial 2 finished with value: 0.7065466101694915 and parameters: {'n_estimators': 2109, 'learning_rate': 0.01912368799501412, 'max_depth': 11, 'num_leaves': 90, 'min_child_samples': 8, 'feature_fraction': 0.6359736909429626, 'bagging_fraction': 0.4265533399132706, 'bagging_freq': 2, 'lambda_l1': 3.8476260070128565, 'lambda_l2': 2.465060612958123}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:10:00,719] Trial 3 finished with value: 0.6888047820823244 and parameters: {'n_estimators': 1511, 'learning_rate': 0.04229041198089479, 'max_depth': 11, 'num_leaves': 81, 'min_child_samples': 37, 'feature_fraction': 0.17409299911659434, 'bagging_fraction': 0.42308954550867917, 'bagging_freq': 5, 'lambda_l1': 4.992590503524797, 'lambda_l2': 2.2287539306752624}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:10:15,815] Trial 4 finished with value: 0.6814952078288943 and parameters: {'n_estimators': 1142, 'learning_rate': 0.14517279894733187, 'max_depth': 7, 'num_leaves': 31, 'min_child_samples': 50, 'feature_fraction': 0.580767353436274, 'bagging_fraction': 0.5819083603749851, 'bagging_freq': 6, 'lambda_l1': 4.930481081539472, 'lambda_l2': 1.5591437979472}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:12:19,483] Trial 5 finished with value: 0.6986609160613397 and parameters: {'n_estimators': 2860, 'learning_rate': 0.023717705336308525, 'max_depth': 8, 'num_leaves': 48, 'min_child_samples': 12, 'feature_fraction': 0.3364716866382102, 'bagging_fraction': 0.9156590365339049, 'bagging_freq': 1, 'lambda_l1': 3.0187985268622484, 'lambda_l2': 0.20035323669635008}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:13:17,235] Trial 6 finished with value: 0.699455609362389 and parameters: {'n_estimators': 2797, 'learning_rate': 0.01101333759325181, 'max_depth': 9, 'num_leaves': 29, 'min_child_samples': 83, 'feature_fraction': 0.9502598339978526, 'bagging_fraction': 0.5769376180336891, 'bagging_freq': 7, 'lambda_l1': 9.052104079155507, 'lambda_l2': 0.5292175878188266}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:14:03,106] Trial 7 finished with value: 0.7012210956416465 and parameters: {'n_estimators': 2267, 'learning_rate': 0.02056525143811002, 'max_depth': 6, 'num_leaves': 60, 'min_child_samples': 9, 'feature_fraction': 0.5319203263628085, 'bagging_fraction': 0.450011750853215, 'bagging_freq': 6, 'lambda_l1': 6.469942377180393, 'lambda_l2': 7.827075502597439}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:14:09,636] Trial 8 finished with value: 0.6818676856335755 and parameters: {'n_estimators': 2461, 'learning_rate': 0.12806020059368148, 'max_depth': 5, 'num_leaves': 70, 'min_child_samples': 29, 'feature_fraction': 0.5768165553017677, 'bagging_fraction': 0.3631507817559232, 'bagging_freq': 7, 'lambda_l1': 0.32140772482371016, 'lambda_l2': 1.7322100681314945}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:14:37,419] Trial 9 finished with value: 0.6921758474576271 and parameters: {'n_estimators': 2368, 'learning_rate': 0.08862315300997228, 'max_depth': 9, 'num_leaves': 78, 'min_child_samples': 62, 'feature_fraction': 0.9610838195761194, 'bagging_fraction': 0.5319410206082877, 'bagging_freq': 4, 'lambda_l1': 6.388947145817273, 'lambda_l2': 8.22523824718533}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:14:54,673] Trial 10 finished with value: 0.679861733252623 and parameters: {'n_estimators': 1799, 'learning_rate': 0.07129060419192908, 'max_depth': 12, 'num_leaves': 100, 'min_child_samples': 3, 'feature_fraction': 0.7348746120708328, 'bagging_fraction': 0.1018281242631276, 'bagging_freq': 1, 'lambda_l1': 2.671192561036216, 'lambda_l2': 4.352457119981357}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:15:15,745] Trial 11 finished with value: 0.6944579802259887 and parameters: {'n_estimators': 2310, 'learning_rate': 0.03575116823141304, 'max_depth': 5, 'num_leaves': 53, 'min_child_samples': 24, 'feature_fraction': 0.42520073461389535, 'bagging_fraction': 0.2845497330905319, 'bagging_freq': 2, 'lambda_l1': 6.954077383783414, 'lambda_l2': 9.685618340408}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:16:39,164] Trial 12 finished with value: 0.7023520984665053 and parameters: {'n_estimators': 2114, 'learning_rate': 0.00816514052433365, 'max_depth': 6, 'num_leaves': 99, 'min_child_samples': 20, 'feature_fraction': 0.681981813557053, 'bagging_fraction': 0.715170451416627, 'bagging_freq': 5, 'lambda_l1': 6.933356130392906, 'lambda_l2': 5.574597024303609}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:17:11,973] Trial 13 finished with value: 0.7018985573042776 and parameters: {'n_estimators': 1677, 'learning_rate': 0.007803289198854549, 'max_depth': 4, 'num_leaves': 100, 'min_child_samples': 19, 'feature_fraction': 0.7469877535093425, 'bagging_fraction': 0.741008448824843, 'bagging_freq': 3, 'lambda_l1': 8.081338987301809, 'lambda_l2': 4.619361899890027}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:18:13,733] Trial 14 finished with value: 0.6973418583535108 and parameters: {'n_estimators': 2076, 'learning_rate': 0.03709771647710346, 'max_depth': 7, 'num_leaves': 86, 'min_child_samples': 63, 'feature_fraction': 0.7220107812857419, 'bagging_fraction': 0.7379329504823648, 'bagging_freq': 3, 'lambda_l1': 4.124449641449265, 'lambda_l2': 3.647070165123494}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:19:20,908] Trial 15 finished with value: 0.6999171711057304 and parameters: {'n_estimators': 2606, 'learning_rate': 0.05318765903636001, 'max_depth': 10, 'num_leaves': 90, 'min_child_samples': 3, 'feature_fraction': 0.8329975538847978, 'bagging_fraction': 0.735989373325695, 'bagging_freq': 5, 'lambda_l1': 7.66140916535214, 'lambda_l2': 5.829831191796266}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:20:22,695] Trial 16 finished with value: 0.7025908494753833 and parameters: {'n_estimators': 2056, 'learning_rate': 0.005803288079927159, 'max_depth': 7, 'num_leaves': 68, 'min_child_samples': 100, 'feature_fraction': 0.6469927710093821, 'bagging_fraction': 0.6484807083230711, 'bagging_freq': 2, 'lambda_l1': 5.794170133839749, 'lambda_l2': 5.763077444647601}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:20:36,671] Trial 17 finished with value: 0.6968490718321226 and parameters: {'n_estimators': 1448, 'learning_rate': 0.028605648619913614, 'max_depth': 8, 'num_leaves': 66, 'min_child_samples': 91, 'feature_fraction': 0.45965396213770915, 'bagging_fraction': 0.2561588295195447, 'bagging_freq': 2, 'lambda_l1': 5.695990465294306, 'lambda_l2': 3.5264960272561674}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:21:37,565] Trial 18 finished with value: 0.7023322740112995 and parameters: {'n_estimators': 1861, 'learning_rate': 0.006159058896475004, 'max_depth': 10, 'num_leaves': 45, 'min_child_samples': 99, 'feature_fraction': 0.8550977524543053, 'bagging_fraction': 0.5205390940979073, 'bagging_freq': 2, 'lambda_l1': 3.9539938199041096, 'lambda_l2': 6.250613919594045}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:22:56,929] Trial 19 finished with value: 0.7011894168684424 and parameters: {'n_estimators': 2633, 'learning_rate': 0.025623546427473096, 'max_depth': 11, 'num_leaves': 71, 'min_child_samples': 70, 'feature_fraction': 0.6195956480158035, 'bagging_fraction': 0.6350186696095388, 'bagging_freq': 1, 'lambda_l1': 2.0214769659267477, 'lambda_l2': 3.5087419096809103}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:23:09,182] Trial 20 finished with value: 0.698093220338983 and parameters: {'n_estimators': 1633, 'learning_rate': 0.0482189105689744, 'max_depth': 7, 'num_leaves': 4, 'min_child_samples': 51, 'feature_fraction': 0.6432341296507832, 'bagging_fraction': 0.8887820264568138, 'bagging_freq': 2, 'lambda_l1': 5.545835824193687, 'lambda_l2': 5.1514436857492205}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:24:20,827] Trial 21 finished with value: 0.7027460653753027 and parameters: {'n_estimators': 2137, 'learning_rate': 0.006348432299346289, 'max_depth': 6, 'num_leaves': 90, 'min_child_samples': 33, 'feature_fraction': 0.6540385845917014, 'bagging_fraction': 0.6655536763766188, 'bagging_freq': 5, 'lambda_l1': 7.2831246736203745, 'lambda_l2': 5.771973548451422}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:25:26,413] Trial 22 finished with value: 0.6992540859564164 and parameters: {'n_estimators': 2150, 'learning_rate': 0.021069435754881537, 'max_depth': 6, 'num_leaves': 90, 'min_child_samples': 36, 'feature_fraction': 0.5142417136762975, 'bagging_fraction': 0.6386385184106408, 'bagging_freq': 3, 'lambda_l1': 8.461991775349734, 'lambda_l2': 6.538139547395481}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:26:33,745] Trial 23 finished with value: 0.7034099576271187 and parameters: {'n_estimators': 1936, 'learning_rate': 0.006361159128326759, 'max_depth': 8, 'num_leaves': 80, 'min_child_samples': 44, 'feature_fraction': 0.6533592787198448, 'bagging_fraction': 0.4950870199601057, 'bagging_freq': 4, 'lambda_l1': 7.5552729857619205, 'lambda_l2': 4.942669814470093}. Best is trial 2 with value: 0.7065466101694915.




















[I 2023-10-12 10:27:13,317] Trial 24 finished with value: 0.6948442796610169 and parameters: {'n_estimators': 1909, 'learning_rate': 0.029923259140904303, 'max_depth': 8, 'num_leaves': 80, 'min_child_samples': 45, 'feature_fraction': 0.793910174614334, 'bagging_fraction': 0.4591640846323099, 'bagging_freq': 6, 'lambda_l1': 7.738675934239081, 'lambda_l2': 4.654003313876887}. Best is trial 2 with value: 0.7065466101694915.


Best Params: {'n_estimators': 2109, 'learning_rate': 0.01912368799501412, 'max_depth': 11, 'num_leaves': 90, 'min_child_samples': 8, 'feature_fraction': 0.6359736909429626, 'bagging_fraction': 0.4265533399132706, 'bagging_freq': 2, 'lambda_l1': 3.8476260070128565, 'lambda_l2': 2.465060612958123}
CPU times: user 23min 52s, sys: 12min 42s, total: 36min 34s
Wall time: 21min 16s


In [62]:
# Обучение с лучшими гиперпараметрами
lgbm_model = LGBMClassifier(**best_params)
lgbm_model.fit(X_train, y_train)

# Предсказания на тестовой выборке
test_preds = lgbm_model.predict_proba(X_test)[:, 1]




In [65]:
lgbm_model.booster_.save_model('LGBM_Optuna_V_1_best_model_no_exta_features.txt')

# Загрузка модели
# import lightgbm as lgb

# loaded_model = lgb.Booster(model_file='best_model.txt')

import json

with open('LGBM_Optuna_V_1_best_params_no_exta_features.json', 'w') as f:
    json.dump(best_params, f)

# import json
# from lightgbm import LGBMClassifier

# # Загрузка сохраненных параметров
# with open('best_params.json', 'r') as f:
#     loaded_params = json.load(f)

# # Обучение модели с загруженными параметрами
# model_with_best_params = LGBMClassifier(**loaded_params)
# model_with_best_params.fit(X_train, y_train)

# # Предсказания с новой моделью
# predictions = model_with_best_params.predict(some_new_data)


In [63]:
# Определение оптимального порога
precision, recall, thresholds = precision_recall_curve(y_test, test_preds)
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_threshold = thresholds[f1_scores[:-1].argmax()]

print(f"Optimal Threshold: {optimal_threshold}")

Optimal Threshold: 0.2151395253285745


In [64]:
# Применение оптимального порога для классификации
y_pred_optimal = (test_preds > optimal_threshold).astype(int)

# Оценка качества модели с новым порогом на тестовой выборке
accuracy = accuracy_score(y_test, y_pred_optimal)
roc_auc = roc_auc_score(y_test, test_preds)
f1 = f1_score(y_test, y_pred_optimal)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
print(f"F1 Score: {f1}")
0.4726

Accuracy: 0.7464839523981248
ROC AUC: 0.7356069653347105
F1 Score: 0.47261815453863465
