In [13]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, f1_score
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
plt.style.use('dark_background')

In [2]:
df_raw = pd.read_csv('/kaggle/input/leopard-challenge-classification/train.csv')
test = pd.read_csv(r'/kaggle/input/leopard-challenge-classification/test.csv')
df_raw.shape, test.shape

((13863, 26), (5942, 25))

In [3]:
test_cv = test.drop(['oral', 'ID'], axis = 1).copy()
test_cv['tartar'] = test_cv['tartar'].apply(lambda x: 1 if x == 'Y' else 0)
merged_data = df_raw.drop(['oral', 'ID'], axis=1).copy()
merged_data['tartar'] = merged_data['tartar'].apply(lambda x: 1 if x == 'Y' else 0)

In [4]:
# Создание бинарного признака на основе BMI
merged_data['BMI'] = merged_data['weight(kg)'] / ((merged_data['height(cm)'] / 100) ** 2)
merged_data['BMI_status'] = (merged_data['BMI'] < 18.5) | (merged_data['BMI'] >= 24.9)
merged_data['BMI_status'] = merged_data['BMI_status'].astype(int)
merged_data['log_tr'] = merged_data['triglyceride'].apply(np.log)
merged_data['log_gtp'] = merged_data['Gtp'].apply(np.log)
merged_data['log_log_alt'] = np.log(np.log(merged_data['ALT']))
merged_data['BMI_log_gtp'] = merged_data['BMI'].apply(np.log)
merged_data = merged_data.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

test_cv['BMI'] = test_cv['weight(kg)'] / ((test_cv['height(cm)'] / 100) ** 2)
test_cv['BMI_status'] = (test_cv['BMI'] < 18.5) | (test_cv['BMI'] >= 24.9)
test_cv['BMI_status'] = test_cv['BMI_status'].astype(int)
test_cv['log_tr'] = test_cv['triglyceride'].apply(np.log)
test_cv['log_gtp'] = test_cv['Gtp'].apply(np.log)
test_cv['log_log_alt'] = np.log(np.log(test_cv['ALT']))
test_cv['BMI_log_gtp'] = test_cv['BMI'].apply(np.log)
test_cv = test_cv.drop(['triglyceride', 'Gtp', 'ALT', 'BMI'], axis=1)

In [6]:
X = merged_data.drop('smoking', axis=1)
y = merged_data['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [20]:
import optuna

def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.15),
        'n_estimators': trial.suggest_int('n_estimators', 800, 2500),
        'max_depth': trial.suggest_int('max_depth', 3, 11),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'scale_pos_weight': 1/y_train.mean(),
        'early_stopping_rounds': 70,
        'eval_metric': "auc"
    }
    
    f1_scores = []
    
    for train_index, val_index in kf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        model = XGBClassifier(**params)
        eval_set = [(X_val_fold, y_val_fold)]
        model.fit(X_train_fold, y_train_fold, eval_set=eval_set, verbose=100) # , early_stopping_rounds=50

        y_prob_fold = model.predict_proba(X_val_fold)[:, 1]
        thresholds = np.linspace(0, 1, 300)
        fold_f1_scores = [f1_score(y_val_fold, y_prob_fold > thresh) for thresh in thresholds]
        f1_scores.append(max(fold_f1_scores))
    
    return -np.mean(f1_scores)  # Среднее значение F1 по всем фолдам

# Запуск оптимизации Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=25)  

# Лучшие параметры
best_params = study.best_params

print("Best parameters:", best_params)
print("Best F1 score:", -study.best_value)


[I 2023-10-12 15:08:37,000] A new study created in memory with name: no-name-d62394bd-36d1-4fe6-8584-10464161d096


[0]	validation_0-auc:0.56958
[100]	validation_0-auc:0.66614
[200]	validation_0-auc:0.67344
[264]	validation_0-auc:0.67391
[0]	validation_0-auc:0.60569
[100]	validation_0-auc:0.70013
[115]	validation_0-auc:0.69749
[0]	validation_0-auc:0.62321
[100]	validation_0-auc:0.70597
[200]	validation_0-auc:0.71262
[249]	validation_0-auc:0.71439
[0]	validation_0-auc:0.59120
[100]	validation_0-auc:0.68463
[200]	validation_0-auc:0.70051
[300]	validation_0-auc:0.70127
[335]	validation_0-auc:0.70367
[0]	validation_0-auc:0.61434
[100]	validation_0-auc:0.70012
[165]	validation_0-auc:0.70108


[I 2023-10-12 15:09:01,105] Trial 0 finished with value: -0.43543834672647935 and parameters: {'learning_rate': 0.13213251588820324, 'n_estimators': 2108, 'max_depth': 10, 'subsample': 0.5152078440520851, 'colsample_bytree': 0.7294807802672215, 'gamma': 0.0885468423472584}. Best is trial 0 with value: -0.43543834672647935.


[0]	validation_0-auc:0.61412
[100]	validation_0-auc:0.67456
[200]	validation_0-auc:0.67611
[300]	validation_0-auc:0.67920
[400]	validation_0-auc:0.68363
[500]	validation_0-auc:0.68558
[600]	validation_0-auc:0.68728
[689]	validation_0-auc:0.68664
[0]	validation_0-auc:0.59322
[100]	validation_0-auc:0.70408
[200]	validation_0-auc:0.71303
[269]	validation_0-auc:0.71233
[0]	validation_0-auc:0.59717
[100]	validation_0-auc:0.71513
[200]	validation_0-auc:0.72121
[300]	validation_0-auc:0.72378
[400]	validation_0-auc:0.72681
[500]	validation_0-auc:0.72858
[600]	validation_0-auc:0.72965
[641]	validation_0-auc:0.72923
[0]	validation_0-auc:0.58400
[100]	validation_0-auc:0.69961
[200]	validation_0-auc:0.70922
[300]	validation_0-auc:0.71398
[400]	validation_0-auc:0.71842
[494]	validation_0-auc:0.71804
[0]	validation_0-auc:0.59100
[100]	validation_0-auc:0.71373
[200]	validation_0-auc:0.72101
[300]	validation_0-auc:0.72455
[380]	validation_0-auc:0.72246


[I 2023-10-12 15:10:00,777] Trial 1 finished with value: -0.44987852734499273 and parameters: {'learning_rate': 0.04142494487977635, 'n_estimators': 1653, 'max_depth': 11, 'subsample': 0.5628148240547551, 'colsample_bytree': 0.9884625987373434, 'gamma': 0.22355658732139855}. Best is trial 1 with value: -0.44987852734499273.


[0]	validation_0-auc:0.57849
[100]	validation_0-auc:0.66004
[200]	validation_0-auc:0.67248
[300]	validation_0-auc:0.67624
[400]	validation_0-auc:0.67850
[500]	validation_0-auc:0.68061
[600]	validation_0-auc:0.68244
[686]	validation_0-auc:0.68292
[0]	validation_0-auc:0.60662
[100]	validation_0-auc:0.70278
[118]	validation_0-auc:0.70355
[0]	validation_0-auc:0.63616
[100]	validation_0-auc:0.71059
[117]	validation_0-auc:0.70894
[0]	validation_0-auc:0.60598
[100]	validation_0-auc:0.68960
[131]	validation_0-auc:0.69217
[0]	validation_0-auc:0.62625
[100]	validation_0-auc:0.70326
[177]	validation_0-auc:0.70297


[I 2023-10-12 15:10:19,955] Trial 2 finished with value: -0.441413376064734 and parameters: {'learning_rate': 0.08081503864417493, 'n_estimators': 1205, 'max_depth': 8, 'subsample': 0.8407110999450726, 'colsample_bytree': 0.7994245372065556, 'gamma': 0.14762338816312348}. Best is trial 1 with value: -0.44987852734499273.


[0]	validation_0-auc:0.59776
[100]	validation_0-auc:0.66540
[200]	validation_0-auc:0.67461
[300]	validation_0-auc:0.67801
[396]	validation_0-auc:0.67759
[0]	validation_0-auc:0.60572
[100]	validation_0-auc:0.71019
[200]	validation_0-auc:0.71498
[226]	validation_0-auc:0.71310
[0]	validation_0-auc:0.61538
[100]	validation_0-auc:0.72709
[200]	validation_0-auc:0.72825
[226]	validation_0-auc:0.72861
[0]	validation_0-auc:0.59449
[100]	validation_0-auc:0.69904
[200]	validation_0-auc:0.70873
[300]	validation_0-auc:0.71035
[310]	validation_0-auc:0.71107
[0]	validation_0-auc:0.62257
[100]	validation_0-auc:0.73404
[180]	validation_0-auc:0.73103


[I 2023-10-12 15:10:45,606] Trial 3 finished with value: -0.45205954235751855 and parameters: {'learning_rate': 0.116329129235043, 'n_estimators': 1307, 'max_depth': 10, 'subsample': 0.7987264179358762, 'colsample_bytree': 0.6528195620089938, 'gamma': 0.4154236021302938}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.59145
[100]	validation_0-auc:0.65758
[200]	validation_0-auc:0.67203
[300]	validation_0-auc:0.67426
[400]	validation_0-auc:0.67462
[500]	validation_0-auc:0.67529
[553]	validation_0-auc:0.67463
[0]	validation_0-auc:0.59017
[100]	validation_0-auc:0.69633
[0]	validation_0-auc:0.59427
[100]	validation_0-auc:0.72282
[200]	validation_0-auc:0.72586
[233]	validation_0-auc:0.72537
[0]	validation_0-auc:0.58167
[100]	validation_0-auc:0.69231
[200]	validation_0-auc:0.70291
[300]	validation_0-auc:0.70509
[377]	validation_0-auc:0.70418
[0]	validation_0-auc:0.58175
[100]	validation_0-auc:0.71073
[178]	validation_0-auc:0.71206


[I 2023-10-12 15:11:13,314] Trial 4 finished with value: -0.4462413751702933 and parameters: {'learning_rate': 0.14301132827394303, 'n_estimators': 1990, 'max_depth': 10, 'subsample': 0.6625562923850108, 'colsample_bytree': 0.7106939415080415, 'gamma': 0.4473193509750688}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.62017
[100]	validation_0-auc:0.65674
[103]	validation_0-auc:0.65771
[0]	validation_0-auc:0.62807
[100]	validation_0-auc:0.68872
[103]	validation_0-auc:0.68971
[0]	validation_0-auc:0.63129
[100]	validation_0-auc:0.69355
[146]	validation_0-auc:0.69198
[0]	validation_0-auc:0.62472
[100]	validation_0-auc:0.69035
[145]	validation_0-auc:0.69097
[0]	validation_0-auc:0.64683
[100]	validation_0-auc:0.69562
[124]	validation_0-auc:0.69132


[I 2023-10-12 15:11:21,092] Trial 5 finished with value: -0.42739003317799124 and parameters: {'learning_rate': 0.14543713176514583, 'n_estimators': 1885, 'max_depth': 5, 'subsample': 0.8815628641633093, 'colsample_bytree': 0.7918461407018026, 'gamma': 0.11938086774018752}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.57047
[100]	validation_0-auc:0.65164
[200]	validation_0-auc:0.66480
[300]	validation_0-auc:0.66966
[400]	validation_0-auc:0.67519
[500]	validation_0-auc:0.67774
[571]	validation_0-auc:0.67669
[0]	validation_0-auc:0.59786
[100]	validation_0-auc:0.70280
[200]	validation_0-auc:0.71008
[300]	validation_0-auc:0.71114
[313]	validation_0-auc:0.71190
[0]	validation_0-auc:0.60535
[100]	validation_0-auc:0.71594
[103]	validation_0-auc:0.71522
[0]	validation_0-auc:0.61454
[100]	validation_0-auc:0.68629
[121]	validation_0-auc:0.68576
[0]	validation_0-auc:0.60624
[100]	validation_0-auc:0.70144
[177]	validation_0-auc:0.69634


[I 2023-10-12 15:11:44,006] Trial 6 finished with value: -0.43786129928882733 and parameters: {'learning_rate': 0.11421473915309151, 'n_estimators': 2120, 'max_depth': 8, 'subsample': 0.7512750966524716, 'colsample_bytree': 0.9402502771646808, 'gamma': 0.026804732718884228}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.61606
[100]	validation_0-auc:0.66558
[116]	validation_0-auc:0.66514
[0]	validation_0-auc:0.63107
[100]	validation_0-auc:0.69811
[135]	validation_0-auc:0.69602
[0]	validation_0-auc:0.63046
[100]	validation_0-auc:0.71879
[159]	validation_0-auc:0.71615
[0]	validation_0-auc:0.60237
[100]	validation_0-auc:0.69432
[156]	validation_0-auc:0.69381
[0]	validation_0-auc:0.61235
[100]	validation_0-auc:0.70105
[198]	validation_0-auc:0.70123


[I 2023-10-12 15:11:54,762] Trial 7 finished with value: -0.433312877142356 and parameters: {'learning_rate': 0.06928327460262962, 'n_estimators': 2028, 'max_depth': 6, 'subsample': 0.9510531719104854, 'colsample_bytree': 0.9818771897633806, 'gamma': 0.20275631561049628}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.61566
[88]	validation_0-auc:0.65276
[0]	validation_0-auc:0.63110
[100]	validation_0-auc:0.69873
[107]	validation_0-auc:0.69680
[0]	validation_0-auc:0.60802
[100]	validation_0-auc:0.69865
[126]	validation_0-auc:0.69693
[0]	validation_0-auc:0.61579
[100]	validation_0-auc:0.68037
[0]	validation_0-auc:0.61527
[98]	validation_0-auc:0.69218


[I 2023-10-12 15:12:03,270] Trial 8 finished with value: -0.4241654343572094 and parameters: {'learning_rate': 0.09770052658766644, 'n_estimators': 1015, 'max_depth': 6, 'subsample': 0.8155784376603934, 'colsample_bytree': 0.5206302827387235, 'gamma': 0.41253642074507996}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.57389
[100]	validation_0-auc:0.65812
[200]	validation_0-auc:0.66676
[300]	validation_0-auc:0.67099
[353]	validation_0-auc:0.67056
[0]	validation_0-auc:0.59498
[100]	validation_0-auc:0.70894
[200]	validation_0-auc:0.71307
[300]	validation_0-auc:0.71358
[400]	validation_0-auc:0.71597
[465]	validation_0-auc:0.71530
[0]	validation_0-auc:0.58639
[100]	validation_0-auc:0.71004
[200]	validation_0-auc:0.71992
[300]	validation_0-auc:0.72122
[400]	validation_0-auc:0.72362
[500]	validation_0-auc:0.72373
[529]	validation_0-auc:0.72375
[0]	validation_0-auc:0.59481
[100]	validation_0-auc:0.69392
[200]	validation_0-auc:0.70461
[244]	validation_0-auc:0.70437
[0]	validation_0-auc:0.60765
[100]	validation_0-auc:0.71406
[200]	validation_0-auc:0.72137
[300]	validation_0-auc:0.71987
[325]	validation_0-auc:0.71932


[I 2023-10-12 15:12:33,053] Trial 9 finished with value: -0.4432866185513489 and parameters: {'learning_rate': 0.12064405853197577, 'n_estimators': 1490, 'max_depth': 10, 'subsample': 0.7174506014917523, 'colsample_bytree': 0.5003744539580387, 'gamma': 0.47665101152939093}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.60978
[100]	validation_0-auc:0.67606
[131]	validation_0-auc:0.67404
[0]	validation_0-auc:0.62515
[100]	validation_0-auc:0.69949
[173]	validation_0-auc:0.69618
[0]	validation_0-auc:0.61767
[100]	validation_0-auc:0.69860
[200]	validation_0-auc:0.70406
[255]	validation_0-auc:0.70348
[0]	validation_0-auc:0.62278
[100]	validation_0-auc:0.69037
[200]	validation_0-auc:0.69157
[239]	validation_0-auc:0.69292
[0]	validation_0-auc:0.62221
[100]	validation_0-auc:0.69636
[168]	validation_0-auc:0.68924


[I 2023-10-12 15:12:40,369] Trial 10 finished with value: -0.4255145729686783 and parameters: {'learning_rate': 0.11106283901957457, 'n_estimators': 2430, 'max_depth': 3, 'subsample': 0.9750285360497621, 'colsample_bytree': 0.600394048053532, 'gamma': 0.3420335466917618}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.58277
[100]	validation_0-auc:0.66597
[200]	validation_0-auc:0.67355
[300]	validation_0-auc:0.67936
[400]	validation_0-auc:0.68348
[500]	validation_0-auc:0.68511
[548]	validation_0-auc:0.68575
[0]	validation_0-auc:0.58839
[100]	validation_0-auc:0.70995
[200]	validation_0-auc:0.72017
[300]	validation_0-auc:0.72282
[400]	validation_0-auc:0.72476
[446]	validation_0-auc:0.72399
[0]	validation_0-auc:0.60070
[100]	validation_0-auc:0.71677
[200]	validation_0-auc:0.71832
[237]	validation_0-auc:0.71874
[0]	validation_0-auc:0.58627
[100]	validation_0-auc:0.70193
[182]	validation_0-auc:0.70370
[0]	validation_0-auc:0.61213
[100]	validation_0-auc:0.71219
[113]	validation_0-auc:0.71367


[I 2023-10-12 15:13:18,959] Trial 11 finished with value: -0.44405615966844286 and parameters: {'learning_rate': 0.033909085338249066, 'n_estimators': 1479, 'max_depth': 11, 'subsample': 0.5806373823749877, 'colsample_bytree': 0.9003938457254936, 'gamma': 0.2872697426319036}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.61353
[100]	validation_0-auc:0.66874
[200]	validation_0-auc:0.67544
[300]	validation_0-auc:0.67974
[400]	validation_0-auc:0.68541
[500]	validation_0-auc:0.68737
[600]	validation_0-auc:0.68983
[684]	validation_0-auc:0.69016
[0]	validation_0-auc:0.59174
[100]	validation_0-auc:0.71031
[200]	validation_0-auc:0.71663
[300]	validation_0-auc:0.71956
[400]	validation_0-auc:0.72260
[500]	validation_0-auc:0.72186
[600]	validation_0-auc:0.72265
[632]	validation_0-auc:0.72165
[0]	validation_0-auc:0.55066
[100]	validation_0-auc:0.71673
[200]	validation_0-auc:0.72390
[300]	validation_0-auc:0.72524
[328]	validation_0-auc:0.72581
[0]	validation_0-auc:0.61498
[100]	validation_0-auc:0.70129
[147]	validation_0-auc:0.70149
[0]	validation_0-auc:0.59617
[100]	validation_0-auc:0.71552
[200]	validation_0-auc:0.72121
[300]	validation_0-auc:0.72459
[400]	validation_0-auc:0.72652
[476]	validation_0-auc:0.72602


[I 2023-10-12 15:14:16,950] Trial 12 finished with value: -0.45012715341421244 and parameters: {'learning_rate': 0.03533979913746623, 'n_estimators': 840, 'max_depth': 11, 'subsample': 0.6349316960174984, 'colsample_bytree': 0.9986372372555316, 'gamma': 0.26666057700709667}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.62989
[100]	validation_0-auc:0.67150
[200]	validation_0-auc:0.67286
[221]	validation_0-auc:0.67612
[0]	validation_0-auc:0.61584
[100]	validation_0-auc:0.70420
[200]	validation_0-auc:0.70771
[221]	validation_0-auc:0.70801
[0]	validation_0-auc:0.58447
[100]	validation_0-auc:0.71300
[200]	validation_0-auc:0.71983
[212]	validation_0-auc:0.71978
[0]	validation_0-auc:0.59341
[100]	validation_0-auc:0.68728
[175]	validation_0-auc:0.68720
[0]	validation_0-auc:0.60254
[100]	validation_0-auc:0.71117
[200]	validation_0-auc:0.71920
[300]	validation_0-auc:0.72253
[400]	validation_0-auc:0.72739
[430]	validation_0-auc:0.72633


[I 2023-10-12 15:14:43,053] Trial 13 finished with value: -0.4411984602048372 and parameters: {'learning_rate': 0.05522339800905324, 'n_estimators': 842, 'max_depth': 9, 'subsample': 0.6632022463834928, 'colsample_bytree': 0.8667282135279746, 'gamma': 0.35795100595102103}. Best is trial 3 with value: -0.45205954235751855.


[0]	validation_0-auc:0.59084
[100]	validation_0-auc:0.69316
[200]	validation_0-auc:0.69548
[210]	validation_0-auc:0.69680
[0]	validation_0-auc:0.59659
[100]	validation_0-auc:0.71101
[200]	validation_0-auc:0.72052
[300]	validation_0-auc:0.72481
[327]	validation_0-auc:0.72421
[0]	validation_0-auc:0.59958
[100]	validation_0-auc:0.72653
[200]	validation_0-auc:0.74013
[300]	validation_0-auc:0.74347
[352]	validation_0-auc:0.74285
[0]	validation_0-auc:0.60363
[100]	validation_0-auc:0.69895
[200]	validation_0-auc:0.70622
[300]	validation_0-auc:0.71206
[400]	validation_0-auc:0.71505
[472]	validation_0-auc:0.71411
[0]	validation_0-auc:0.58543
[100]	validation_0-auc:0.71817
[200]	validation_0-auc:0.72358
[262]	validation_0-auc:0.72391


[I 2023-10-12 15:15:15,691] Trial 14 finished with value: -0.4573087610687489 and parameters: {'learning_rate': 0.09101000886909859, 'n_estimators': 1199, 'max_depth': 11, 'subsample': 0.7635921298553625, 'colsample_bytree': 0.6682734311264994, 'gamma': 0.2921663159248171}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.59399
[100]	validation_0-auc:0.66291
[105]	validation_0-auc:0.66393
[0]	validation_0-auc:0.60758
[100]	validation_0-auc:0.70823
[200]	validation_0-auc:0.71516
[300]	validation_0-auc:0.71895
[380]	validation_0-auc:0.71937
[0]	validation_0-auc:0.61043
[100]	validation_0-auc:0.70894
[200]	validation_0-auc:0.71334
[288]	validation_0-auc:0.71366
[0]	validation_0-auc:0.60161
[100]	validation_0-auc:0.69382
[107]	validation_0-auc:0.69487
[0]	validation_0-auc:0.60021
[100]	validation_0-auc:0.70845
[174]	validation_0-auc:0.70405


[I 2023-10-12 15:15:33,958] Trial 15 finished with value: -0.4357100896497027 and parameters: {'learning_rate': 0.09535189908297713, 'n_estimators': 1231, 'max_depth': 9, 'subsample': 0.7739499882900053, 'colsample_bytree': 0.6450598365235344, 'gamma': 0.4917728426373206}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.60771
[100]	validation_0-auc:0.67638
[200]	validation_0-auc:0.68084
[203]	validation_0-auc:0.68096
[0]	validation_0-auc:0.60694
[100]	validation_0-auc:0.70399
[147]	validation_0-auc:0.70763
[0]	validation_0-auc:0.62428
[100]	validation_0-auc:0.71300
[200]	validation_0-auc:0.71514
[255]	validation_0-auc:0.71380
[0]	validation_0-auc:0.58334
[100]	validation_0-auc:0.69185
[147]	validation_0-auc:0.69497
[0]	validation_0-auc:0.62547
[100]	validation_0-auc:0.71485
[200]	validation_0-auc:0.71218
[231]	validation_0-auc:0.71354


[I 2023-10-12 15:15:50,018] Trial 16 finished with value: -0.44459870469736884 and parameters: {'learning_rate': 0.08075444918750248, 'n_estimators': 1202, 'max_depth': 8, 'subsample': 0.87694590860171, 'colsample_bytree': 0.6721656029951536, 'gamma': 0.38776767876378615}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.57636
[100]	validation_0-auc:0.66277
[200]	validation_0-auc:0.66381
[300]	validation_0-auc:0.67232
[400]	validation_0-auc:0.67435
[417]	validation_0-auc:0.67407
[0]	validation_0-auc:0.58762
[100]	validation_0-auc:0.69392
[200]	validation_0-auc:0.70073
[300]	validation_0-auc:0.70634
[400]	validation_0-auc:0.70940
[500]	validation_0-auc:0.70904
[600]	validation_0-auc:0.71036
[640]	validation_0-auc:0.70967
[0]	validation_0-auc:0.60643
[100]	validation_0-auc:0.69657
[200]	validation_0-auc:0.71471
[300]	validation_0-auc:0.71761
[344]	validation_0-auc:0.71668
[0]	validation_0-auc:0.57593
[100]	validation_0-auc:0.69637
[200]	validation_0-auc:0.70331
[300]	validation_0-auc:0.70472
[400]	validation_0-auc:0.70584
[479]	validation_0-auc:0.70578
[0]	validation_0-auc:0.60055
[100]	validation_0-auc:0.69585
[200]	validation_0-auc:0.70846
[300]	validation_0-auc:0.71278
[320]	validation_0-auc:0.71309


[I 2023-10-12 15:16:21,766] Trial 17 finished with value: -0.4479509798691093 and parameters: {'learning_rate': 0.10205225340268301, 'n_estimators': 1405, 'max_depth': 9, 'subsample': 0.7137626766043687, 'colsample_bytree': 0.5935387617977625, 'gamma': 0.3122939803884027}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.60876
[100]	validation_0-auc:0.66811
[109]	validation_0-auc:0.66575
[0]	validation_0-auc:0.61233
[100]	validation_0-auc:0.69651
[200]	validation_0-auc:0.69748
[260]	validation_0-auc:0.69626
[0]	validation_0-auc:0.60545
[100]	validation_0-auc:0.69778
[200]	validation_0-auc:0.69932
[220]	validation_0-auc:0.69926
[0]	validation_0-auc:0.61110
[100]	validation_0-auc:0.69822
[123]	validation_0-auc:0.69734
[0]	validation_0-auc:0.62295
[100]	validation_0-auc:0.69714
[116]	validation_0-auc:0.69627


[I 2023-10-12 15:16:29,014] Trial 18 finished with value: -0.42752421583843175 and parameters: {'learning_rate': 0.12656482512864414, 'n_estimators': 1773, 'max_depth': 3, 'subsample': 0.7910599107392343, 'colsample_bytree': 0.766837565768138, 'gamma': 0.4100189135525934}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.60204
[100]	validation_0-auc:0.65921
[200]	validation_0-auc:0.66312
[290]	validation_0-auc:0.66358
[0]	validation_0-auc:0.62585
[100]	validation_0-auc:0.69313
[200]	validation_0-auc:0.69998
[300]	validation_0-auc:0.70496
[400]	validation_0-auc:0.70931
[500]	validation_0-auc:0.71004
[520]	validation_0-auc:0.71030
[0]	validation_0-auc:0.60267
[100]	validation_0-auc:0.69848
[200]	validation_0-auc:0.70180
[247]	validation_0-auc:0.70428
[0]	validation_0-auc:0.60566
[100]	validation_0-auc:0.68517
[188]	validation_0-auc:0.68636
[0]	validation_0-auc:0.60085
[100]	validation_0-auc:0.68541
[119]	validation_0-auc:0.68542


[I 2023-10-12 15:16:48,312] Trial 19 finished with value: -0.42612537757255237 and parameters: {'learning_rate': 0.1092082632346097, 'n_estimators': 1036, 'max_depth': 7, 'subsample': 0.7297135586105575, 'colsample_bytree': 0.6833049641209487, 'gamma': 0.32362402375869}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.57372
[100]	validation_0-auc:0.68467
[200]	validation_0-auc:0.69464
[300]	validation_0-auc:0.69519
[317]	validation_0-auc:0.69587
[0]	validation_0-auc:0.62768
[100]	validation_0-auc:0.71055
[200]	validation_0-auc:0.71979
[282]	validation_0-auc:0.72071
[0]	validation_0-auc:0.58859
[100]	validation_0-auc:0.72204
[200]	validation_0-auc:0.72913
[215]	validation_0-auc:0.72886
[0]	validation_0-auc:0.58843
[100]	validation_0-auc:0.70564
[200]	validation_0-auc:0.71385
[300]	validation_0-auc:0.71929
[382]	validation_0-auc:0.71999
[0]	validation_0-auc:0.61959
[100]	validation_0-auc:0.72014
[200]	validation_0-auc:0.72264
[226]	validation_0-auc:0.72249


[I 2023-10-12 15:17:16,895] Trial 20 finished with value: -0.45291449804691475 and parameters: {'learning_rate': 0.08709537968789743, 'n_estimators': 1353, 'max_depth': 11, 'subsample': 0.8357592396781854, 'colsample_bytree': 0.6248810789231233, 'gamma': 0.37102265787001354}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.58574
[100]	validation_0-auc:0.68570
[200]	validation_0-auc:0.69519
[300]	validation_0-auc:0.69738
[400]	validation_0-auc:0.69760
[473]	validation_0-auc:0.69688
[0]	validation_0-auc:0.60533
[100]	validation_0-auc:0.70893
[200]	validation_0-auc:0.71672
[300]	validation_0-auc:0.71851
[359]	validation_0-auc:0.71787
[0]	validation_0-auc:0.58676
[100]	validation_0-auc:0.73340
[200]	validation_0-auc:0.73678
[270]	validation_0-auc:0.73578
[0]	validation_0-auc:0.59388
[100]	validation_0-auc:0.70631
[200]	validation_0-auc:0.71197
[300]	validation_0-auc:0.71495
[400]	validation_0-auc:0.71806
[458]	validation_0-auc:0.71721
[0]	validation_0-auc:0.60696
[100]	validation_0-auc:0.71792
[200]	validation_0-auc:0.72220
[300]	validation_0-auc:0.72412
[318]	validation_0-auc:0.72361


[I 2023-10-12 15:17:51,030] Trial 21 finished with value: -0.4527731500427626 and parameters: {'learning_rate': 0.08768629097462159, 'n_estimators': 1372, 'max_depth': 11, 'subsample': 0.8266905203899256, 'colsample_bytree': 0.6256993195530289, 'gamma': 0.36937087027286936}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.56913
[100]	validation_0-auc:0.68576
[200]	validation_0-auc:0.69524
[300]	validation_0-auc:0.69641
[303]	validation_0-auc:0.69640
[0]	validation_0-auc:0.62108
[100]	validation_0-auc:0.72092
[200]	validation_0-auc:0.72486
[300]	validation_0-auc:0.72864
[400]	validation_0-auc:0.72919
[414]	validation_0-auc:0.72861
[0]	validation_0-auc:0.61315
[100]	validation_0-auc:0.73051
[200]	validation_0-auc:0.73232
[238]	validation_0-auc:0.73387
[0]	validation_0-auc:0.58145
[100]	validation_0-auc:0.69941
[200]	validation_0-auc:0.70521
[300]	validation_0-auc:0.71024
[400]	validation_0-auc:0.71286
[500]	validation_0-auc:0.71343
[549]	validation_0-auc:0.71343
[0]	validation_0-auc:0.62679
[100]	validation_0-auc:0.71021
[200]	validation_0-auc:0.71475
[300]	validation_0-auc:0.71866
[400]	validation_0-auc:0.71967
[494]	validation_0-auc:0.71913


[I 2023-10-12 15:18:27,957] Trial 22 finished with value: -0.45352464336076287 and parameters: {'learning_rate': 0.08323160937111634, 'n_estimators': 1633, 'max_depth': 11, 'subsample': 0.8666830480483697, 'colsample_bytree': 0.6050598999371163, 'gamma': 0.3602074227145422}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.58700
[100]	validation_0-auc:0.68209
[200]	validation_0-auc:0.68862
[278]	validation_0-auc:0.68855
[0]	validation_0-auc:0.61913
[100]	validation_0-auc:0.71307
[200]	validation_0-auc:0.71927
[300]	validation_0-auc:0.72224
[371]	validation_0-auc:0.72151
[0]	validation_0-auc:0.64751
[100]	validation_0-auc:0.73500
[200]	validation_0-auc:0.73552
[221]	validation_0-auc:0.73538
[0]	validation_0-auc:0.58973
[100]	validation_0-auc:0.69868
[200]	validation_0-auc:0.70718
[300]	validation_0-auc:0.71127
[382]	validation_0-auc:0.71171
[0]	validation_0-auc:0.59871
[100]	validation_0-auc:0.71532
[200]	validation_0-auc:0.72170
[300]	validation_0-auc:0.72593
[400]	validation_0-auc:0.72680
[500]	validation_0-auc:0.72667
[561]	validation_0-auc:0.72561


[I 2023-10-12 15:19:05,315] Trial 23 finished with value: -0.45047012711586953 and parameters: {'learning_rate': 0.07081031418927757, 'n_estimators': 1611, 'max_depth': 11, 'subsample': 0.9103061578166558, 'colsample_bytree': 0.5626692450624043, 'gamma': 0.2976032642899215}. Best is trial 14 with value: -0.4573087610687489.


[0]	validation_0-auc:0.58551
[100]	validation_0-auc:0.67793
[200]	validation_0-auc:0.68961
[300]	validation_0-auc:0.69124
[332]	validation_0-auc:0.69084
[0]	validation_0-auc:0.62598
[100]	validation_0-auc:0.71826
[200]	validation_0-auc:0.72327
[292]	validation_0-auc:0.72388
[0]	validation_0-auc:0.59298
[100]	validation_0-auc:0.71057
[200]	validation_0-auc:0.72108
[300]	validation_0-auc:0.72435
[400]	validation_0-auc:0.72451
[458]	validation_0-auc:0.72354
[0]	validation_0-auc:0.60482
[100]	validation_0-auc:0.70017
[200]	validation_0-auc:0.71157
[300]	validation_0-auc:0.71616
[400]	validation_0-auc:0.71835
[500]	validation_0-auc:0.72059
[552]	validation_0-auc:0.72032
[0]	validation_0-auc:0.61720
[100]	validation_0-auc:0.71173
[200]	validation_0-auc:0.71508
[300]	validation_0-auc:0.71737
[354]	validation_0-auc:0.71631


[I 2023-10-12 15:19:37,327] Trial 24 finished with value: -0.4512647395849144 and parameters: {'learning_rate': 0.0885341418812912, 'n_estimators': 1046, 'max_depth': 10, 'subsample': 0.8517441031732926, 'colsample_bytree': 0.5700818516850485, 'gamma': 0.255117560957859}. Best is trial 14 with value: -0.4573087610687489.


Best parameters: {'learning_rate': 0.09101000886909859, 'n_estimators': 1199, 'max_depth': 11, 'subsample': 0.7635921298553625, 'colsample_bytree': 0.6682734311264994, 'gamma': 0.2921663159248171}
Best F1 score: 0.4573087610687489


In [21]:
# Обучение с лучшими гиперпараметрами на всем тренировочном наборе данных
best_model = XGBClassifier(**best_params, eval_metric='auc')
best_model.fit(X_train, y_train)

# Предсказания на тестовой выборке
y_prob_test = best_model.predict_proba(X_test)[:, 1]

# Определение оптимального порога
thresholds = np.linspace(0, 1, 300)
f1_scores_test = [f1_score(y_test, y_prob_test > thresh) for thresh in thresholds]
optimal_threshold_test = thresholds[np.argmax(f1_scores_test)]

# Применение оптимального порога для классификации
y_pred_test = [1 if prob >= optimal_threshold_test else 0 for prob in y_prob_test]

# Оценка качества модели с новым порогом на тестовой выборке
print("ROC AUC:", roc_auc_score(y_test, y_prob_test))
print("Average Precision:", average_precision_score(y_test, y_prob_test))
print("\nOptimal Threshold:", optimal_threshold_test)
print("\nClassification Report with Optimal Threshold:\n", classification_report(y_test, y_pred_test))

ROC AUC: 0.7358805112646052
Average Precision: 0.4476910804488899

Optimal Threshold: 0.06354515050167224

Classification Report with Optimal Threshold:
               precision    recall  f1-score   support

           0       0.89      0.71      0.79      2213
           1       0.36      0.65      0.47       560

    accuracy                           0.70      2773
   macro avg       0.63      0.68      0.63      2773
weighted avg       0.78      0.70      0.73      2773



In [22]:
import pickle

# Сохранение модели в файл
with open("best_xgb_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

    
# Загрузка модели из файла
# with open("best_xgb_model.pkl", "rb") as f:
#     loaded_model = pickle.load(f)
    

In [23]:
# Сохранение гиперпараметров в файл
with open("best_params.pkl", "wb") as f:
    pickle.dump(best_params, f)
    
# Загрузка гиперпараметров из файла
# with open("best_params.pkl", "rb") as f:
#     loaded_params = pickle.load(f)


In [None]:
# Использование сохраненной модели
import pickle

# Загрузка модели из файла
with open("best_xgb_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

# Предсказания с помощью загруженной модели
predictions = loaded_model.predict(new_data)  # где new_data - новые данные, на которых вы хотите сделать предсказания

#  Использование списка лучших гиперпараметров

import pickle
from xgboost import XGBClassifier

# Загрузка гиперпараметров из файла
with open("best_params.pkl", "rb") as f:
    loaded_params = pickle.load(f)

# Создание и обучение модели с загруженными гиперпараметрами
model_with_loaded_params = XGBClassifier(**loaded_params)
model_with_loaded_params.fit(X_train, y_train)

# Предсказания с помощью обученной модели
predictions = model_with_loaded_params.predict(new_data)
