In [1]:
import pandas as pd
import numpy as np

# 12. Модификации бустинга и ускорение обучения

## 12.1 XGBoost

## 12.2 CatBoost

## Загрузка данных

In [2]:
from catboost.datasets import titanic
import numpy as np

train_df, test_df = titanic()

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Обработка данных

In [3]:
null_value_stats = train_df.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

Заполним пропуски в данных некоторым уникальным значением (есть и другие техники, но здесь для простоты используем эту).

In [4]:
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

Разбиваем данные на матрицу объект-признак и вектор с целевой переменной.

In [5]:
X = train_df.drop('Survived', axis=1)
y = train_df.Survived

Посмотрим на типы признаков.

In [6]:
print(X.dtypes)

categorical_features_indices = np.where(X.dtypes != float)[0]

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [7]:
categorical_features_indices

array([ 0,  1,  2,  3,  5,  6,  7,  9, 10], dtype=int64)

Разобъем данные на трейн и валидацию.

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)

X_test = test_df

## Обучение модели

In [9]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [10]:
model = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

В google colab нет возможности отрисовывать динамические графики (насколько мы знаем), поэтому для отрисовки графиков запускайте ноутбук на локальной машине.

In [11]:
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',
   plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.028683
0:	learn: 0.6739988	test: 0.6742630	best: 0.6742630 (0)	total: 165ms	remaining: 2m 45s
1:	learn: 0.6589013	test: 0.6592240	best: 0.6592240 (1)	total: 180ms	remaining: 1m 29s
2:	learn: 0.6421502	test: 0.6426778	best: 0.6426778 (2)	total: 208ms	remaining: 1m 9s
3:	learn: 0.6297276	test: 0.6302310	best: 0.6302310 (3)	total: 235ms	remaining: 58.6s
4:	learn: 0.6147184	test: 0.6198228	best: 0.6198228 (4)	total: 274ms	remaining: 54.6s
5:	learn: 0.6017730	test: 0.6073627	best: 0.6073627 (5)	total: 297ms	remaining: 49.3s
6:	learn: 0.5885309	test: 0.5956000	best: 0.5956000 (6)	total: 319ms	remaining: 45.3s
7:	learn: 0.5783200	test: 0.5858523	best: 0.5858523 (7)	total: 344ms	remaining: 42.6s
8:	learn: 0.5665895	test: 0.5743842	best: 0.5743842 (8)	total: 369ms	remaining: 40.6s
9:	learn: 0.5575381	test: 0.5662283	best: 0.5662283 (9)	total: 392ms	remaining: 38.8s
10:	learn: 0.5491045	test: 0.5575176	best: 0.5575176 (10)	total: 421ms	remaining: 37.8s
11:	learn: 0.5423887

<catboost.core.CatBoostClassifier at 0x1d071144a10>

## Кросс-валидация

In [12]:
cv_params = model.get_params()
cv_params.update({
    'loss_function': metrics.Logloss()
})
cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    logging_level='Silent',
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Посмотрим на среднее качество и его разброс по кросс-валидации

In [13]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.83±0.02 on step 355


## Применяем обученную модель

In [14]:
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(predictions_probs[:10])

[0 0 0 0 1 0 1 0 1 0]
[[0.85473931 0.14526069]
 [0.76313031 0.23686969]
 [0.88972889 0.11027111]
 [0.87876173 0.12123827]
 [0.3611047  0.6388953 ]
 [0.90513381 0.09486619]
 [0.33434185 0.66565815]
 [0.78468564 0.21531436]
 [0.39429048 0.60570952]
 [0.94047549 0.05952451]]


## Улучшение предсказаний и другие возможности CatBoost

### Early Stopping

In [15]:
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    early_stopping_rounds = 30,
    logging_level='Verbose',  # you can uncomment this for text output
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.028683
0:	learn: 0.6739988	test: 0.6742630	best: 0.6742630 (0)	total: 19ms	remaining: 19s
1:	learn: 0.6589013	test: 0.6592240	best: 0.6592240 (1)	total: 28.8ms	remaining: 14.4s
2:	learn: 0.6421502	test: 0.6426778	best: 0.6426778 (2)	total: 47.3ms	remaining: 15.7s
3:	learn: 0.6297276	test: 0.6302310	best: 0.6302310 (3)	total: 63.1ms	remaining: 15.7s
4:	learn: 0.6147184	test: 0.6198228	best: 0.6198228 (4)	total: 82ms	remaining: 16.3s
5:	learn: 0.6017730	test: 0.6073627	best: 0.6073627 (5)	total: 102ms	remaining: 16.8s
6:	learn: 0.5885309	test: 0.5956000	best: 0.5956000 (6)	total: 123ms	remaining: 17.4s
7:	learn: 0.5783200	test: 0.5858523	best: 0.5858523 (7)	total: 149ms	remaining: 18.4s
8:	learn: 0.5665895	test: 0.5743842	best: 0.5743842 (8)	total: 169ms	remaining: 18.6s
9:	learn: 0.5575381	test: 0.5662283	best: 0.5662283 (9)	total: 190ms	remaining: 18.8s
10:	learn: 0.5491045	test: 0.5575176	best: 0.5575176 (10)	total: 210ms	remaining: 18.9s
11:	learn: 0.5423887	te

<catboost.core.CatBoostClassifier at 0x1d071144a10>

In [16]:
model.tree_count_

284

Получили непереобученную модель, причем не пришлось ждать 1000 итераций!

## Важность признаков

CatBoost поддерживает несколько способов вычисления важности признаков, в том числе широко применяемый сейчас подход Shap (про него поговорим в следующих модулях).

In [17]:
feature_importances = model.get_feature_importance()

feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Sex: 28.377591527551807
Pclass: 17.450379813673287
Parch: 10.276200044515498
Embarked: 8.761954037905873
Cabin: 8.281577549519366
SibSp: 7.950157281933982
Age: 7.842375602284015
Ticket: 5.620556803330714
Fare: 5.439207339285509
PassengerId: 0.0
Name: 0.0


## Сохранение модели

In [18]:
# сохраняем модель
model.save_model('catboost_model.dump')

# загружаем сохраненную модель
model.load_model('catboost_model.dump');

## 12.3 LightGBM

## Загрузка данных и импорт библиотек

In [19]:
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import r2_score

In [20]:
RANDOM_STATE = 42

In [21]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing(as_frame=True)

X = data.data
y = data.target

In [22]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


## Сравнение моделей с гиперпараметрами по умолчанию

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

gbm = GradientBoostingRegressor()

cross_val_score(gbm, X, y, cv=3, scoring='r2').mean()

0.6800604566123686

In [24]:
from xgboost import XGBRegressor

xgb = XGBRegressor()

# cross_val_score(xgb, X, y, cv=3, scoring='r2').mean()
print(0.66)

0.66


In [25]:
from catboost import CatBoostRegressor

cb = CatBoostRegressor(verbose=0)

cross_val_score(cb, X, y, cv=3, scoring='r2').mean()

0.7142210654701769

In [26]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor()

cross_val_score(lgbm, X, y, cv=3, scoring='r2').mean()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13760, number of used features: 8
[LightGBM] [Info] Start training from score 2.117384
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13760, number of used features: 8
[LightGBM] [Info] Start training from score 2.079973
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 13760, number of used features: 8
[LightGBM] [Info] Start traini

0.7016238052098068

## Подбор гиперпараметров

Разобъем данные на тренировочную и тестовую часть. На тренировочной части по кросс-валидации подберем гиперпараметры моделей, а затем проверим качество на тестовой части.

In [27]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

params = {'max_depth' : [2, 5, 8, 11]}

In [28]:
# gs_xgb = GridSearchCV(xgb, params, cv=3, scoring='r2', verbose=2)

# gs_xgb.fit(Xtrain, ytrain)

In [29]:
# pred_xgb = gs_xgb.best_estimator_.predict(Xtest)

# r2_score(ytest, pred_xgb)

In [30]:
gs_cb = GridSearchCV(cb, params, cv=3, scoring='r2', verbose=2)

gs_cb.fit(X, y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ........................................max_depth=2; total time=   1.0s
[CV] END ........................................max_depth=2; total time=   1.1s
[CV] END ........................................max_depth=2; total time=   1.0s
[CV] END ........................................max_depth=5; total time=   1.9s
[CV] END ........................................max_depth=5; total time=   2.1s
[CV] END ........................................max_depth=5; total time=   1.9s
[CV] END ........................................max_depth=8; total time=   6.6s
[CV] END ........................................max_depth=8; total time=   5.8s
[CV] END ........................................max_depth=8; total time=   6.0s
[CV] END .......................................max_depth=11; total time=  47.6s
[CV] END .......................................max_depth=11; total time=  44.2s
[CV] END .......................................m

In [31]:
pred_cb = gs_cb.best_estimator_.predict(Xtest)

r2_score(ytest, pred_cb)

0.8911533719179447

In [32]:
gs_lgbm = GridSearchCV(lgbm, params, cv=3, scoring='r2', verbose=2)

gs_lgbm.fit(X, y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13760, number of used features: 8
[LightGBM] [Info] Start training from score 2.117384
[CV] END ........................................max_depth=2; total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13760, number of used features: 8
[LightGBM] [Info] Start training from score 2.079973
[CV] END ........................................max_depth=2; total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 s

In [33]:
pred_lgbm = gs_lgbm.best_estimator_.predict(Xtest)

r2_score(ytest, pred_lgbm)

0.876891981387784

Мы видим, что даже на маленьком датасете и при подборе одного гиперпараметра приходится подождать результатов. А если датасет больше? И гиперпараметров много, и их для достижения оптимального результата нужно подбирать одновременно!

При этом подбор гиперпараметров сильно улучшает качество моделей!

Что же делать, чтобы не ждать вечность, пока ищутся гиперпараметры? Узнаете в следующем уроке :)

## 12.4 Optuna

1. Определяем целевую функцию objective, через аргументы она будет получать специальный объект trial. С его помощью можно назначать различные гипермараметры, Например, как в примере ниже, мы задаем x в интервале [-10,10].

2. Далее создаем объект обучения с помощью метода optuna.create_study.

3. Запускаем оптимизацию целевой функции objective на 10 итераций n_trials=10. Происходит 10 вызовов нашей функции с различными параметрами от -10 до 10. Какие именно параметры выбирает optuna будет описано ниже.

In [34]:
import optuna

def objective(trial):
    x = trial.suggest_float('x', -10, 10)
    return (x - 2) ** 2

study = optuna.create_study()
study.optimize(objective, n_trials=40)

study.best_params

[I 2025-05-01 19:14:13,135] A new study created in memory with name: no-name-2405bcf8-e6e7-49f4-9c2a-90c75441e91f
[I 2025-05-01 19:14:13,137] Trial 0 finished with value: 8.311795157211652 and parameters: {'x': 4.8830184108346675}. Best is trial 0 with value: 8.311795157211652.
[I 2025-05-01 19:14:13,139] Trial 1 finished with value: 0.33747312996763706 and parameters: {'x': 2.5809243754290545}. Best is trial 1 with value: 0.33747312996763706.
[I 2025-05-01 19:14:13,140] Trial 2 finished with value: 117.25306946759628 and parameters: {'x': -8.828345647770774}. Best is trial 1 with value: 0.33747312996763706.
[I 2025-05-01 19:14:13,141] Trial 3 finished with value: 68.77891111771666 and parameters: {'x': -6.293305198635624}. Best is trial 1 with value: 0.33747312996763706.
[I 2025-05-01 19:14:13,144] Trial 4 finished with value: 1.8678719713725125 and parameters: {'x': 3.366701127303447}. Best is trial 1 with value: 0.33747312996763706.
[I 2025-05-01 19:14:13,145] Trial 5 finished with 

{'x': 1.9819445523007144}

## Загрузка данных и импорт библиотек

In [35]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score

from sklearn.datasets import fetch_california_housing

In [36]:
RANDOM_STATE = 42

In [37]:
from lightgbm import LGBMRegressor

In [38]:
data = fetch_california_housing(as_frame=True)

X = data.data
y = data.target

In [39]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

## Подбор гиперпараметров с Optuna

Разобъем данные на тренировочную и тестовую часть. На тренировочной части по кросс-валидации подберем гиперпараметры моделей, а затем проверим качество на тестовой части.

In [40]:
def objective_lgbm(trial):
    max_depth = trial.suggest_int("max_depth", 2, 20)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)

    score = cross_val_score(LGBMRegressor(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators),
                            Xtrain, ytrain, cv=3, scoring='r2', n_jobs=-1).mean()
    return score


study = optuna.create_study(direction="maximize")
study.optimize(objective_lgbm, n_trials=30)

[I 2025-05-01 19:14:13,355] A new study created in memory with name: no-name-05caeac1-b1a0-4de5-82b1-09801e2be928
[I 2025-05-01 19:14:16,857] Trial 0 finished with value: 0.8236256053105451 and parameters: {'max_depth': 3, 'learning_rate': 0.04423964206785456, 'n_estimators': 885}. Best is trial 0 with value: 0.8236256053105451.
[I 2025-05-01 19:14:19,871] Trial 1 finished with value: 0.1793735514756751 and parameters: {'max_depth': 17, 'learning_rate': 0.0005274736102177568, 'n_estimators': 305}. Best is trial 0 with value: 0.8236256053105451.
[I 2025-05-01 19:14:20,347] Trial 2 finished with value: 0.14776629550220766 and parameters: {'max_depth': 2, 'learning_rate': 0.00027533286774492955, 'n_estimators': 724}. Best is trial 0 with value: 0.8236256053105451.
[I 2025-05-01 19:14:21,293] Trial 3 finished with value: 0.6748428646050376 and parameters: {'max_depth': 7, 'learning_rate': 0.0035063571983229265, 'n_estimators': 388}. Best is trial 0 with value: 0.8236256053105451.
[I 2025-0

In [41]:
study.best_params

{'max_depth': 9, 'learning_rate': 0.04678339148080535, 'n_estimators': 936}

In [42]:
model = LGBMRegressor(**study.best_params)
model.fit(Xtrain, ytrain)

pred = model.predict(Xtest)

r2_score(ytest, pred)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 15480, number of used features: 8
[LightGBM] [Info] Start training from score 2.070349


0.8561267871618996

# Прогнозируем задержки самолетов

In [43]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
import pandas as pd

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [44]:
RANDOM_STATE = 111
DATASET_PATH = 'data/flight_delays_train2.csv'

In [45]:
data = pd.read_csv(DATASET_PATH)

X = data.drop('dep_delayed_15min', axis=1)
y = data['dep_delayed_15min'] == 'Y'

X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732
1,c-4,c-20,c-3,1548,US,PIT,MCO,834
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423


Создайте список номеров колонок с категориальными признаками для бустингов

## Quiz
Какой длины получился список?

(подсказка: колонка `DepTime` числовая)

In [46]:
print(X.dtypes)
categorical_features_indices = np.where(X.dtypes == 'object')[0]
len(categorical_features_indices)

Month            object
DayofMonth       object
DayOfWeek        object
DepTime           int64
UniqueCarrier    object
Origin           object
Dest             object
Distance          int64
dtype: object


6

Разобъем данные на обучение и контроль

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

In [48]:
X_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
41207,c-4,c-18,c-1,1457,CO,EWR,TPA,998
28283,c-11,c-1,c-2,1225,UA,DEN,BOS,1754
34619,c-6,c-16,c-5,1650,YV,IAD,CAE,401
8789,c-5,c-18,c-4,923,AA,SLC,DFW,988
38315,c-2,c-14,c-2,1839,AA,STL,SAN,1558


## Модели с параметрами по умолчанию

Обучите CatBoost с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [49]:
data.dtypes

Month                object
DayofMonth           object
DayOfWeek            object
DepTime               int64
UniqueCarrier        object
Origin               object
Dest                 object
Distance              int64
dep_delayed_15min    object
dtype: object

In [50]:
# your code here

cb_clf = CatBoostClassifier(random_state=RANDOM_STATE)
cb_clf.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_test, y_test),
    logging_level='Silent',
    plot=True
)

y_pred = cb_clf.predict_proba(Xtest)[:, 1]
roc_auc_score(y_test, y_pred)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=0]=1.6812 : cat_features must be integer or string, real number values and NaN values should be converted to string.

Обучите LightGBM с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [None]:
for c in X.columns:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X_train[c] = X_train[c].astype('category')
        X_test[c] = X_test[c].astype('category')

In [None]:
# your code here

lgbm = LGBMClassifier(random_state=RANDOM_STATE)

lgbm.fit(X_train, y_train)
y_pred = lgbm.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_pred)

[LightGBM] [Info] Number of positive: 14346, number of negative: 60654
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191280 -> initscore=-1.441714
[LightGBM] [Info] Start training from score -1.441714


0.7341149074685321

## Optuna

Выделим дополнительную валидационную выборку.

In [None]:
Xtrain_new, Xval, ytrain_new, yval = train_test_split(Xtrain, ytrain, test_size=0.25, random_state=RANDOM_STATE)

Создайте функцию objective_lgbm, в которой среди гиперпараметров

* num_leaves = trial.suggest_int("num_leaves", 10, 100)
* n_estimators = trial.suggest_int("n_estimators", 10, 1000)

подберите оптимальные, обучая LGBM на Xtrain_new, ytrain_new и проверяя качество (ROC-AUC) на Xval.

Используйте 30 эпох обучения Optuna.


In [None]:
# your code here

import optuna

def objective_lgbm(trial):
    num_leaves = trial.suggest_int("num_leaves", 10, 100)
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)
    
    lgbm = LGBMClassifier(num_leaves=num_leaves, n_estimators=n_estimators, random_state=RANDOM_STATE)
    lgbm.fit(Xtrain_new, ytrain_new)

    y_pred = lgbm.predict_proba(Xval)[:, 1]
    roc_auc = roc_auc_score(yval, y_pred)
    return roc_auc

# storage = optuna.storages.InMemoryStorage()
study = optuna.create_study(direction='maximize')

# Используйте 30 эпох обучения Optuna.
study.optimize(objective_lgbm, n_trials=30)
study.best_params

[I 2025-05-01 19:05:24,183] A new study created in memory with name: no-name-22837354-6ad2-492f-80a1-2e5cfe4f7eb3


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:26,409] Trial 0 finished with value: 0.7154861262982839 and parameters: {'num_leaves': 48, 'n_estimators': 859}. Best is trial 0 with value: 0.7154861262982839.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:28,158] Trial 1 finished with value: 0.7185645987292283 and parameters: {'num_leaves': 84, 'n_estimators': 318}. Best is trial 1 with value: 0.7185645987292283.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:29,838] Trial 2 finished with value: 0.7188378399279125 and parameters: {'num_leaves': 57, 'n_estimators': 401}. Best is trial 2 with value: 0.7188378399279125.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:32,406] Trial 3 finished with value: 0.7132068565066527 and parameters: {'num_leaves': 45, 'n_estimators': 918}. Best is trial 2 with value: 0.7188378399279125.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:34,219] Trial 4 finished with value: 0.717548528133921 and parameters: {'num_leaves': 35, 'n_estimators': 937}. Best is trial 2 with value: 0.7188378399279125.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:35,854] Trial 5 finished with value: 0.7168407469964483 and parameters: {'num_leaves': 64, 'n_estimators': 503}. Best is trial 2 with value: 0.7188378399279125.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:37,253] Trial 6 finished with value: 0.7183328380040954 and parameters: {'num_leaves': 43, 'n_estimators': 685}. Best is trial 2 with value: 0.7188378399279125.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:38,048] Trial 7 finished with value: 0.721887787680789 and parameters: {'num_leaves': 67, 'n_estimators': 222}. Best is trial 7 with value: 0.721887787680789.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:40,023] Trial 8 finished with value: 0.7194399098145067 and parameters: {'num_leaves': 72, 'n_estimators': 644}. Best is trial 7 with value: 0.721887787680789.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:41,877] Trial 9 finished with value: 0.713838273371451 and parameters: {'num_leaves': 32, 'n_estimators': 929}. Best is trial 7 with value: 0.721887787680789.
[I 2025-05-01 19:05:41,961] Trial 10 finished with value: 0.7168694544078796 and parameters: {'num_leaves': 12, 'n_estimators': 15}. Best is trial 7 with value: 0.721887787680789.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[Lig

[I 2025-05-01 19:05:42,814] Trial 11 finished with value: 0.7212498289615716 and parameters: {'num_leaves': 78, 'n_estimators': 188}. Best is trial 7 with value: 0.721887787680789.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:43,495] Trial 12 finished with value: 0.7238992800013098 and parameters: {'num_leaves': 100, 'n_estimators': 140}. Best is trial 12 with value: 0.7238992800013098.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:44,021] Trial 13 finished with value: 0.7231005524687424 and parameters: {'num_leaves': 99, 'n_estimators': 109}. Best is trial 12 with value: 0.7238992800013098.
[I 2025-05-01 19:05:44,244] Trial 14 finished with value: 0.7256291071150817 and parameters: {'num_leaves': 100, 'n_estimators': 37}. Best is trial 14 with value: 0.7256291071150817.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [b

[I 2025-05-01 19:05:44,753] Trial 15 finished with value: 0.7255241267976578 and parameters: {'num_leaves': 100, 'n_estimators': 77}. Best is trial 14 with value: 0.7256291071150817.
[I 2025-05-01 19:05:44,895] Trial 16 finished with value: 0.7253450755112734 and parameters: {'num_leaves': 88, 'n_estimators': 21}. Best is trial 14 with value: 0.7256291071150817.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000300 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [b

[I 2025-05-01 19:05:46,079] Trial 17 finished with value: 0.7182208224521706 and parameters: {'num_leaves': 91, 'n_estimators': 316}. Best is trial 14 with value: 0.7256291071150817.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:47,108] Trial 18 finished with value: 0.7202379795069649 and parameters: {'num_leaves': 79, 'n_estimators': 284}. Best is trial 14 with value: 0.7256291071150817.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:47,590] Trial 19 finished with value: 0.721327490641128 and parameters: {'num_leaves': 93, 'n_estimators': 87}. Best is trial 14 with value: 0.7256291071150817.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:48,120] Trial 20 finished with value: 0.7199725410229092 and parameters: {'num_leaves': 11, 'n_estimators': 443}. Best is trial 14 with value: 0.7256291071150817.
[I 2025-05-01 19:05:48,300] Trial 21 finished with value: 0.7243603528245023 and parameters: {'num_leaves': 89, 'n_estimators': 30}. Best is trial 14 with value: 0.7256291071150817.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [b

[I 2025-05-01 19:05:48,459] Trial 22 finished with value: 0.7259609600401604 and parameters: {'num_leaves': 100, 'n_estimators': 23}. Best is trial 22 with value: 0.7259609600401604.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:49,318] Trial 23 finished with value: 0.7226525268077153 and parameters: {'num_leaves': 99, 'n_estimators': 202}. Best is trial 22 with value: 0.7259609600401604.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:49,848] Trial 24 finished with value: 0.7219153073253566 and parameters: {'num_leaves': 79, 'n_estimators': 108}. Best is trial 22 with value: 0.7259609600401604.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:50,848] Trial 25 finished with value: 0.7214728184852486 and parameters: {'num_leaves': 94, 'n_estimators': 239}. Best is trial 22 with value: 0.7259609600401604.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001677 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:51,630] Trial 26 finished with value: 0.7207023415307033 and parameters: {'num_leaves': 84, 'n_estimators': 148}. Best is trial 22 with value: 0.7259609600401604.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:53,372] Trial 27 finished with value: 0.717775208871544 and parameters: {'num_leaves': 74, 'n_estimators': 609}. Best is trial 22 with value: 0.7259609600401604.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001609 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:53,820] Trial 28 finished with value: 0.7254386258568002 and parameters: {'num_leaves': 100, 'n_estimators': 71}. Best is trial 22 with value: 0.7259609600401604.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2025-05-01 19:05:55,858] Trial 29 finished with value: 0.7165355274591232 and parameters: {'num_leaves': 56, 'n_estimators': 737}. Best is trial 22 with value: 0.7259609600401604.


{'num_leaves': 100, 'n_estimators': 23}

{'num_leaves': 100, 'n_estimators': 23}

Обучите модель с найденными гиперпараметрами на Xtrain, ytrain и оцените ROC-AUC на тестовых данных.

In [None]:
# your code here
lgbm = LGBMClassifier(**study.best_params)
lgbm.fit(Xtrain, ytrain)
y_pred = lgbm.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, y_pred)

[LightGBM] [Info] Number of positive: 14346, number of negative: 60654
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191280 -> initscore=-1.441714
[LightGBM] [Info] Start training from score -1.441714


0.7327851727128114

## Quiz

Чему равно количество деревьев в LGBM после подбора гиперпараметров?

## Работа над улучшением модели

* Попробуйте при помощи Optuna подобрать и другие гиперпарамтеры
* Также подберите гиперпараметры у CatBoost (а не только у LightGBM)

In [None]:
# your code here

## Quiz

Поделитесь своими результатами!