# Регрессия: прогноз CC50

Построение моделей для предсказания CC50 по химическим дескрипторам.

In [1]:
import pandas as pd
import numpy as np
import re
import optuna

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Загрузка данных
df = pd.read_csv('/Users/rem/МИФИ/курсовая/dataset.csv')

# Подготовка данных: исключаем столбцы с целевыми переменными
features = [col for col in df.columns if col not in ["IC50, mM", "CC50, mM", "SI"]]
X = df[features]
y = df["CC50, mM"]

# Разбиение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Для линейной регрессии выполняем масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Для моделей на основе градиентного бустинга (LightGBM) необходимо очистить имена признаков от специальных символов
def clean_column(name):
    return re.sub(r'[^\w]', '_', name)

X_train_clean = X_train.rename(columns=lambda col: clean_column(col))
X_test_clean = X_test.rename(columns=lambda col: clean_column(col))

# ------------------------- Оптимизация гиперпараметров с использованием Optuna -------------------------

# 1. RandomForestRegressor
def objective_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    # Используем категориальный выбор: либо не ограничиваем глубину (None), либо ограничиваем значением 10
    max_depth = trial.suggest_categorical("max_depth", [None, 10])
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()

study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(objective_rf, n_trials=20)
print("Random Forest best params:", study_rf.best_trial.params)
print("Random Forest best CV R2:", study_rf.best_trial.value)

# 2. XGBoost
def objective_xgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    max_depth = trial.suggest_categorical("max_depth", [3, 6])
    model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42,
                         use_label_encoder=False, eval_metric='rmse')
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=20)
print("XGBoost best params:", study_xgb.best_trial.params)
print("XGBoost best CV R2:", study_xgb.best_trial.value)

# 3. LightGBM
def objective_lgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    max_depth = trial.suggest_categorical("max_depth", [-1, 10])
    model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    # Для LightGBM используем очищенные признаки
    scores = cross_val_score(model, X_train_clean, y_train, cv=5, scoring='r2')
    return scores.mean()

study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=20)
print("LightGBM best params:", study_lgb.best_trial.params)
print("LightGBM best CV R2:", study_lgb.best_trial.value)

# ------------------------- Обучение финальных моделей с лучшими гиперпараметрами -------------------------

# Линейная регрессия (без оптимизации)
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Random Forest с лучшими параметрами
best_rf = RandomForestRegressor(**study_rf.best_trial.params, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_rf = best_rf.predict(X_test)

# XGBoost с лучшими параметрами
best_xgb = XGBRegressor(**study_xgb.best_trial.params, random_state=42,
                        use_label_encoder=False, eval_metric='rmse')
best_xgb.fit(X_train, y_train)
y_pred_xgb = best_xgb.predict(X_test)

# LightGBM с лучшими параметрами (используем очищенные признаки)
best_lgb = LGBMRegressor(**study_lgb.best_trial.params, random_state=42)
best_lgb.fit(X_train_clean, y_train)
y_pred_lgb = best_lgb.predict(X_test_clean)

# ------------------------- Вывод результатов -------------------------
models = {
    "Linear Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
    "XGBoost": y_pred_xgb,
    "LightGBM": y_pred_lgb
}

for name, preds in models.items():
    print(f"{name}: R2={r2_score(y_test, preds):.3f}, "
          f"MAE={mean_absolute_error(y_test, preds):.2f}, "
          f"MSE={mean_squared_error(y_test, preds):.2f}")

[I 2025-06-03 00:05:37,710] A new study created in memory with name: no-name-377a7c63-6faf-403e-a0a6-6a20cf1ea5ed
[I 2025-06-03 00:06:26,656] Trial 0 finished with value: 0.45848374598672026 and parameters: {'n_estimators': 200, 'max_depth': None}. Best is trial 0 with value: 0.45848374598672026.
[I 2025-06-03 00:07:29,909] Trial 1 finished with value: 0.4616395279421289 and parameters: {'n_estimators': 250, 'max_depth': None}. Best is trial 1 with value: 0.4616395279421289.
[I 2025-06-03 00:07:54,341] Trial 2 finished with value: 0.4635252293344244 and parameters: {'n_estimators': 100, 'max_depth': None}. Best is trial 2 with value: 0.4635252293344244.
[I 2025-06-03 00:08:17,998] Trial 3 finished with value: 0.4635252293344244 and parameters: {'n_estimators': 100, 'max_depth': None}. Best is trial 2 with value: 0.4635252293344244.
[I 2025-06-03 00:09:17,900] Trial 4 finished with value: 0.4616395279421289 and parameters: {'n_estimators': 250, 'max_depth': None}. Best is trial 2 with v

Random Forest best params: {'n_estimators': 100, 'max_depth': 10}
Random Forest best CV R2: 0.4698056217595649


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-06-03 00:18:07,636] Trial 0 finished with value: 0.3862361849254262 and parameters: {'n_estimators': 100, 'max_depth': 6}. Best is trial 0 with value: 0.3862361849254262.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.upda

XGBoost best params: {'n_estimators': 150, 'max_depth': 3}
XGBoost best CV R2: 0.4185000185539468
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info]

[I 2025-06-03 00:20:27,121] Trial 0 finished with value: 0.42701181929362225 and parameters: {'n_estimators': 200, 'max_depth': -1}. Best is trial 0 with value: 0.42701181929362225.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:20:30,982] Trial 1 finished with value: 0.42701181929362225 and parameters: {'n_estimators': 200, 'max_depth': -1}. Best is trial 0 with value: 0.42701181929362225.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-03 00:20:33,958] Trial 2 finished with value: 0.44933530055536863 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002951 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004006 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:20:37,216] Trial 3 finished with value: 0.44537436997358426 and parameters: {'n_estimators': 250, 'max_depth': 10}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003447 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:20:42,827] Trial 4 finished with value: 0.4182697157369203 and parameters: {'n_estimators': 300, 'max_depth': -1}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:20:44,590] Trial 5 finished with value: 0.43765519542779885 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002806 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-03 00:20:46,632] Trial 6 finished with value: 0.43765519542779885 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002918 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:20:49,244] Trial 7 finished with value: 0.44933530055536863 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002888 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:20:51,209] Trial 8 finished with value: 0.43765519542779885 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-03 00:20:53,529] Trial 9 finished with value: 0.43765519542779885 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 2 with value: 0.44933530055536863.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002856 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:20:55,913] Trial 10 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-03 00:20:57,721] Trial 11 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-03 00:21:00,091] Trial 12 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-03 00:21:02,293] Trial 13 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003030 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002952 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:21:04,621] Trial 14 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005696 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-03 00:21:06,729] Trial 15 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:21:10,845] Trial 16 finished with value: 0.44537436997358426 and parameters: {'n_estimators': 250, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:21:13,335] Trial 17 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:21:17,145] Trial 18 finished with value: 0.44537436997358426 and parameters: {'n_estimators': 250, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 588.844170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 591.013810
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-03 00:21:19,206] Trial 19 finished with value: 0.4519109621981068 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 10 with value: 0.4519109621981068.


LightGBM best params: {'n_estimators': 150, 'max_depth': 10}
LightGBM best CV R2: 0.4519109621981068


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18251
[LightGBM] [Info] Number of data points in the train set: 772, number of used features: 169
[LightGBM] [Info] Start training from score 596.122491
Linear Regression: R2=0.246, MAE=396.47, MSE=305790.11
Random Forest: R2=0.544, MAE=292.12, MSE=184965.80
XGBoost: R2=0.596, MAE=280.36, MSE=163709.70
LightGBM: R2=0.561, MAE=278.75, MSE=178177.66


Linear Regression

R² = 0.246
Модель объясняет около 24.6% разброса целевой переменной. Это довольно низкое значение, которое указывает, что линейная зависимость не способна полноценно уловить закономерности в данных.
MAE = 396.47
Средняя абсолютная ошибка показывает, что в среднем предсказания отклоняются от истинных значений примерно на 396.47 единиц.
MSE = 305790.11
Высокое значение MSE говорит о большом влиянии возможных выбросов или сильно ошибочных предсказаниях, поскольку ошибки возводятся в квадрат.
Вывод: Линейная регрессия в данном случае выступает как базовая модель и показывает слабое качество предсказаний, что может свидетельствовать о наличии сложных (например, нелинейных) взаимосвязей между признаками и целевой переменной.

Random Forest

R² = 0.544
Модель объясняет 54.4% дисперсии целевой переменной, что значительно лучше, чем у линейной регрессии. Это говорит о том, что ансамблевый метод (за счёт объединения множества деревьев) способен уловить больше информации из данных.
MAE = 292.12
Средняя абсолютная ошибка снизилась до 292.12, что говорит о более точных предсказаниях по сравнению с линейной моделью.
MSE = 184965.80
Значительное снижение MSE по сравнению с линейной регрессией указывает на уменьшение крупных ошибок.
Вывод: Случайный лес демонстрирует лучшую способность моделировать сложные зависимости, чем простая линейная регрессия, что приводит к более высоким значениям R² и меньшим ошибкам.

XGBoost

R² = 0.596
Модель объясняет около 59.6% дисперсии целевой переменной — это наилучший показатель из представленных.
MAE = 280.36
Средняя абсолютная ошибка ещё ниже, что свидетельствует о большей точности предсказаний.
MSE = 163709.70
Меньшее значение MSE говорит о снижении влияния крупных ошибок (выбросов) на итоговую ошибку модели.
Вывод: XGBoost показывает наилучшие результаты среди рассмотренных моделей. Высокий R², а также минимальные показатели ошибок указывают на его способность эффективно выявлять сложные нелинейные зависимости и взаимодействия между признаками.

LightGBM

R² = 0.561
Модель объясняет около 56.1% дисперсии целевой переменной, что также существенно выше линейной регрессии и немного уступает XGBoost.
MAE = 278.75
Средняя абсолютная ошибка почти идентична значению XGBoost, что говорит о хорошей точности на уровне средних ошибок.
MSE = 178177.66
Значение MSE немного выше, чем у XGBoost, что может указывать на наличие нескольких более крупных ошибок, хотя в целом результаты остаются приемлемыми.
Вывод: LightGBM тоже демонстрирует хорошие показатели и явно лучше линейной регрессии. Разница между LightGBM и XGBoost небольшая: при чуть лучшем MAE у LightGBM, XGBoost показывает наилучшее значение R² и немного меньший MSE.

Общая интерпретация

Улучшение по сравнению с линейной регрессией:
Ensemble-модели (Random Forest, XGBoost и LightGBM) значительно превосходят линейную регрессию по всем метрикам. Это говорит о том, что данные, скорее всего, содержат нелинейные зависимости и взаимодействия между признаками, которые простая линейная модель не может адекватно описать.

Сравнение ансамблевых методов:

XGBoost показывает наилучшие результаты (наибольшее значение R² и минимальные показатели ошибок), что свидетельствует о его способности наиболее эффективно улавливать сложные зависимости.
LightGBM демонстрирует схожее качество предсказаний, с чуть меньшим R², но немного лучшим MAE.
Random Forest также улучшает показатели по сравнению с линейной регрессией, хотя результаты немного уступают XGBoost и LightGBM.

Заключение:
В рассматриваемой задаче ансамблевые модели значительно превосходят линейную регрессию, что подчеркивает важность выбора модели, способной учитывать нелинейные взаимосвязи в данных. XGBoost выглядит наиболее предпочтительным, но разница между ансамблевыми методами не столь велика, и окончательный выбор может зависеть от специфики задачи, интерпретируемости модели или требований к скорости обучения и прогнозирования.