# Регрессия: прогноз IC50

Построение моделей для предсказания IC50 по химическим дескрипторам.

In [8]:
import pandas as pd
import numpy as np
import re
import optuna

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Загрузка данных
df = pd.read_csv('/Users/rem/МИФИ/курсовая/dataset.csv')

# Подготовка данных: исключаем столбцы с целевыми переменными
features = [col for col in df.columns if col not in ["IC50, mM", "CC50, mM", "SI"]]
X = df[features]
y = df["IC50, mM"]

# Разбиение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Для линейной регрессии выполняем масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Для моделей на основе градиентного бустинга (LightGBM) необходимо очистить имена признаков от специальных символов
def clean_column(name):
    return re.sub(r'[^\w]', '_', name)

X_train_clean = X_train.rename(columns=lambda col: clean_column(col))
X_test_clean = X_test.rename(columns=lambda col: clean_column(col))

# ------------------------- Оптимизация гиперпараметров с использованием Optuna -------------------------

# 1. RandomForestRegressor
def objective_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    # Используем категориальный выбор: либо не ограничиваем глубину (None), либо ограничиваем значением 10
    max_depth = trial.suggest_categorical("max_depth", [None, 10])
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()

study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(objective_rf, n_trials=20)
print("Random Forest best params:", study_rf.best_trial.params)
print("Random Forest best CV R2:", study_rf.best_trial.value)

# 2. XGBoost
def objective_xgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    max_depth = trial.suggest_categorical("max_depth", [3, 6])
    model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42,
                         use_label_encoder=False, eval_metric='rmse')
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=20)
print("XGBoost best params:", study_xgb.best_trial.params)
print("XGBoost best CV R2:", study_xgb.best_trial.value)

# 3. LightGBM
def objective_lgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    max_depth = trial.suggest_categorical("max_depth", [-1, 10])
    model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    # Для LightGBM используем очищенные признаки
    scores = cross_val_score(model, X_train_clean, y_train, cv=5, scoring='r2')
    return scores.mean()

study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=20)
print("LightGBM best params:", study_lgb.best_trial.params)
print("LightGBM best CV R2:", study_lgb.best_trial.value)

# ------------------------- Обучение финальных моделей с лучшими гиперпараметрами -------------------------

# Линейная регрессия (без оптимизации)
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Random Forest с лучшими параметрами
best_rf = RandomForestRegressor(**study_rf.best_trial.params, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_rf = best_rf.predict(X_test)

# XGBoost с лучшими параметрами
best_xgb = XGBRegressor(**study_xgb.best_trial.params, random_state=42,
                        use_label_encoder=False, eval_metric='rmse')
best_xgb.fit(X_train, y_train)
y_pred_xgb = best_xgb.predict(X_test)

# LightGBM с лучшими параметрами (используем очищенные признаки)
best_lgb = LGBMRegressor(**study_lgb.best_trial.params, random_state=42)
best_lgb.fit(X_train_clean, y_train)
y_pred_lgb = best_lgb.predict(X_test_clean)

# ------------------------- Вывод результатов -------------------------
models = {
    "Linear Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
    "XGBoost": y_pred_xgb,
    "LightGBM": y_pred_lgb
}

for name, preds in models.items():
    print(f"{name}: R2={r2_score(y_test, preds):.3f}, "
          f"MAE={mean_absolute_error(y_test, preds):.2f}, "
          f"MSE={mean_squared_error(y_test, preds):.2f}")

[I 2025-06-02 23:30:36,169] A new study created in memory with name: no-name-ab1b121a-58f7-44e0-9274-15abdc6b57e2
[I 2025-06-02 23:31:09,125] Trial 0 finished with value: 0.24935422646294741 and parameters: {'n_estimators': 100, 'max_depth': None}. Best is trial 0 with value: 0.24935422646294741.
[I 2025-06-02 23:31:37,150] Trial 1 finished with value: 0.25792739545336696 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 1 with value: 0.25792739545336696.
[I 2025-06-02 23:32:05,392] Trial 2 finished with value: 0.25792739545336696 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 1 with value: 0.25792739545336696.
[I 2025-06-02 23:32:53,612] Trial 3 finished with value: 0.2557607556764837 and parameters: {'n_estimators': 250, 'max_depth': 10}. Best is trial 1 with value: 0.25792739545336696.
[I 2025-06-02 23:33:21,465] Trial 4 finished with value: 0.25792739545336696 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 1 with val

Random Forest best params: {'n_estimators': 150, 'max_depth': None}
Random Forest best CV R2: 0.2608731271405576


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-06-02 23:49:03,234] Trial 0 finished with value: 0.1138588632060656 and parameters: {'n_estimators': 100, 'max_depth': 3}. Best is trial 0 with value: 0.1138588632060656.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.upda

XGBoost best params: {'n_estimators': 100, 'max_depth': 3}
XGBoost best CV R2: 0.1138588632060656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info]

[I 2025-06-02 23:51:15,638] Trial 0 finished with value: 0.17694754610396018 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: 0.17694754610396018.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:19,521] Trial 1 finished with value: 0.1518699385331192 and parameters: {'n_estimators': 250, 'max_depth': -1}. Best is trial 0 with value: 0.17694754610396018.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:22,998] Trial 2 finished with value: 0.1518699385331192 and parameters: {'n_estimators': 250, 'max_depth': -1}. Best is trial 0 with value: 0.17694754610396018.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:26,107] Trial 3 finished with value: 0.13652652330445947 and parameters: {'n_estimators': 300, 'max_depth': 10}. Best is trial 0 with value: 0.17694754610396018.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-02 23:51:29,836] Trial 4 finished with value: 0.1518699385331192 and parameters: {'n_estimators': 250, 'max_depth': -1}. Best is trial 0 with value: 0.17694754610396018.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:33,128] Trial 5 finished with value: 0.13652652330445947 and parameters: {'n_estimators': 300, 'max_depth': 10}. Best is trial 0 with value: 0.17694754610396018.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002592 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-02 23:51:35,694] Trial 6 finished with value: 0.1801074349111586 and parameters: {'n_estimators': 150, 'max_depth': -1}. Best is trial 6 with value: 0.1801074349111586.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:37,320] Trial 7 finished with value: 0.20348400811320913 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:38,689] Trial 8 finished with value: 0.19788973341796218 and parameters: {'n_estimators': 100, 'max_depth': 10}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Total Bins 14810
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 165
[LightGBM] [Info] Start training from score 220.322813
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001356 seconds.
You can set `force_row_wis

[I 2025-06-02 23:51:41,886] Trial 9 finished with value: 0.13652652330445947 and parameters: {'n_estimators': 300, 'max_depth': 10}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:43,823] Trial 10 finished with value: 0.20348400811320913 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002349 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:45,708] Trial 11 finished with value: 0.20348400811320913 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:48,221] Trial 12 finished with value: 0.1801074349111586 and parameters: {'n_estimators': 150, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002435 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:49,932] Trial 13 finished with value: 0.20348400811320913 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:52,927] Trial 14 finished with value: 0.16249139858459313 and parameters: {'n_estimators': 200, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005519 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-02 23:51:55,381] Trial 15 finished with value: 0.1801074349111586 and parameters: {'n_estimators': 150, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:51:57,012] Trial 16 finished with value: 0.20348400811320913 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002411 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:52:00,057] Trial 17 finished with value: 0.16249139858459313 and parameters: {'n_estimators': 200, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.153124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002846 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003812 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the trai

[I 2025-06-02 23:52:03,124] Trial 18 finished with value: 0.16249139858459313 and parameters: {'n_estimators': 200, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 217.245042
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 215.611964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start

[I 2025-06-02 23:52:04,761] Trial 19 finished with value: 0.20348400811320913 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 7 with value: 0.20348400811320913.


LightGBM best params: {'n_estimators': 100, 'max_depth': -1}
LightGBM best CV R2: 0.20348400811320913


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18251
[LightGBM] [Info] Number of data points in the train set: 772, number of used features: 169
[LightGBM] [Info] Start training from score 219.756609
Linear Regression: R2=0.084, MAE=262.15, MSE=181735.92
Random Forest: R2=0.410, MAE=205.98, MSE=116945.44
XGBoost: R2=0.278, MAE=214.46, MSE=143219.01
LightGBM: R2=0.382, MAE=204.79, MSE=122551.46


In [12]:
1. Linear Regression

R² = 0.084:
Это означает, что модель объясняет только 8.4% дисперсии зависимой переменной. Такой низкий R² указывает на то, что линейная модель не способна уловить большую часть закономерностей в данных.
MAE = 262.15:
Средняя абсолютная ошибка равна 262.15 единиц. В среднем модель ошибается на ≈262.15 единиц при прогнозировании.
MSE = 181735.92:
Среднеквадратичная ошибка относительно велика, что говорит о наличии существенных ошибок прогнозирования, особенно если в данных присутствуют выбросы (так как большие ошибки сильно влияют на MSE).
Вывод: Линейная регрессия показывает слабое качество предсказания, что может быть связано либо с недостаточной линейностью зависимости между признаками и целевой переменной, либо с тем, что важные факторы не включены в модель.

2. Random Forest

R² = 0.410:
Модель объясняет около 41% дисперсии целевой переменной. Это значительное улучшение по сравнению с линейной регрессией, что говорит о лучшей способности модели улавливать сложные зависимости.
MAE = 205.98:
Средняя абсолютная ошибка снизилась до 205.98 единиц, что свидетельствует о более точных прогнозах.
MSE = 116945.44:
Меньшее значение MSE по сравнению с линейной регрессией указывает на снижение серьезных ошибок.
Вывод: Модель случайного леса способна лучше моделировать закономерности данных, вероятно, за счет использования ансамбля деревьев, который учитывает нелинейные зависимости и взаимодействия между признаками.

3. XGBoost

R² = 0.278:
Модель объясняет около 27.8% дисперсии зависимой переменной, что выше, чем у линейной регрессии, но ниже, чем у случайного леса.
MAE = 214.46:
Средняя абсолютная ошибка немного больше, чем у случайного леса, что говорит о менее точных прогнозах.
MSE = 143219.01:
Значение MSE также выше, чем в случае случайного леса, что может указывать на наличие нескольких больших ошибок.
Вывод: XGBoost проявляет себя лучше, чем простая линейная регрессия, но в данном случае его показатели уступают случайному лесу. Возможно, параметры модели или особенности данных требуют дополнительного тюнинга.

4. LightGBM

R² = 0.382:
Объясняется около 38.2% дисперсии зависимой переменной, что сопоставимо с результатами XGBoost и немного ниже случайного леса.
MAE = 204.79:
Средняя абсолютная ошибка чуть меньше, чем у случайного леса, что свидетельствует о хорошей точности предсказаний на "средних" ошибках.
MSE = 122551.46:
Значение MSE немного выше, чем у случайного леса, что может говорить о наличии некоторых выбросов или больших ошибок, влияющих на этот показатель.
Вывод: Модель LightGBM показывает сравнимую с Random Forest способность предсказания. Она демонстрирует схожие метрики точности — немного уступая по R², но показывая чуть лучшие MAE. Это указывает на хорошее качество модели, особенно в плане средней ошибки.

Общая интерпретация

Сравнение моделей по R²:

Random Forest (R²=0.410) показывает наилучший результат с точки зрения объяснения дисперсии целевой переменной, что означает, что 
               данная модель лучше учитывает взаимосвязи между признаками.
LightGBM (R²=0.382) находится немного позади Random Forest, а XGBoost (R²=0.278) уступает обеим ансамблевым моделям.
Линейная регрессия (R²=0.084) дает крайне низкое значение, что свидетельствует о слабой аппроксимации зависимостей.
Ошибка предсказания (MAE и MSE):

Меньшие значения MAE и MSE для Random Forest, XGBoost и LightGBM относительно линейной регрессии указывают на 
снижение средней абсолютной и квадратичной ошибки, что свидетельствует о более точном прогнозировании.
Особое внимание стоит уделить тому, что MSE чувствителен к большим ошибкам (из-за квадратичного возрастания вклада выбросов). 
Случайный лес демонстрирует наиболее низкие значения MSE, что доказывает его устойчивость к большим ошибкам.

Общий вывод:
Ансамблевые методы (случайный лес, XGBoost, LightGBM) показывают лучшие результаты по сравнению с линейной регрессией. Это говорит о том, что данные, скорее всего, имеют нелинейные или более сложные взаимосвязи, которые проще уловить моделям, объединяющим результаты нескольких базовых алгоритмов. 
При этом, случайный лес в данном случае выглядит лучшим выбором с точки зрения объяснения дисперсии и минимизации крупных ошибок, хотя разница между ним и LightGBM незначительная. Возможно, дальнейшая оптимизация и настройка гиперпараметров могли бы улучшить показатели даже более.

SyntaxError: invalid character '²' (U+00B2) (2460090823.py, line 3)