# Регрессия: прогноз SI

Прямая регрессия значения SI (индекса избирательности), исключая признаки IC50 и CC50.

In [1]:
import pandas as pd
import numpy as np
import re
import optuna

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Загрузка данных
df = pd.read_csv('/Users/rem/МИФИ/курсовая/dataset.csv')

# Подготовка данных: исключаем столбцы с целевыми переменными
features = [col for col in df.columns if col not in ["IC50, mM", "CC50, mM", "SI"]]
X = df[features]
y = df["SI"]

# Разбиение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Для линейной регрессии выполняем масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Для моделей на основе градиентного бустинга (LightGBM) необходимо очистить имена признаков от специальных символов
def clean_column(name):
    return re.sub(r'[^\w]', '_', name)

X_train_clean = X_train.rename(columns=lambda col: clean_column(col))
X_test_clean = X_test.rename(columns=lambda col: clean_column(col))

# ------------------------- Оптимизация гиперпараметров с использованием Optuna -------------------------

# 1. RandomForestRegressor
def objective_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    # Используем категориальный выбор: либо не ограничиваем глубину (None), либо ограничиваем значением 10
    max_depth = trial.suggest_categorical("max_depth", [None, 10])
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()

study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(objective_rf, n_trials=20)
print("Random Forest best params:", study_rf.best_trial.params)
print("Random Forest best CV R2:", study_rf.best_trial.value)

# 2. XGBoost
def objective_xgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    max_depth = trial.suggest_categorical("max_depth", [3, 6])
    model = XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42,
                         use_label_encoder=False, eval_metric='rmse')
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=20)
print("XGBoost best params:", study_xgb.best_trial.params)
print("XGBoost best CV R2:", study_xgb.best_trial.value)

# 3. LightGBM
def objective_lgb(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=50)
    max_depth = trial.suggest_categorical("max_depth", [-1, 10])
    model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    # Для LightGBM используем очищенные признаки
    scores = cross_val_score(model, X_train_clean, y_train, cv=5, scoring='r2')
    return scores.mean()

study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=20)
print("LightGBM best params:", study_lgb.best_trial.params)
print("LightGBM best CV R2:", study_lgb.best_trial.value)

# ------------------------- Обучение финальных моделей с лучшими гиперпараметрами -------------------------

# Линейная регрессия (без оптимизации)
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Random Forest с лучшими параметрами
best_rf = RandomForestRegressor(**study_rf.best_trial.params, random_state=42)
best_rf.fit(X_train, y_train)
y_pred_rf = best_rf.predict(X_test)

# XGBoost с лучшими параметрами
best_xgb = XGBRegressor(**study_xgb.best_trial.params, random_state=42,
                        use_label_encoder=False, eval_metric='rmse')
best_xgb.fit(X_train, y_train)
y_pred_xgb = best_xgb.predict(X_test)

# LightGBM с лучшими параметрами (используем очищенные признаки)
best_lgb = LGBMRegressor(**study_lgb.best_trial.params, random_state=42)
best_lgb.fit(X_train_clean, y_train)
y_pred_lgb = best_lgb.predict(X_test_clean)

# ------------------------- Вывод результатов -------------------------
models = {
    "Linear Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
    "XGBoost": y_pred_xgb,
    "LightGBM": y_pred_lgb
}

for name, preds in models.items():
    print(f"{name}: R2={r2_score(y_test, preds):.3f}, "
          f"MAE={mean_absolute_error(y_test, preds):.2f}, "
          f"MSE={mean_squared_error(y_test, preds):.2f}")

[I 2025-06-03 00:28:22,859] A new study created in memory with name: no-name-56ba7985-5e7b-46de-b6cd-8dae78d75401
[I 2025-06-03 00:29:20,402] Trial 0 finished with value: -24.88233781865233 and parameters: {'n_estimators': 300, 'max_depth': 10}. Best is trial 0 with value: -24.88233781865233.
[I 2025-06-03 00:29:58,440] Trial 1 finished with value: -21.89134733746128 and parameters: {'n_estimators': 100, 'max_depth': None}. Best is trial 1 with value: -21.89134733746128.
[I 2025-06-03 00:31:17,292] Trial 2 finished with value: -24.751664358537987 and parameters: {'n_estimators': 200, 'max_depth': None}. Best is trial 1 with value: -21.89134733746128.
[I 2025-06-03 00:31:35,818] Trial 3 finished with value: -21.892920027693098 and parameters: {'n_estimators': 100, 'max_depth': 10}. Best is trial 1 with value: -21.89134733746128.
[I 2025-06-03 00:32:32,363] Trial 4 finished with value: -24.88233781865233 and parameters: {'n_estimators': 300, 'max_depth': 10}. Best is trial 1 with value: 

Random Forest best params: {'n_estimators': 100, 'max_depth': None}
Random Forest best CV R2: -21.89134733746128


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-06-03 00:46:15,705] Trial 0 finished with value: -13.466219197024506 and parameters: {'n_estimators': 150, 'max_depth': 6}. Best is trial 0 with value: -13.466219197024506.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.up

XGBoost best params: {'n_estimators': 100, 'max_depth': 3}
XGBoost best CV R2: -12.966392464455996
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] 

[I 2025-06-03 00:48:29,743] Trial 0 finished with value: -7.398788747932552 and parameters: {'n_estimators': 150, 'max_depth': -1}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004031 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:31,828] Trial 1 finished with value: -7.461524604517552 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:34,540] Trial 2 finished with value: -7.398788747932552 and parameters: {'n_estimators': 150, 'max_depth': -1}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002381 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002813 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:36,179] Trial 3 finished with value: -7.461524604517552 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:37,755] Trial 4 finished with value: -7.461524604517552 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002989 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:39,473] Trial 5 finished with value: -7.461524604517552 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train 

[I 2025-06-03 00:48:43,141] Trial 6 finished with value: -9.442627590855036 and parameters: {'n_estimators': 300, 'max_depth': 10}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002909 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train 

[I 2025-06-03 00:48:45,826] Trial 7 finished with value: -8.93995037810917 and parameters: {'n_estimators': 250, 'max_depth': 10}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:48,504] Trial 8 finished with value: -8.21753807569739 and parameters: {'n_estimators': 200, 'max_depth': -1}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:50,370] Trial 9 finished with value: -7.461524604517552 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: -7.398788747932552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:52,391] Trial 10 finished with value: -6.287512305620599 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train 

[I 2025-06-03 00:48:54,428] Trial 11 finished with value: -6.287512305620599 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006314 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002544 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:56,316] Trial 12 finished with value: -6.287512305620599 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003560 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:48:58,401] Trial 13 finished with value: -6.287512305620599 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002870 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:49:00,350] Trial 14 finished with value: -6.287512305620599 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005988 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:49:04,009] Trial 15 finished with value: -8.21753807569739 and parameters: {'n_estimators': 200, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002784 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002819 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:49:05,964] Trial 16 finished with value: -6.287512305620599 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004856 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:49:11,285] Trial 17 finished with value: -8.862037164736417 and parameters: {'n_estimators': 250, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train set: 618, number of used features: 166
[LightGBM] [Info] Start t

[I 2025-06-03 00:49:13,564] Trial 18 finished with value: -6.287512305620599 and parameters: {'n_estimators': 100, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14711
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 165
[LightGBM] [Info] Start training from score 79.023694
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14782
[LightGBM] [Info] Number of data points in the train set: 617, number of used features: 164
[LightGBM] [Info] Start training from score 99.169513
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14853
[LightGBM] [Info] Number of data points in the train 

[I 2025-06-03 00:49:18,712] Trial 19 finished with value: -8.862037164736417 and parameters: {'n_estimators': 250, 'max_depth': -1}. Best is trial 10 with value: -6.287512305620599.


LightGBM best params: {'n_estimators': 100, 'max_depth': -1}
LightGBM best CV R2: -6.287512305620599


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18251
[LightGBM] [Info] Number of data points in the train set: 772, number of used features: 169
[LightGBM] [Info] Start training from score 84.027647
Linear Regression: R2=-10.217, MAE=231.89, MSE=222394.58
Random Forest: R2=-5.582, MAE=82.82, MSE=130490.29
XGBoost: R2=-7.542, MAE=99.40, MSE=169362.99
LightGBM: R2=-7.570, MAE=118.45, MSE=169916.77


1. Linear Regression: R² = -10.217, MAE = 231.89, MSE = 222394.58

R² = -10.217:
Очень отрицательное значение говорит о том, что модель предсказывает настолько плохо, что ошибка предсказаний значительно превышает вариативность целевого признака (то есть модель работает хуже, чем простое предсказание среднего значения).

MAE = 231.89:
В среднем предсказания отклоняются от истинного значения на 231.89 единиц, что достаточно высоко.

MSE = 222394.58:
Сильное возрастание ошибки при возведении отклонений в квадрат подтверждает наличие крупных ошибок в предсказаниях.

Вывод: Линейная регрессия явно не справляется с задачей: вероятно, зависимость между признаками и целевой переменной является сложной или нелинейной, а простой линейный подход не способен уловить эту структуру.

2. Random Forest: R² = -5.582, MAE = 82.82, MSE = 130490.29

R² = -5.582:
Несмотря на отрицательное значение, оно менее отрицательно, чем у линейной регрессии, что указывает на некоторое улучшение. Однако по-прежнему модель работает ниже базового уровня (прогноз среднего значения).

MAE = 82.82:
Средняя ошибка уменьшилась до 82.82 единиц, что является значительным улучшением по сравнению с линейной регрессией.

MSE = 130490.29:
Также снижение MSE означает, что модель совокупно уменьшила ошибки, хотя остаются отдельные крупные ошибки.

Вывод: Метод случайного леса лучше справляется с задачей по сравнению с линейной регрессией, снижая как среднюю абсолютную ошибку, так и квадратичную ошибку. Однако отрицательный R² говорит о том, что даже этот подход не может объяснить достаточную часть дисперсии целевой переменной – предсказания все еще хуже, чем простое использование среднего значения.

3. XGBoost: R² = -7.542, MAE = 99.40, MSE = 169362.99

R² = -7.542:
Отрицательное значение R² вновь указывает на то, что модель не справляется с задачей настолько, чтобы превзойти базовый прогноз среднего значения, хотя по размеру ошибки немного лучше, чем линейная регрессия, но хуже, чем у случайного леса.

MAE = 99.40:
Средняя абсолютная ошибка немного выше, чем у Random Forest, что свидетельствует о чуть менее точных предсказаниях на среднем уровне.

MSE = 169362.99:
Более высокий показатель MSE предполагает, что предсказания содержат более крупные отклонения (выбросы или отдельные сильно ошибочные предсказания).

Вывод: Модель XGBoost улучшает качество предсказаний по сравнению с простой линейной моделью, однако её ошибки остаются достаточно высокими, а отрицательный R² подчеркивает, что выбранная конфигурация модели не способна адекватно уловить целевую зависимость.

4. LightGBM: R² = -7.570, MAE = 118.45, MSE = 169916.77

R² = -7.570:
Значение близко к XGBoost, что говорит о схожей производительности по объяснению дисперсии в данных. Отрицательный показатель означает, что модель работает хуже простого базового прогноза.

MAE = 118.45:
Средняя абсолютная ошибка несколько выше, чем у XGBoost и Random Forest, что может свидетельствовать о менее точных средних предсказаниях.

MSE = 169916.77:
Показатель квадратичной ошибки почти аналогичен XGBoost – остаются отдельные более крупные ошибки.

Вывод: LightGBM демонстрирует результаты, сопоставимые с XGBoost. Хотя она снижает абсолютные и квадратичные ошибки по сравнению с линейной регрессией, отрицательный коэффициент детерминации остаётся индикатором того, что модель не умеет уловить основную структуру данных.

Общая интерпретация результатов

Отрицательные значения R² для всех моделей:
Это указывает на то, что ни одна из моделей не справилась с задачей так, чтобы объяснить дисперсию целевой переменной лучше, чем базовый метод, который всегда предсказывает среднее значение.
Возможные причины:


Сложные, возможно нелинейные зависимости, которые даже ансамблевые методы не смогли уловить, возможно, из-за неподходящего выбора гиперпараметров или проблемы в разбиении данных.
Сравнение моделей по MAE и MSE:

Random Forest показывает заметное снижение ошибки (MAE ≈ 82.82), что говорит о том, что модель предсказывает ближе к истинным значениям по абсолютной величине.
XGBoost и LightGBM имеют MAE чуть выше, а также более высокий MSE, что указывает на присутствие нескольких случаев с высокими ошибками предсказаний.

Заключение

Все четыре модели демонстрируют недостаточную способность предсказывать целевую переменную, поскольку их ошибки выше, чем у базового прогноза среднего значения (что отражается отрицательным R²). Из ансамблевых методов Random Forest показывает наименьшую абсолютную ошибку (MAE), но даже она не достигает положительного коэффициента детерминации. Это сигнализирует о том, что необходимо более глубокое изучение данных, корректировка предобработки и, возможно, поиск других признаков или моделей, способных лучше уловить структуру исходных данных.

