In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
    mean_absolute_percentage_error,
)
import optuna

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) /
                         (np.abs(y_true) + np.abs(y_pred)))

df = pd.read_excel("dataset_vrp/2014-2023_lags_EWM_targets_train.xlsx")

target_col = "log_target_next_year"

exclude_cols = ["delta_target", "delta_target_percent", "target_next_year", "region", "year", "district"]

num_cols = [col for col in df.select_dtypes("number").columns if col not in (exclude_cols + [target_col])]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

train = df[df["year"] < 2023].copy()
test = df[df["year"] == 2023].copy()

X_train = train.drop(columns=exclude_cols)
y_train = train[target_col]
X_test = test.drop(columns=exclude_cols)
y_test = test[target_col]

y_test_real = np.exp(y_test)


def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1500),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    model = GradientBoostingRegressor(**params)
    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    mae_log = mean_absolute_error(y_test, pred)
    return mae_log


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Лучшие параметры: {best_params}")

model = GradientBoostingRegressor(**best_params)
model.fit(X_train, y_train)
 
pred_log = model.predict(X_test)
pred_real = np.exp(pred_log)

metrics = {
    "MAE_log": mean_absolute_error(y_test, pred_log),
    "MSE_log": mean_squared_error(y_test, pred_log),
    "R2_log": r2_score(y_test, pred_log),

    "MAE_real": mean_absolute_error(y_test_real, pred_real),
    "MSE_real": mean_squared_error(y_test_real, pred_real),
    "RMSE_real": root_mean_squared_error(y_test_real, pred_real),
    "MAPE_real": mean_absolute_percentage_error(y_test_real, pred_real),
    "SMAPE_real": smape(y_test_real, pred_real)
}

print("\nМетрики 2023:")
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")

[I 2025-11-29 16:12:08,811] A new study created in memory with name: no-name-0acfdb5c-813e-41db-bfb0-51c3c3f02e74
[I 2025-11-29 16:12:21,611] Trial 0 finished with value: 0.022901899210379422 and parameters: {'n_estimators': 1045, 'learning_rate': 0.17343478423182643, 'max_depth': 4, 'subsample': 0.5936678336605041, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.022901899210379422.
[I 2025-11-29 16:12:41,011] Trial 1 finished with value: 0.009237158085447847 and parameters: {'n_estimators': 1437, 'learning_rate': 0.19830056260367146, 'max_depth': 6, 'subsample': 0.9928622985567355, 'min_samples_split': 15, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.009237158085447847.
[I 2025-11-29 16:12:45,941] Trial 2 finished with value: 0.00994253906590763 and parameters: {'n_estimators': 320, 'learning_rate': 0.07416614191819826, 'max_depth': 4, 'subsample': 0.7393881936986288, 'min_samples_split': 19, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.

Лучшие параметры: {'n_estimators': 599, 'learning_rate': 0.02226634360873345, 'max_depth': 4, 'subsample': 0.9290261771114127, 'min_samples_split': 3, 'min_samples_leaf': 2}

Метрики 2023:
MAE_log: 0.0071
MSE_log: 0.0004
R2_log: 0.9997
MAE_real: 17852.3617
MSE_real: 14038586641.0500
RMSE_real: 118484.5418
MAPE_real: 0.0070
SMAPE_real: 0.7092


: 