In [2]:
import pandas as pd
import numpy as np
import optuna

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

df = pd.read_excel("dataset_vrp/2014-2023_lags_EWM_targets_train.xlsx")

TARGET = "target_next_year"

df.dropna(inplace=True)

exclude_cols = [
    'year', 'district', 'region',
    'target_next_year', 'delta_target', 'delta_target_percent', 'log_target_next_year'
]

feature_cols = [
    col for col in df.columns
    if col not in exclude_cols
]

X = df[feature_cols]
y = df[TARGET]

train = df[df["year"] < 2023]
test = df[df["year"] == 2023]

X_train = train[feature_cols]
y_train = train[TARGET]

X_test = test[feature_cols]
y_test = test[TARGET]

def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3),
        "max_depth": trial.suggest_int("max_depth", 2, 13),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "max_features": trial.suggest_float("max_features", 0.6, 1.0),
    }

    model = Pipeline([
        ("scale", StandardScaler()),
        ("gbr", GradientBoostingRegressor(**params))
    ])

    tscv = TimeSeriesSplit(n_splits=5)

    cv_score = cross_val_score(
        model, X_train, y_train,
        cv=tscv,
        scoring="neg_mean_squared_error",
        n_jobs=-1
    )

    return -np.mean(cv_score)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

print(f"Лучшие параметры: {study.best_params}")

best_params = study.best_params

final_model = Pipeline([
    ("scale", StandardScaler()),
    ("gbr", GradientBoostingRegressor(**best_params))
])

final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)

def smape(y_test, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_test) / (np.abs(y_test) + np.abs(y_pred) + 1e-9))

def mae_log(y_test, y_pred):
    return mean_absolute_error(np.log1p(y_test), np.log1p(y_pred))

def mse_log(y_test, y_pred):
    return mean_squared_error(np.log1p(y_test), np.log1p(y_pred))

def r2_log(y_test, y_pred):
    return r2_score(np.log1p(y_test), np.log1p(y_pred))

metrics = {
    "RMSE_real": root_mean_squared_error(y_test, y_pred),
    "MSE_real": mean_squared_error(y_test, y_pred),
    "MAE_real": mean_absolute_error(y_test, y_pred),
    "R2_real": r2_score(y_test, y_pred),
    "MAPE_real": mean_absolute_percentage_error(y_test, y_pred),
    "SMAPE_real": smape(y_test, y_pred),

    "MAE_log": mae_log(y_test, y_pred),
    "MSE_log": mse_log(y_test, y_pred),
    "R2_log": r2_log(y_test, y_pred)
}

print(f"\nРезультаты метрик:")
for metric, result in metrics.items():
    print(f"{metric}: {result}")


[I 2025-11-28 10:27:28,428] A new study created in memory with name: no-name-d2858b5f-e6f1-4b92-b678-999cb4af02c3
[I 2025-11-28 10:27:46,248] Trial 0 finished with value: 103844848801.23276 and parameters: {'n_estimators': 465, 'learning_rate': 0.13658994357963178, 'max_depth': 8, 'min_samples_split': 12, 'min_samples_leaf': 5, 'subsample': 0.6313564994216053, 'max_features': 0.9062767553762108}. Best is trial 0 with value: 103844848801.23276.
[I 2025-11-28 10:27:50,725] Trial 1 finished with value: 109400993969.78531 and parameters: {'n_estimators': 132, 'learning_rate': 0.11849483668967668, 'max_depth': 4, 'min_samples_split': 17, 'min_samples_leaf': 5, 'subsample': 0.628456195752233, 'max_features': 0.959454454973019}. Best is trial 0 with value: 103844848801.23276.
[I 2025-11-28 10:27:56,599] Trial 2 finished with value: 119917630344.11621 and parameters: {'n_estimators': 132, 'learning_rate': 0.226341491318694, 'max_depth': 9, 'min_samples_split': 14, 'min_samples_leaf': 8, 'subsa

Лучшие параметры: {'n_estimators': 737, 'learning_rate': 0.1628512321826614, 'max_depth': 5, 'min_samples_split': 17, 'min_samples_leaf': 1, 'subsample': 0.8597690483041818, 'max_features': 0.8031917392177738}

Результаты метрик:
RMSE_real: 131875.35459558945
MSE_real: 17391109149.71246
MAE_real: 56619.028338125274
R2_real: 0.9822639659309559
MAPE_real: 0.16956005509114294
SMAPE_real: 14.394008255260953
MAE_log: 0.14613481853073257
MSE_log: 0.04924653252148095
R2_log: 0.9635461657207403
