In [None]:
import pandas as pd
import numpy as np
import optuna

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error, mean_absolute_percentage_error
)

from xgboost import XGBRegressor

# Load the full dataset from the Excel workbook.
df = pd.read_excel("dataset_vrp/2014-2023_lags_EWM_targets_train.xlsx")

# Time-based hold-out: rows of `test_year` form the test set, every earlier
# year is used for training.
test_year = 2023
is_train_year = df["year"] < test_year
is_test_year = df["year"] == test_year

train_df = df.loc[is_train_year].copy()
test_df = df.loc[is_test_year].copy()

target_col = "target_next_year"

# Columns kept out of the feature matrix: the year/district/region keys, the
# target itself and the other target-named columns (presumably derived from
# the target, which would leak it -- confirm against the dataset builder).
exclude_cols = [
    "year",
    "district",
    "region",
    "target_next_year",
    "delta_target",
    "delta_target_percent",
    "log_target_next_year",
]

# Every remaining column, in its original order, is a model feature.
features = df.columns[~df.columns.isin(exclude_cols)].tolist()

X_train = train_df[features]
y_train = train_df[target_col]
X_test = test_df[features]
y_test = test_df[target_col]


def objective(trial):
    """Optuna objective: validation MAE of an XGBoost regressor for one trial.

    Hyperparameters are sampled from ``trial``. The model is fitted on every
    training year except the most recent one and scored on that most recent
    training year. The test set (``X_test`` / ``y_test``) is deliberately NOT
    used here: selecting hyperparameters on the test year would leak it into
    the model choice and make the final test metrics optimistically biased.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean absolute error on the validation year (lower is better).

    Raises:
        ValueError: if ``train_df`` does not contain at least two distinct
            years, so no fit/validation split can be made.
    """
    # Time-ordered split inside the training data: last year -> validation.
    val_year = train_df["year"].max()
    is_val = train_df["year"] == val_year
    fit_df = train_df[~is_val]
    val_df = train_df[is_val]
    if fit_df.empty or val_df.empty:
        raise ValueError(
            "train_df must contain at least two distinct years to build "
            "a validation split for hyperparameter tuning"
        )

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 900),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        # Fixed settings; the final model after the search must use the same.
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "random_state": 42,
    }

    model = XGBRegressor(**params)
    model.fit(fit_df[features], fit_df[target_col])

    preds = model.predict(val_df[features])
    return mean_absolute_error(val_df[target_col], preds)

# Seed the sampler so that the hyperparameter search proposes the same
# sequence of trials on every run; without it the "best" parameters (and all
# metrics below) change from run to run even though the model itself uses a
# fixed random_state.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

best_params = study.best_params
print(f"\nЛучшие параметры: {best_params}")

# Refit on the whole training set with the best sampled hyperparameters plus
# the same fixed settings that were used inside the objective.
model = XGBRegressor(
    **best_params,
    objective="reg:squarederror",
    tree_method="hist",
    random_state=42
)
model.fit(X_train, y_train)

pred_real = model.predict(X_test)

# Log-scale copies of target and prediction for the *_log metrics.
y_test_log = np.log1p(y_test)
# Squared-error regression output is unbounded, so a prediction may be
# negative; log1p of a value <= -1 is NaN/-inf and would break the metrics.
# Clip at 0 for the log transform only (assumes the target is non-negative --
# confirm); pred_real itself is left untouched for the real-scale metrics.
pred_log = np.log1p(np.clip(pred_real, 0, None))

def smape(a, f):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |f - a| / (|a| + |f|)``. A position where both the
    actual and the forecast value are zero is a perfect prediction and is
    counted as zero error instead of producing 0/0 = NaN.

    Args:
        a: Actual values (array-like of numbers).
        f: Forecast values (array-like, broadcastable against ``a``).

    Returns:
        The mean of the per-element terms multiplied by 100.
    """
    actual = np.asarray(a, dtype=float)
    forecast = np.asarray(f, dtype=float)

    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0.0 pre-filled in `out` (both values are zero there).
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 * np.mean(terms)

# Errors on the log1p scale (target and prediction both log-transformed).
MAE_log, MSE_log, R2_log = (
    score(y_test_log, pred_log)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

# Errors on the original scale of the target.
# NOTE: scikit-learn's MAPE is a fraction, while smape() multiplies by 100,
# so MAPE_real and SMAPE_real are reported in different units.
MAE_real, MSE_real, RMSE_real, MAPE_real, SMAPE_real = (
    score(y_test, pred_real)
    for score in (
        mean_absolute_error,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_percentage_error,
        smape,
    )
)

# Collected in reporting order: log-scale metrics first, then real-scale.
metrics = dict(
    MAE_log=MAE_log,
    MSE_log=MSE_log,
    R2_log=R2_log,
    MAE_real=MAE_real,
    MSE_real=MSE_real,
    RMSE_real=RMSE_real,
    MAPE_real=MAPE_real,
    SMAPE_real=SMAPE_real,
)

print("\nМетрики 2023:")
print("\n".join(f"{name}: {value}" for name, value in metrics.items()))


[I 2025-11-28 09:43:58,696] A new study created in memory with name: no-name-14b8523c-ba1a-409f-9eaa-c0959021eeb1
[I 2025-11-28 09:44:00,261] Trial 0 finished with value: 83601.3022147342 and parameters: {'n_estimators': 896, 'max_depth': 4, 'learning_rate': 0.07799497637802903, 'subsample': 0.769048557637041, 'colsample_bytree': 0.856046156054606, 'min_child_weight': 9}. Best is trial 0 with value: 83601.3022147342.
[I 2025-11-28 09:44:02,585] Trial 1 finished with value: 77769.86399939658 and parameters: {'n_estimators': 533, 'max_depth': 14, 'learning_rate': 0.1768563624819698, 'subsample': 0.8650135057785586, 'colsample_bytree': 0.8386000133840602, 'min_child_weight': 8}. Best is trial 1 with value: 77769.86399939658.
[I 2025-11-28 09:44:05,231] Trial 2 finished with value: 88636.68135369793 and parameters: {'n_estimators': 709, 'max_depth': 14, 'learning_rate': 0.15422726603236853, 'subsample': 0.8279508364454803, 'colsample_bytree': 0.7583213820886145, 'min_child_weight': 10}. Be


Лучшие параметры: {'n_estimators': 716, 'max_depth': 5, 'learning_rate': 0.17129797787380185, 'subsample': 0.7042492728600979, 'colsample_bytree': 0.8231066254848131, 'min_child_weight': 5}

Метрики 2023:
MAE_log: 0.14847598086269756
MSE_log: 0.0438501490810937
R2_log: 0.9675407386900637
MAE_real: 53295.69724935345
MSE_real: 20228007217.75302
RMSE_real: 142225.19895487235
MAPE_real: 0.16445750117959476
SMAPE_real: 14.688235656157426
