In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

import optuna

# Load the prepared dataset (lag / EWM features plus several target columns).
df = pd.read_excel("dataset_vrp/2014-2023_lags_EWM_targets_train.xlsx")

# Column the model is trained to predict.
target_col = "target_next_year"

# Identifier columns and every target-derived column; none of these may be
# used as a model input.
exclude_cols = [
    "year",
    "district",
    "region",
    "target_next_year",
    "delta_target",
    "delta_target_percent",
    "log_target_next_year",
]

# Everything that is not explicitly excluded is a feature (original column
# order is preserved).
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()

# Temporal split: rows before 2023 are used for training, 2023 rows for testing.
train_df = df.loc[df["year"] < 2023].copy()
test_df = df.loc[df["year"] == 2023].copy()

X_train, y_train = train_df.loc[:, feature_cols], train_df[target_col]
X_test, y_test = test_df.loc[:, feature_cols], test_df[target_col]


def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``; the
    result is the mean of the terms multiplied by 100.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The SMAPE as a float in percent. Terms where both the observed and
        the predicted value are zero count as zero error instead of
        producing ``0 / 0 = NaN``.
    """
    # Convert to plain arrays so the comparison is positional and does not
    # depend on pandas index alignment.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = np.abs(actual) + np.abs(predicted)

    # scale == 0 only when both values are 0, i.e. a perfect prediction;
    # leave those terms at the pre-filled 0 rather than dividing by zero.
    terms = np.divide(
        2 * abs_error,
        scale,
        out=np.zeros_like(scale),
        where=scale != 0,
    )
    return 100 * np.mean(terms)


def objective(trial):
    """Optuna objective: validation MAE of a random forest for one trial.

    The hyperparameters are sampled from ``trial``. The model is fitted on
    all rows of ``train_df`` except the most recent year and scored on that
    most recent year. The held-out test set is deliberately NOT used here:
    selecting hyperparameters on the same data that is later used for the
    final metrics would make those metrics optimistically biased.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        Mean absolute error on the validation year (lower is better).

    Raises:
        ValueError: If ``train_df`` contains a single year, so no rows are
            left for fitting after the validation year is held out.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 600),
        "max_depth": trial.suggest_int("max_depth", 4, 13),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical(
            "max_features", ["sqrt", "log2", 0.5, 0.7, 1.0]
        ),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }

    # Time-ordered validation split: the latest training year plays the
    # role of the "future" during the search.
    is_validation = train_df["year"] == train_df["year"].max()
    fit_df = train_df[~is_validation]
    val_df = train_df[is_validation]
    if fit_df.empty:
        raise ValueError(
            "train_df must span at least two years to hold one out for validation"
        )

    model = RandomForestRegressor(random_state=42, **params)
    model.fit(fit_df[feature_cols], fit_df[target_col])
    pred = model.predict(val_df[feature_cols])

    return mean_absolute_error(val_df[target_col], pred)


# Seed the sampler so the hyperparameter search (and therefore best_params)
# is reproducible from run to run, in line with the fixed random_state used
# for the models. TPESampler is Optuna's default single-objective sampler,
# so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best hyperparameter set found by the search.
best_params = study.best_params
print(f"Лучшие параметры: {best_params}")


# Refit a model with the best hyperparameters on the full training set.
final_model = RandomForestRegressor(
    random_state=42,
    **best_params
)
final_model.fit(X_train, y_train)

pred_test = final_model.predict(X_test)

# Metrics on the log1p scale (compares relative rather than absolute errors).
y_test_log = np.log1p(y_test)
pred_log   = np.log1p(pred_test)

MAE_log = mean_absolute_error(y_test_log, pred_log)
MSE_log = mean_squared_error(y_test_log, pred_log)
R2_log  = r2_score(y_test_log, pred_log)

# 2023 metrics on the original scale of the target.
MAE_real  = mean_absolute_error(y_test, pred_test)
MSE_real  = mean_squared_error(y_test, pred_test)
RMSE_real = np.sqrt(MSE_real)
# scikit-learn returns MAPE as a fraction; convert it to percent so it is
# expressed in the same units as SMAPE_real and stays readable when printed
# with two decimals.
MAPE_real = 100 * mean_absolute_percentage_error(y_test, pred_test)
SMAPE_real = smape(y_test, pred_test)


# Collected in display order for the report below.
metrics = {
    "MAE_log": MAE_log,
    "MSE_log": MSE_log,
    "R2_log": R2_log,

    "MAE_real": MAE_real,
    "MSE_real": MSE_real,
    "RMSE_real": RMSE_real,
    "MAPE_real": MAPE_real,
    "SMAPE_real": SMAPE_real
}

print("\nМетрики 2023:")
for name, value in metrics.items():
    print(f"{name}: {value:.2f}")


[I 2025-11-28 09:16:48,074] A new study created in memory with name: no-name-84216093-03ac-4a48-a854-8f4a0b11f2bc
[I 2025-11-28 09:16:48,508] Trial 0 finished with value: 136879.2051273537 and parameters: {'n_estimators': 136, 'max_depth': 12, 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 136879.2051273537.
[I 2025-11-28 09:16:52,785] Trial 1 finished with value: 97305.13512362762 and parameters: {'n_estimators': 308, 'max_depth': 7, 'min_samples_split': 13, 'min_samples_leaf': 7, 'max_features': 0.7, 'bootstrap': True}. Best is trial 1 with value: 97305.13512362762.
[I 2025-11-28 09:16:53,110] Trial 2 finished with value: 137257.074101282 and parameters: {'n_estimators': 110, 'max_depth': 5, 'min_samples_split': 12, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 97305.13512362762.
[I 2025-11-28 09:16:56,842] Trial 3 finished with value: 78769.0634722166 and paramet

Лучшие параметры: {'n_estimators': 216, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.7, 'bootstrap': True}

Метрики 2023:
MAE_log: 0.12
MSE_log: 0.03
R2_log: 0.98
MAE_real: 68463.62
MSE_real: 90063669566.07
RMSE_real: 300106.10
MAPE_real: 0.13
SMAPE_real: 12.31
