In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
    mean_absolute_percentage_error,
)
import optuna

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) /
                         (np.abs(y_true) + np.abs(y_pred)))

df = pd.read_excel("dataset_vrp/2014-2023_lags_EWM_targets_train.xlsx")

target_col = "log_target_next_year"

exclude_cols = [
    "delta_target",
    "delta_target_percent",
    "target_next_year",
    "region",
    "district",
    "year",
]

num_cols = [col for col in df.select_dtypes("number").columns if col not in (exclude_cols + [target_col])]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

train_df = df[df["year"] < 2023].copy()
test_df = df[df["year"] == 2023].copy()

X_train = train_df.drop(columns=exclude_cols)
y_train = train_df[target_col]
X_test = test_df.drop(columns=exclude_cols)
y_test = test_df[target_col]

cat_features = []


def objective(trial):
    params = {
        "loss_function": "MAE",
        "eval_metric": "MAE",
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 5.0),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": False,
    }

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    test_pool = Pool(X_test, y_test, cat_features=cat_features)

    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=test_pool, verbose=False)

    pred = model.predict(X_test)

    return mean_absolute_error(y_test, pred)


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Лучшие параметры: {best_params}")

best_params["loss_function"] = "MAE"
best_params["eval_metric"] = "MAE"
best_params["verbose"] = False

model = CatBoostRegressor(**best_params)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

model.fit(train_pool, eval_set=test_pool, verbose=False)

pred_log = model.predict(X_test)

y_test_real = np.exp(y_test)
pred_real = np.exp(pred_log)

metrics = {
    "MAE_log": mean_absolute_error(y_test, pred_log),
    "MSE_log": mean_squared_error(y_test, pred_log),
    "R2_log": r2_score(y_test, pred_log),
    "MAE_real": mean_absolute_error(y_test_real, pred_real),
    "MSE_real": mean_squared_error(y_test_real, pred_real),
    "RMSE_real": root_mean_squared_error(y_test_real, pred_real),
    "MAPE_real": mean_absolute_percentage_error(y_test_real, pred_real),
    "SMAPE_real": smape(y_test_real, pred_real),
}

print("\nМетрики 2023:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

[I 2025-11-29 16:28:17,354] A new study created in memory with name: no-name-4856de8c-4f66-4166-9153-daed2964bb1d
[I 2025-11-29 16:28:39,185] Trial 0 finished with value: 0.12413535847621832 and parameters: {'iterations': 911, 'learning_rate': 0.05023719615498771, 'depth': 10, 'l2_leaf_reg': 7.630227902069575, 'bagging_temperature': 1.4965097110463532, 'random_strength': 2.6232142968854917, 'border_count': 70}. Best is trial 0 with value: 0.12413535847621832.
[I 2025-11-29 16:28:39,618] Trial 1 finished with value: 0.1731119780504713 and parameters: {'iterations': 135, 'learning_rate': 0.04112427617427776, 'depth': 5, 'l2_leaf_reg': 3.002389488382696, 'bagging_temperature': 2.895335239965997, 'random_strength': 3.4225801311395676, 'border_count': 244}. Best is trial 0 with value: 0.12413535847621832.
[I 2025-11-29 16:29:48,193] Trial 2 finished with value: 0.1629081248285871 and parameters: {'iterations': 897, 'learning_rate': 0.14160894102776717, 'depth': 10, 'l2_leaf_reg': 7.99408147

Лучшие параметры: {'iterations': 555, 'learning_rate': 0.050880229095714755, 'depth': 4, 'l2_leaf_reg': 2.3178497919472854, 'bagging_temperature': 1.7319506039823145, 'random_strength': 0.10708541388236552, 'border_count': 45}

Метрики 2023:
MAE_log: 0.0630
MSE_log: 0.0098
R2_log: 0.9927
MAE_real: 81596.9731
MSE_real: 146717196216.8564
RMSE_real: 383036.8079
MAPE_real: 0.0620
SMAPE_real: 6.2745


: 