In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    mean_absolute_error, root_mean_squared_error,
    r2_score, mean_absolute_percentage_error
)
import xgboost as xgb
import optuna

# Load the prepared dataset; the first CSV column is used as the index.
df = pd.read_csv("dataset_vrp/ready_dataset.csv", index_col=0)
# Drop every row that has a missing value in any column, then renumber rows.
df = df.dropna().reset_index(drop=True)

# Categorical columns that are replaced by integer codes before modelling.
cat_features = ['Округ', 'Регион']

# One fitted encoder per categorical column, kept so that other data can be
# mapped onto the same integer codes later.
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    # Fit on string values: the evaluation cell applies these encoders to
    # `.astype(str)` columns, so the fitted classes must be strings as well.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Columns used as model inputs (the two encoded categorical columns first).
features = [
    'Округ', 'Регион',
    'Инвестиции в основной капитал в млн.руб',
    'Численность населения',
    'Уровень безработицы от 15 д 72 лет, %',
    'Средняя ЗП в руб',
    'Оборот розничной торговли, млн.руб.',
    'Денежные доходы - всего, млн.руб.',
    'Внутренние текущие затраты на научные исследования и разработки, млн.руб',
    'Среднегодовая ставка ЦБ, %',
    'Годовая инфляция, %',
    'Курс Доллара к Рублю, ₽',
    'Цена фьючерса на нефть Brent, $',
    'Цена фьючерса на золото, $'
]

X = df[features]
y = df['ВРП']  # regression target

# Hold out 20% of the rows for validation; the seed keeps the split stable.
# NOTE(review): the split is random over rows and the same hold-out set is
# used both for hyperparameter search and for the reported validation
# metrics -- confirm this is the intended evaluation scheme.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial):
    """Optuna objective: fit one XGBoost regressor and score it on the hold-out set.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        Mean absolute error of the fitted model on (X_valid, y_valid).
    """
    # Settings that are identical for every trial.
    fixed = {
        "objective": "reg:squarederror",
        "eval_metric": "mae",
        "tree_method": "hist",
    }
    # Hyperparameters drawn for this particular trial.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
    }

    regressor = xgb.XGBRegressor(**fixed, **sampled, random_state=42)
    # The hold-out set is passed as eval_set; per-round logging is silenced.
    regressor.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    return mean_absolute_error(y_valid, regressor.predict(X_valid))

# Seed the sampler so the hyperparameter search is repeatable from run to
# run, matching the fixed random_state used for the split and the models.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("\nЛучшие параметры:")
print(study.best_params)

# study.best_params only holds the sampled values, so the fixed settings used
# inside objective() are added back. tree_method is included so that the
# final model is trained with exactly the configuration that was tuned
# instead of whatever the installed XGBoost version uses by default.
best_params = {
    **study.best_params,
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "tree_method": "hist",
    "random_state": 42,
}

# Refit with the best configuration; log the validation MAE every 100 rounds.
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=100)


[I 2025-11-16 12:38:57,909] A new study created in memory with name: no-name-38a220c9-2b82-459c-969b-bdb4162894a4


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-16 12:38:58,457] Trial 0 finished with value: 264157.5364992559 and parameters: {'learning_rate': 0.06502357198462712, 'max_depth': 6, 'n_estimators': 682, 'subsample': 0.9404982143216674, 'colsample_bytree': 0.937668659658999, 'lambda': 0.001550984743906804, 'alpha': 0.0069650993246101025, 'min_child_weight': 9.847547895246464}. Best is trial 0 with value: 264157.5364992559.
[I 2025-11-16 12:38:58,833] Trial 1 finished with value: 327554.1697767857 and parameters: {'learning_rate': 0.013615514200435662, 'max_depth': 4, 'n_estimators': 545, 'subsample': 0.6882403608968239, 'colsample_bytree': 0.9688708857008639, 'lambda': 0.0015000903649535569, 'alpha': 0.002304414078442612, 'min_child_weight': 5.222317122139033}. Best is trial 0 with value: 264157.5364992559.
[I 2025-11-16 12:38:59,589] Trial 2 finished with value: 223613.2917113095 and parameters: {'learning_rate': 0.06395955393251057, 'max_depth': 10, 'n_estimators': 719, 'subsample': 0.6526178742977615, 'colsample_bytree

In [6]:
# Score the tuned model on the hold-out split.
y_pred = best_model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mape = mean_absolute_percentage_error(y_valid, y_pred) * 100  # fraction -> percent
r2 = r2_score(y_valid, y_pred)
rmse = root_mean_squared_error(y_valid, y_pred)

print("\nМетрики на валидации:")
print(f"R²: {r2:.4f}")
print(f"MAE: {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"MAPE: {mape:.2f} %")

# Score the same model on the separate 2024 file.
df_2024 = pd.read_csv("dataset_vrp/2024.csv")
# Drop rows that miss any model input or the target: a row with a missing
# "ВРП" cannot be scored and would make the metric functions raise.
df_2024 = df_2024.dropna(subset=features + ["ВРП"]).reset_index(drop=True)

# Reuse the encoders fitted on the training data so the integer codes match.
# transform() raises ValueError for a label that was not seen during fitting.
for col in cat_features:
    df_2024[col] = label_encoders[col].transform(df_2024[col].astype(str))

X_2024 = df_2024[features]
y_true_2024 = df_2024["ВРП"]

y_pred_2024 = best_model.predict(X_2024)

mae_2024 = mean_absolute_error(y_true_2024, y_pred_2024)
mape_2024 = mean_absolute_percentage_error(y_true_2024, y_pred_2024) * 100
r2_2024 = r2_score(y_true_2024, y_pred_2024)
rmse_2024 = root_mean_squared_error(y_true_2024, y_pred_2024)

print("\nМетрики на 2024:")
print(f"R²: {r2_2024:.4f}")
print(f"MAE: {mae_2024:,.2f}")
print(f"RMSE: {rmse_2024:,.2f}")
print(f"MAPE: {mape_2024:.2f}%")


Метрики на валидации:
R²: 0.9862
MAE: 199,079.07
RMSE: 464,165.45
MAPE: 26.55 %

Метрики на 2024:
R²: 0.9946
MAE: 91,144.50
RMSE: 225,155.84
MAPE: 14.16%


## GradientBoosting

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import (
    mean_absolute_error, root_mean_squared_error,
    r2_score, mean_absolute_percentage_error
)
import optuna

# Reload the prepared dataset from disk so this section starts from
# un-encoded data; the first CSV column is used as the index.
df = pd.read_csv("dataset_vrp/ready_dataset.csv", index_col=0)
# Drop every row that has a missing value in any column, then renumber rows.
df = df.dropna().reset_index(drop=True)

# Categorical columns that are replaced by integer codes before modelling.
cat_features = ['Округ', 'Регион']
# One fitted encoder per categorical column, kept for encoding other data.
label_encoders = {}
for col in cat_features:
    le = LabelEncoder()
    # Fit on string values: the evaluation cell applies these encoders to
    # `.astype(str)` columns, so the fitted classes must be strings as well.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Columns used as model inputs for the gradient boosting model.
features = [
    'Округ', 'Регион',
    'Инвестиции в основной капитал в млн.руб',
    'Численность населения',
    'Средняя ЗП в руб',
    'Оборот розничной торговли, млн.руб.',
    'Денежные доходы - всего, млн.руб.',
    'Внутренние текущие затраты на научные исследования и разработки, млн.руб',
    'Среднегодовая ставка ЦБ, %',
    'Годовая инфляция, %',
    'Уровень безработицы от 15 д 72 лет, %'
]

X = df[features]
y = df['ВРП']  # regression target

# Hold out 20% of the rows for validation; the seed keeps the split stable.
# NOTE(review): the same hold-out set is used for hyperparameter search and
# for the reported validation metrics -- confirm this is intended.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial):
    """Optuna objective: fit one GradientBoostingRegressor and score it on the hold-out set.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        Mean absolute error of the fitted model on (X_valid, y_valid).
    """
    # Hyperparameters drawn for this particular trial.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 8),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
    }

    # Loss and seed are the same for every trial.
    regressor = GradientBoostingRegressor(
        **sampled, loss="squared_error", random_state=42
    )
    regressor.fit(X_train, y_train)

    return mean_absolute_error(y_valid, regressor.predict(X_valid))

# Seed the sampler so the hyperparameter search is repeatable from run to
# run, matching the fixed random_state used for the split and the models.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200, show_progress_bar=True)

print("\nЛучшие параметры:")
print(study.best_params)


[I 2025-11-16 12:41:25,562] A new study created in memory with name: no-name-88cd6144-8c74-4426-a145-9a005926b8fc


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-11-16 12:41:26,294] Trial 0 finished with value: 223524.86969696893 and parameters: {'n_estimators': 1016, 'learning_rate': 0.0736386472268045, 'max_depth': 4, 'min_samples_split': 6, 'min_samples_leaf': 1, 'subsample': 0.8056831159456949, 'max_features': 'log2'}. Best is trial 0 with value: 223524.86969696893.
[I 2025-11-16 12:41:27,358] Trial 1 finished with value: 274671.2563153071 and parameters: {'n_estimators': 1056, 'learning_rate': 0.11873741461292919, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 4, 'subsample': 0.9743453298302184, 'max_features': 'log2'}. Best is trial 0 with value: 223524.86969696893.
[I 2025-11-16 12:41:28,112] Trial 2 finished with value: 319320.92043171386 and parameters: {'n_estimators': 523, 'learning_rate': 0.06013258488259772, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 3, 'subsample': 0.9368487323172419, 'max_features': None}. Best is trial 0 with value: 223524.86969696893.
[I 2025-11-16 12:41:28,876] Trial 3 fi

In [8]:
# Refit with the best sampled hyperparameters plus the fixed loss and seed
# that were used inside objective().
best_model = GradientBoostingRegressor(
    **study.best_params,
    loss="squared_error",
    random_state=42
)
best_model.fit(X_train, y_train)

# Score the tuned model on the hold-out split.
y_pred = best_model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mape = mean_absolute_percentage_error(y_valid, y_pred) * 100  # fraction -> percent
r2 = r2_score(y_valid, y_pred)
rmse = root_mean_squared_error(y_valid, y_pred)

print("\nМетрики на валидации:")
print(f"R²: {r2:.4f}")
print(f"MAE: {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"MAPE: {mape:.2f} %")

# Score the same model on the separate 2024 file.
df_2024 = pd.read_csv("dataset_vrp/2024.csv")
# Drop rows that miss any model input or the target: a row with a missing
# "ВРП" cannot be scored and would make the metric functions raise.
df_2024 = df_2024.dropna(subset=features + ["ВРП"]).reset_index(drop=True)

# Reuse the encoders fitted on the training data so the integer codes match.
# transform() raises ValueError for a label that was not seen during fitting.
for col in cat_features:
    df_2024[col] = label_encoders[col].transform(df_2024[col].astype(str))

X_2024 = df_2024[features]
y_true_2024 = df_2024["ВРП"]

y_pred_2024 = best_model.predict(X_2024)

# The `_g` suffix keeps these values separate from the `*_2024` metrics
# computed for the XGBoost model in an earlier cell.
mae_2024_g = mean_absolute_error(y_true_2024, y_pred_2024)
mape_2024_g = mean_absolute_percentage_error(y_true_2024, y_pred_2024) * 100
r2_2024_g = r2_score(y_true_2024, y_pred_2024)
rmse_2024_g = root_mean_squared_error(y_true_2024, y_pred_2024)

print("\nМетрики на 2024.csv:")
print(f"R²: {r2_2024_g:.4f}")
print(f"MAE: {mae_2024_g:,.2f}")
print(f"RMSE: {rmse_2024_g:,.2f}")
print(f"MAPE: {mape_2024_g:.2f} %")


Метрики на валидации:
R²: 0.9908
MAE: 163,530.05
RMSE: 378,190.55
MAPE: 14.31 %

Метрики на 2024.csv:
R²: 0.9969
MAE: 36,597.02
RMSE: 169,700.35
MAPE: 3.69 %
