In [1]:
import pandas as pd
import optuna
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, root_mean_squared_error


## RandomForest

In [2]:
df = pd.read_csv("dataset_vrp/ready_dataset.csv", index_col=0)

target = 'ВРП'

features = [
    'Округ',
    'Регион',
    'Инвестиции в основной капитал в млн.руб',
    'Численность населения',
    'Уровень безработицы от 15 д 72 лет, %',
    'Средняя ЗП в руб',
    'Оборот розничной торговли, млн.руб.',
    'Денежные доходы - всего, млн.руб.',
    'Внутренние текущие затраты на научные исследования и разработки, млн.руб',
    'Среднегодовая ставка ЦБ, %',
    'Годовая инфляция, %',
    'Курс Доллара к Рублю, ₽',
    'Цена фьючерса на нефть Brent, $',
    'Цена фьючерса на золото, $'
]

for col in ['Округ', 'Регион']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

X = df[features]
y = df[target]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

cat_features = ['Округ', 'Регион']

df_2024 = pd.read_csv("dataset_vrp/2024.csv")


df_2024 = df_2024.dropna(subset=features).reset_index(drop=True)

for col in ['Округ', 'Регион']:
    le = LabelEncoder()   
    df_2024[col] = le.fit_transform(df[col])

y_true_2024 = df_2024["ВРП"]

X_test_2024 = df_2024[features]



In [3]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "random_state": 42,
        "n_jobs": -1,
    }

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pred)

    return mae

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-11-16 12:01:24,109] A new study created in memory with name: no-name-060f6401-409c-43b5-bae8-a360a45210ad


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-16 12:01:24,359] Trial 0 finished with value: 378233.6007945742 and parameters: {'n_estimators': 133, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 378233.6007945742.
[I 2025-11-16 12:01:24,697] Trial 1 finished with value: 539421.7079922021 and parameters: {'n_estimators': 455, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 378233.6007945742.
[I 2025-11-16 12:01:25,424] Trial 2 finished with value: 556596.3453715261 and parameters: {'n_estimators': 698, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 378233.6007945742.
[I 2025-11-16 12:01:26,032] Trial 3 finished with value: 629889.9601018395 and parameters: {'n_estimators': 567, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'log2',

In [4]:
print("Лучшие параметры RandomForestRegressor:")
for key, value in study.best_params.items():
    print(f"{key}: {value}")

best_model = RandomForestRegressor(**study.best_params)
best_model.fit(X_train, y_train)
y_pred_2024 = best_model.predict(X_test_2024)


r2 = r2_score(y_true_2024, y_pred_2024)
mae = mean_absolute_error(y_true_2024, y_pred_2024)
mape = mean_absolute_percentage_error(y_true_2024, y_pred_2024)
rmse = root_mean_squared_error(y_true_2024, y_pred_2024)
print("\nТест на 2024")
print("R²:", f"{r2:.4f}")
print("MAE:", f"{mae:.4f}")
print(f"RMSE: " f"{rmse:.4f}")
print("MAPE:", f"{mape:.2%}")

Лучшие параметры RandomForestRegressor:
n_estimators: 491
max_depth: 18
min_samples_split: 3
min_samples_leaf: 1
max_features: log2
bootstrap: False

Тест на 2024
R²: 0.9759
MAE: 152304.4380
RMSE: 474202.9551
MAPE: 13.50%


## CatBoost

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    r2_score, mean_absolute_percentage_error
)
from catboost import CatBoostRegressor
import optuna

df = pd.read_csv("dataset_vrp/ready_dataset.csv", index_col=0)
df = df.dropna().reset_index(drop=True)

cat_features = ['Округ', 'Регион']

features = [
    'Округ', 'Регион',
    'Инвестиции в основной капитал в млн.руб',
    'Численность населения',
    'Уровень безработицы от 15 д 72 лет, %',
    'Средняя ЗП в руб',
    'Оборот розничной торговли, млн.руб.',
    'Денежные доходы - всего, млн.руб.',
    'Внутренние текущие затраты на научные исследования и разработки, млн.руб',
    'Среднегодовая ставка ЦБ, %',
    'Годовая инфляция, %',
    'Курс Доллара к Рублю, ₽',
    'Цена фьючерса на нефть Brent, $',
    'Цена фьючерса на золото, $'
]

X = df[features]
y = df['ВРП']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial):
    params = {
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "iterations": trial.suggest_int("iterations", 500, 1500),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "rsm": trial.suggest_float("rsm", 0.5, 1.0),
        "loss_function": "MAE",
        "eval_metric": "MAE",
        "random_seed": 42,
        "verbose": 0
    }

    model = CatBoostRegressor(**params)
    model.fit(
        X_train, y_train,
        cat_features=cat_features,
        eval_set=(X_valid, y_valid),
        verbose=0
    )
    
    y_pred = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pred)
    return mae

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nЛучшие параметры:")
print(study.best_params)


[I 2025-11-16 12:03:05,351] A new study created in memory with name: no-name-162231a0-107a-4d43-8ebb-275d1f28cb5b


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-16 12:03:50,073] Trial 0 finished with value: 338158.8751439716 and parameters: {'depth': 7, 'learning_rate': 0.0499582582721463, 'iterations': 1092, 'l2_leaf_reg': 5.722383621984964, 'bagging_temperature': 0.3701329038596327, 'rsm': 0.7125328233316897}. Best is trial 0 with value: 338158.8751439716.
[I 2025-11-16 12:04:45,428] Trial 1 finished with value: 304599.650010423 and parameters: {'depth': 8, 'learning_rate': 0.16208892186845164, 'iterations': 1103, 'l2_leaf_reg': 5.958036572390518, 'bagging_temperature': 0.4647126346406696, 'rsm': 0.7715361285871508}. Best is trial 1 with value: 304599.650010423.
[I 2025-11-16 12:05:53,577] Trial 2 finished with value: 436640.4330582768 and parameters: {'depth': 8, 'learning_rate': 0.07952182601739817, 'iterations': 1406, 'l2_leaf_reg': 1.4561051740773179, 'bagging_temperature': 0.11763526273419533, 'rsm': 0.9728441192337458}. Best is trial 1 with value: 304599.650010423.
[I 2025-11-16 12:06:10,715] Trial 3 finished with value: 374

In [6]:
best_params = study.best_params
best_model = CatBoostRegressor(
    **best_params,
    loss_function="MAE",
    random_seed=42,
    verbose=100
)

best_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_valid, y_valid)
)

y_pred = best_model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mape = mean_absolute_percentage_error(y_valid, y_pred) * 100
r2 = r2_score(y_valid, y_pred)
rmse = root_mean_squared_error(y_valid, y_pred)

print("\nМетрики на валидации:")
print(f"R²: {r2:.4f}")
print(f"MAE: {mae:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"MAPE: {mape:.2f} %")


0:	learn: 1049728.4323181	test: 1323259.2473816	best: 1323259.2473816 (0)	total: 30.3ms	remaining: 31.4s
100:	learn: 248137.6307559	test: 459675.6805713	best: 459675.6805713 (100)	total: 3.85s	remaining: 35.7s
200:	learn: 117698.9612776	test: 291763.1544253	best: 291763.1544253 (200)	total: 7.71s	remaining: 32.1s
300:	learn: 87891.0827590	test: 265337.3149904	best: 264825.7760232 (295)	total: 11.5s	remaining: 28.1s
400:	learn: 68378.9287179	test: 227723.5480058	best: 227692.6166047 (398)	total: 15.4s	remaining: 24.4s
500:	learn: 56395.4208120	test: 211765.0752558	best: 210176.2050290 (465)	total: 19.3s	remaining: 20.7s
600:	learn: 49258.1358801	test: 209338.2652120	best: 209338.2652120 (600)	total: 23.1s	remaining: 16.8s
700:	learn: 41789.9932762	test: 204555.2739071	best: 203856.2403637 (681)	total: 27.1s	remaining: 13s
800:	learn: 37559.2446519	test: 201502.8384854	best: 201437.6217230 (769)	total: 31s	remaining: 9.12s
900:	learn: 33355.5125482	test: 197793.1820701	best: 197721.07002

In [7]:
df_2024 = pd.read_csv("dataset_vrp/2024.csv")
df_2024 = df_2024.dropna(subset=features).reset_index(drop=True)

for col in cat_features:
    df_2024[col] = df_2024[col].astype(str)
    X[col] = X[col].astype(str)

X_2024 = df_2024[features]
y_true_2024 = df_2024["ВРП"]

y_pred_2024 = best_model.predict(X_2024)

mae_2024 = mean_absolute_error(y_true_2024, y_pred_2024)
mape_2024 = mean_absolute_percentage_error(y_true_2024, y_pred_2024) * 100
r2_2024 = r2_score(y_true_2024, y_pred_2024)
rmse_2024 = root_mean_squared_error(y_true_2024, y_pred_2024)

print("\nМетрики на 2024.csv:")
print(f"R²: {r2_2024:.4f}")
print(f"MAE: {mae_2024:,.2f}")
print(f"RMSE: {rmse_2024:,.2f}")
print(f"MAPE: {mape_2024:.2f} %")


Метрики на 2024.csv:
R²: 0.9824
MAE: 99,745.43
RMSE: 404,853.73
MAPE: 6.29 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(str)
