In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, explained_variance_score
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.sklearn
import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("financial_regression.csv")
df = df.dropna(subset=['gold close'])
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['dayofweek'] = df['date'].dt.dayofweek
df['day'] = df['date'].dt.day
df = df.set_index(df.date)
df = df.drop(columns=['date'])

from statsmodels.tsa.stattools import adfuller

numeric_columns = df.select_dtypes(include=['number']).columns
stationary_df = pd.DataFrame(index=df.index)

In [3]:
print("Проверка на стационарность (ADF тест):")
for column in numeric_columns:
    series = df[column]
    result = adfuller(series.dropna(), autolag='AIC')
    p_value = result[1]
    print(f"{column}: p-value = {p_value:.4f}")
    if p_value > 0.05:
        stationary_df[column] = series.diff().fillna(0)
        print(f"    -> {column} нестационарен, применена первая разность.")
    else:
        stationary_df[column] = series
        print(f"    -> {column} стационарен.")

df = stationary_df

Проверка на стационарность (ADF тест):
sp500 open: p-value = 0.9941
    -> sp500 open нестационарен, применена первая разность.
sp500 high: p-value = 0.9968
    -> sp500 high нестационарен, применена первая разность.
sp500 low: p-value = 0.9948
    -> sp500 low нестационарен, применена первая разность.
sp500 close: p-value = 0.9960
    -> sp500 close нестационарен, применена первая разность.
sp500 volume: p-value = 0.0001
    -> sp500 volume стационарен.
sp500 high-low: p-value = 0.0001
    -> sp500 high-low стационарен.
nasdaq open: p-value = 0.9964
    -> nasdaq open нестационарен, применена первая разность.
nasdaq high: p-value = 0.9976
    -> nasdaq high нестационарен, применена первая разность.
nasdaq low: p-value = 0.9974
    -> nasdaq low нестационарен, применена первая разность.
nasdaq close: p-value = 0.9974
    -> nasdaq close нестационарен, применена первая разность.
nasdaq volume: p-value = 0.0000
    -> nasdaq volume стационарен.
nasdaq high-low: p-value = 0.0454
    -> na

In [4]:
for col in df.columns:
    if col != 'gold close':
        df[f'{col}_lag1'] = df[col].shift(1)
        df[f'{col}_lag2'] = df[col].shift(2)
        df[f'{col}_roll_mean3'] = df[col].rolling(window=3).mean()
        df[f'{col}_roll_mean7'] = df[col].rolling(window=7).mean()

df['gold_close_lag1'] = df['gold close'].shift(1)
df['gold_close_lag2'] = df['gold close'].shift(2)
df['gold_close_roll_mean3'] = df['gold close'].rolling(window=3).mean()
df['gold_close_roll_mean7'] = df['gold close'].rolling(window=7).mean()

df = df.dropna()

In [5]:
pt = PowerTransformer(method='yeo-johnson', standardize=True)
X = df.drop(columns=['gold close'])
y = df['gold close']
X_transformed = pt.fit_transform(X)
X = pd.DataFrame(X_transformed, columns=X.columns)

In [17]:
def calc_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    n = len(y_true)
    p = X.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    medae = median_absolute_error(y_true, y_pred)
    ev = explained_variance_score(y_true, y_pred)
    mean_median_diff = np.mean(y_pred) - np.median(y_pred)
    return {
        "MSE": mse,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "Adjusted R2": adj_r2,
        "MAPE": mape,
        "MedAE": medae,
        "Explained Variance": ev,
        "Mean - Median": mean_median_diff
    }

In [7]:
def objective_catboost(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_seed": 42,
        "verbose": 0
    }
    tscv = TimeSeriesSplit(n_splits=5)
    rmses = []
    for train_idx, val_idx in tscv.split(X):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostRegressor(**params)
        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_val_fold)
        mse = mean_squared_error(y_val_fold, preds)
        rmse = np.sqrt(mse)
        rmses.append(rmse)
    return np.mean(rmses)

In [8]:
def objective_lgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 15, 100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "random_state": 42,
        "n_jobs": -1
    }
    tscv = TimeSeriesSplit(n_splits=5)
    rmses = []
    for train_idx, val_idx in tscv.split(X):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model = LGBMRegressor(**params)
        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_val_fold)
        mse = mean_squared_error(y_val_fold, preds)
        rmse = np.sqrt(mse)
        rmses.append(rmse)
    return np.mean(rmses)

In [9]:
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "random_state": 42,
        "n_jobs": -1,
        "objective": "reg:squarederror"
    }
    tscv = TimeSeriesSplit(n_splits=5)
    rmses = []
    for train_idx, val_idx in tscv.split(X):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(**params)
        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_val_fold)
        mse = mean_squared_error(y_val_fold, preds)
        rmse = np.sqrt(mse)
        rmses.append(rmse)
    return np.mean(rmses)

In [10]:
print("Оптимизация CatBoost...")
study_catboost = optuna.create_study(direction="minimize")
study_catboost.optimize(objective_catboost, n_trials=2)
print("Лучшие параметры CatBoost:", study_catboost.best_params)

[I 2025-05-17 20:59:47,989] A new study created in memory with name: no-name-f2ed642a-61c7-46e4-aaf1-d9333fbc33b2


Оптимизация CatBoost...


[I 2025-05-17 21:00:10,693] Trial 0 finished with value: 0.4460040755705578 and parameters: {'iterations': 762, 'depth': 4, 'learning_rate': 0.09670270339074258, 'l2_leaf_reg': 8.853190848918048}. Best is trial 0 with value: 0.4460040755705578.
[I 2025-05-17 21:03:49,704] Trial 1 finished with value: 0.5790778054544201 and parameters: {'iterations': 706, 'depth': 8, 'learning_rate': 0.2939881209110717, 'l2_leaf_reg': 2.7896163271360246}. Best is trial 0 with value: 0.4460040755705578.


Лучшие параметры CatBoost: {'iterations': 762, 'depth': 4, 'learning_rate': 0.09670270339074258, 'l2_leaf_reg': 8.853190848918048}


In [11]:
print("Оптимизация LightGBM...")
study_lgbm = optuna.create_study(direction="minimize")
study_lgbm.optimize(objective_lgbm, n_trials=2)
print("Лучшие параметры LightGBM:", study_lgbm.best_params)

[I 2025-05-17 21:03:49,716] A new study created in memory with name: no-name-8274fba6-c86c-4071-9571-2c96856dc22d


Оптимизация LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40992
[LightGBM] [Info] Number of data points in the train set: 574, number of used features: 229
[LightGBM] [Info] Start training from score 0.103354
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55082
[LightGBM] [Info] Number of data points in the train set: 1147, number of used features: 230
[LightGBM] [Info] Start training from score 0.013094
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55089
[LightGBM] [Info] Number of data points in the train set: 1720, number of used features: 230
[

[I 2025-05-17 21:03:55,322] Trial 0 finished with value: 0.5322472861007197 and parameters: {'n_estimators': 732, 'max_depth': 12, 'learning_rate': 0.29684058809807423, 'num_leaves': 63, 'subsample': 0.8280059090247472, 'colsample_bytree': 0.8147813147282634, 'reg_alpha': 0.6806475857372513, 'reg_lambda': 0.5061731798240263}. Best is trial 0 with value: 0.5322472861007197.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40992
[LightGBM] [Info] Number of data points in the train set: 574, number of used features: 229
[LightGBM] [Info] Start training from score 0.103354
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55082
[LightGBM] [Info] Number of data points in the train set: 1147, number of used features: 230
[LightGBM] [Info] Start training from score 0.013094
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55089
[LightGBM] [Info] Number of data points in the train set: 1720, number of used features: 230
[LightGBM] [Info] Start t

[I 2025-05-17 21:04:03,853] Trial 1 finished with value: 0.5195764930019174 and parameters: {'n_estimators': 586, 'max_depth': 11, 'learning_rate': 0.16125961163276858, 'num_leaves': 48, 'subsample': 0.6118536433613457, 'colsample_bytree': 0.8299319153068697, 'reg_alpha': 0.5468495990987878, 'reg_lambda': 0.7412871948476571}. Best is trial 1 with value: 0.5195764930019174.


Лучшие параметры LightGBM: {'n_estimators': 586, 'max_depth': 11, 'learning_rate': 0.16125961163276858, 'num_leaves': 48, 'subsample': 0.6118536433613457, 'colsample_bytree': 0.8299319153068697, 'reg_alpha': 0.5468495990987878, 'reg_lambda': 0.7412871948476571}


In [12]:
print("Оптимизация XGBoost...")
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=2)
print("Лучшие параметры XGBoost:", study_xgb.best_params)

[I 2025-05-17 21:04:03,868] A new study created in memory with name: no-name-e85f558f-cf8c-4e80-99d8-3a6a31dd40d1


Оптимизация XGBoost...


[I 2025-05-17 21:04:12,792] Trial 0 finished with value: 0.5443565115328843 and parameters: {'n_estimators': 641, 'max_depth': 4, 'learning_rate': 0.28536879562200745, 'subsample': 0.9680839317997793, 'colsample_bytree': 0.8855376569596474, 'gamma': 1.8397982506398491, 'reg_alpha': 0.5556473158203624, 'reg_lambda': 0.42263017214933263}. Best is trial 0 with value: 0.5443565115328843.
[I 2025-05-17 21:04:26,019] Trial 1 finished with value: 0.5229328565940197 and parameters: {'n_estimators': 823, 'max_depth': 4, 'learning_rate': 0.04639210385797938, 'subsample': 0.8704951143897179, 'colsample_bytree': 0.9689151035126222, 'gamma': 3.5065599042081, 'reg_alpha': 0.36018289128348513, 'reg_lambda': 0.46755102229392154}. Best is trial 1 with value: 0.5229328565940197.


Лучшие параметры XGBoost: {'n_estimators': 823, 'max_depth': 4, 'learning_rate': 0.04639210385797938, 'subsample': 0.8704951143897179, 'colsample_bytree': 0.9689151035126222, 'gamma': 3.5065599042081, 'reg_alpha': 0.36018289128348513, 'reg_lambda': 0.46755102229392154}


In [13]:
final_cat = CatBoostRegressor(**study_catboost.best_params, verbose=0, random_seed=42)
final_cat.fit(X, y)

<catboost.core.CatBoostRegressor at 0x14623905f00>

In [14]:
final_lgbm = LGBMRegressor(**study_lgbm.best_params, random_state=42)
final_lgbm.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55101
[LightGBM] [Info] Number of data points in the train set: 3439, number of used features: 231
[LightGBM] [Info] Start training from score 0.036185


In [15]:

final_xgb = XGBRegressor(**study_xgb.best_params, random_state=42, objective="reg:squarederror")
final_xgb.fit(X, y)

In [18]:
tscv = TimeSeriesSplit(n_splits=5)

models = {
    'CatBoost': final_cat,
    'LightGBM': final_lgbm,
    'XGBoost': final_xgb
}

mlflow.set_experiment("Financial_Regression_TimeSeries")

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        metrics_summary = {metric: [] for metric in ["MSE", "MAE", "RMSE", "R2", "Adjusted R2", "MAPE", "MedAE", "Explained Variance", "Mean - Median"]}
        
        for train_idx, val_idx in tscv.split(X):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold)
            preds = model.predict(X_val_fold)

            fold_metrics = calc_metrics(y_val_fold, preds)
            for key, value in fold_metrics.items():
                metrics_summary[key].append(value)

        # Средние метрики по фолдам
        avg_metrics = {key: np.mean(vals) for key, vals in metrics_summary.items()}

        # Логирование параметров и метрик
        mlflow.log_params(model.get_params())
        mlflow.log_metrics(avg_metrics)

        print(f"--- {model_name} ---")
        for metric_name, metric_value in avg_metrics.items():
            print(f"{metric_name}: {metric_value:.4f}")

        # Логируем модель
        mlflow.sklearn.log_model(model, artifact_path="model")

print("Готово!")

--- CatBoost ---
MSE: 0.2130
MAE: 0.3087
RMSE: 0.4460
R2: 0.8942
Adjusted R2: 0.8127
MAPE: inf
MedAE: 0.2280
Explained Variance: 0.8949
Mean - Median: 0.0043




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40992
[LightGBM] [Info] Number of data points in the train set: 574, number of used features: 229
[LightGBM] [Info] Start training from score 0.103354
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55082
[LightGBM] [Info] Number of data points in the train set: 1147, number of used features: 230
[LightGBM] [Info] Start training from score 0.013094
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 55089
[LightGBM] [Info] Number of data points in the train set: 1720, number of used features: 230
[LightGBM] [Info] Start t



--- XGBoost ---
MSE: 0.2903
MAE: 0.3633
RMSE: 0.5229
R2: 0.8545
Adjusted R2: 0.7423
MAPE: inf
MedAE: 0.2747
Explained Variance: 0.8550
Mean - Median: 0.0025




Готово!
