In [1]:
import os, json, time, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from pmdarima import auto_arima
import time, joblib

import mlflow
import mlflow.sklearn
from dotenv import load_dotenv
load_dotenv()

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

# MLflow трекинг (тот же URI, что использует API)
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://84.201.144.227:8000"))
mlflow.set_experiment("financial_timeseries_regression")
print("Tracking URI:", mlflow.get_tracking_uri())

Tracking URI: http://84.201.144.227:8000


In [2]:
DATA_PATH = os.getenv("DATA_PATH", "data/financial_regression.csv")
df = pd.read_csv(DATA_PATH)

# обязательные шаги
assert "date" in df.columns, "В датасете отсутствует колонка 'date'"
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
df = df.set_index("date")

# целевая переменная
TARGET = os.getenv("TARGET", "gold close")
assert TARGET in df.columns, f"Не найден таргет '{TARGET}' в колонках"

# базовые пропуски по числовым
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].ffill().bfill()

print("Shape:", df.shape, "| Target:", TARGET)
df.head()

Shape: (3904, 46) | Target: gold close


Unnamed: 0_level_0,sp500 open,sp500 high,sp500 low,sp500 close,sp500 volume,sp500 high-low,nasdaq open,nasdaq high,nasdaq low,nasdaq close,nasdaq volume,nasdaq high-low,us_rates_%,CPI,usd_chf,eur_usd,GDP,silver open,silver high,silver low,silver close,silver volume,silver high-low,oil open,oil high,oil low,oil close,oil volume,oil high-low,platinum open,platinum high,platinum low,platinum close,platinum volume,platinum high-low,palladium open,palladium high,palladium low,palladium close,palladium volume,palladium high-low,gold open,gold high,gold low,gold close,gold volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
2010-01-14,114.49,115.14,114.42,114.93,115646960.0,0.72,46.26,46.52,46.22,46.39,75209000.0,0.3,0.13,217.281,1.0206,1.4478,14980.193,18.58,18.7,18.45,18.66,69804.0,0.25,39.3,39.44,38.88,39.06,8575821.0,0.56,160.13,161.85,159.72,161.45,228500.0,2.13,43.93,45.02,43.86,44.84,364528.0,1.16,111.51,112.37,110.79,112.03,18305238.0
2010-01-15,114.73,114.84,113.2,113.64,212252769.0,1.64,46.46,46.55,45.65,45.85,126849300.0,0.9,0.13,217.281,1.0264,1.4376,14980.193,18.54,18.54,18.36,18.41,205106.0,0.18,38.97,39.02,38.28,38.4,12615299.0,0.74,160.68,161.25,159.5,160.25,189900.0,1.75,45.18,45.76,44.4,45.76,442210.0,1.36,111.35,112.01,110.38,110.86,18000724.0
2010-01-18,114.73,114.84,113.2,113.64,212252769.0,1.64,46.46,46.55,45.65,45.85,126849300.0,0.9,0.13,217.281,1.0264,1.4376,14980.193,18.54,18.54,18.36,18.41,205106.0,0.18,38.97,39.02,38.28,38.4,12615299.0,0.74,160.68,161.25,159.5,160.25,189900.0,1.75,45.18,45.76,44.4,45.76,442210.0,1.36,111.35,112.01,110.38,110.86,18000724.0
2010-01-19,113.62,115.13,113.59,115.06,138671890.0,1.54,45.96,46.64,45.95,46.59,84388200.0,0.69,0.13,217.281,1.034,1.4269,14980.193,18.53,18.79,18.5,18.79,130552.0,0.29,38.07,38.9499,38.03,38.93,8180157.0,0.9199,162.23,165.2,162.1722,165.12,295900.0,3.0278,46.01,47.08,45.7,46.94,629150.0,1.38,110.95,111.75,110.83,111.52,10467927.0
2010-01-20,114.28,114.45,112.98,113.89,216330645.0,1.47,46.27,46.604,45.43,45.92,145680000.0,1.174,0.13,217.281,1.0453,1.4094,14980.193,18.31,18.31,17.78,17.86,230907.0,0.53,38.23,38.35,37.88,37.89,11438685.0,0.47,163.51,163.51,159.09,163.22,438200.0,4.42,46.59,47.31,45.17,47.05,643198.0,2.14,109.97,110.05,108.46,108.94,17534231.0


In [3]:
class TimeSeriesFeats(BaseEstimator, TransformerMixin):
    """
    Универсальный генератор признаков: лаги и скользящие средние для всех числовых столбцов (кроме таргета).
    """
    def __init__(self, lags=(1, 2, 5), rolls=(3, 7, 14)):
        self.lags = lags
        self.rolls = rolls
        self.feature_names_ = None
        self.cols_ = None

    def fit(self, X, y=None):
        self.cols_ = [c for c in X.columns if c != TARGET]
        feats = []
        for c in self.cols_:
            for l in self.lags:
                feats.append(f"{c}_lag{l}")
            for w in self.rolls:
                feats.append(f"{c}_roll_mean{w}")
        self.feature_names_ = self.cols_ + feats
        return self

    def transform(self, X):
        X = X.copy()
        cols = [c for c in X.columns if c != TARGET]
        feats = []
        for c in cols:
            feats.append(X[c])
            for l in self.lags:
                feats.append(X[c].shift(l).rename(f"{c}_lag{l}"))
            for w in self.rolls:
                feats.append(X[c].rolling(window=w, min_periods=1).mean().rename(f"{c}_roll_mean{w}"))
        out = pd.concat(feats, axis=1)
        out = out.ffill().bfill()
        out = out.reindex(columns=self.feature_names_, fill_value=0)
        return out

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_) if self.feature_names_ is not None else np.array([])

feat_gen = TimeSeriesFeats(lags=(1,2,5), rolls=(3,7,14))

In [4]:
TEST_RATIO = float(os.getenv("TEST_RATIO", "0.2"))
split_idx = int(len(df) * (1 - TEST_RATIO))

df_train = df.iloc[:split_idx].copy()
df_test  = df.iloc[split_idx:].copy()

y_train = df_train[TARGET]
y_test  = df_test[TARGET]

X_train_raw = df_train.drop(columns=[TARGET])
X_test_raw  = df_test.drop(columns=[TARGET])

def regression_report(y_true, y_pred):
    y_true = pd.Series(y_true, copy=False).astype(float)
    y_pred = pd.Series(y_pred, copy=False).astype(float)

    y_true = y_true.replace([np.inf, -np.inf], np.nan)
    y_pred = y_pred.replace([np.inf, -np.inf], np.nan)

    aligned = pd.concat([y_true.rename("y_true"),
                         y_pred.rename("y_pred")], axis=1).dropna()

    if aligned.empty:
        return None

    yt = aligned["y_true"].to_numpy().ravel()
    yp = aligned["y_pred"].to_numpy().ravel()

    mae  = mean_absolute_error(yt, yp)
    mse  = mean_squared_error(yt, yp)
    rmse = np.sqrt(mse)
    denom = np.clip(np.abs(yt), 1e-8, None)
    mape = np.mean(np.abs((yt - yp) / denom))
    r2   = r2_score(yt, yp)
    return {"MAE": mae, "RMSE": rmse, "MAPE": mape, "R2": r2}

In [5]:
def log_regression_plots(y_true, y_pred, model_name):
    fig1, ax1 = plt.subplots(figsize=(8,5))
    ax1.plot(y_true.index, y_true.values, label="True")
    ax1.plot(y_true.index, y_pred, label="Pred")
    ax1.set_title(f"{model_name}: True vs Pred")
    ax1.legend(); ax1.grid(True)
    plt.tight_layout()
    mlflow.log_figure(fig1, f"{model_name}_true_vs_pred.png")
    plt.close(fig1)

    resid = y_true.values - y_pred
    fig2, ax2 = plt.subplots(figsize=(7,4))
    ax2.hist(resid, bins=40)
    ax2.set_title(f"{model_name}: residuals hist")
    plt.tight_layout()
    mlflow.log_figure(fig2, f"{model_name}_residuals_hist.png")
    plt.close(fig2)

def mlflow_log_run(model_name, pipeline, X_train_raw, y_train, X_test_raw, y_test, save_model=True):
    start = time.time()
    X_train = pipeline.named_steps["features"].fit_transform(X_train_raw.assign(**{TARGET: y_train}))
    pipeline.named_steps["scaler"].fit(X_train)  # scaler по train
    X_train_scaled = pipeline.named_steps["scaler"].transform(X_train)

    pipeline.named_steps["reg"].fit(X_train_scaled, y_train)
    train_time = time.time() - start

    X_test = pipeline.named_steps["features"].transform(X_test_raw.assign(**{TARGET: y_test}))
    X_test_scaled = pipeline.named_steps["scaler"].transform(X_test)
    y_pred = pipeline.named_steps["reg"].predict(X_test_scaled)

    metrics = regression_report(y_test, y_pred)
    metrics["train_time_sec"] = train_time
    print(model_name, metrics)

    for k, v in metrics.items():
        mlflow.log_metric(f"{model_name}_{k}", float(v))

    log_regression_plots(y_test, y_pred, model_name)

    try:
        reg = pipeline.named_steps["reg"]
        if hasattr(reg, "feature_importances_"):
            feat_names = pipeline.named_steps["features"].get_feature_names_out().tolist()
            imp = pd.DataFrame({"feature": feat_names, "importance": reg.feature_importances_}).sort_values("importance", ascending=False)
            imp.to_csv(f"{model_name}_feature_importances.csv", index=False)
            mlflow.log_artifact(f"{model_name}_feature_importances.csv")
            # топ-график
            top = imp.head(40)
            fig, ax = plt.subplots(figsize=(10, min(12, 0.35*len(top))))
            ax.barh(top["feature"][::-1], top["importance"][::-1])
            ax.set_title(f"{model_name} Feature Importances")
            plt.tight_layout()
            mlflow.log_figure(fig, f"{model_name}_feature_importances.png")
            plt.close(fig)
    except Exception as e:
        print("FI logging error:", e)

    feat_names = pipeline.named_steps["features"].get_feature_names_out().tolist()
    with open("features_schema.json", "w", encoding="utf-8") as f:
        json.dump({"features": feat_names}, f, ensure_ascii=False, indent=2)
    mlflow.log_artifact("features_schema.json")

    if save_model:
        mlflow.sklearn.log_model(pipeline, artifact_path=f"{model_name}_pipeline")

    return y_pred, metrics

In [6]:
def make_pipeline(reg):
    use_scaler = not isinstance(reg, CatBoostRegressor)  # CatBoost не скейлим
    return Pipeline(steps=[
        ("features", feat_gen),
        ("scaler", StandardScaler(with_mean=False) if use_scaler else "passthrough"),
        ("reg", reg),
    ])

models = {
    "LGBM": LGBMRegressor(
        random_state=42, n_estimators=1000, learning_rate=0.03,
        num_leaves=63, subsample=0.9, colsample_bytree=0.8
    ),
    "XGB": XGBRegressor(
        random_state=42, n_estimators=1200, learning_rate=0.03,
        max_depth=8, subsample=0.9, colsample_bytree=0.8, tree_method="hist"
    ),
    "CAT": CatBoostRegressor(
        random_state=42, verbose=0, iterations=800, depth=6,
        learning_rate=0.05, loss_function="RMSE"
    ),
}

In [7]:
USE_ARIMA = True
MAX_POINTS = int(os.getenv("ARIMA_MAX_POINTS", "2000"))
SEASONAL_PERIOD = int(os.getenv("SEASONAL_PERIOD", "5"))

if USE_ARIMA:
    with mlflow.start_run(run_name="ARIMA_SARIMA"):
        # 2.1 Чистим train
        series_train = pd.Series(y_train, copy=True).astype(float).replace([np.inf, -np.inf], np.nan).dropna()
        if len(series_train) > MAX_POINTS:
            series_train = series_train.iloc[-MAX_POINTS:]

        # Если после чистки нечего учить — выходим мягко
        if series_train.empty:
            mlflow.log_param("ARIMA_note", "empty_train_after_cleaning")
            print("ARIMA: train series is empty after cleaning — skipping.")
        else:
            start = time.time()
            arima = auto_arima(
                series_train,
                seasonal=True, m=SEASONAL_PERIOD,
                stepwise=True, suppress_warnings=True, trace=False,
                max_p=3, max_q=3, max_P=2, max_Q=2,
                d=None, D=None, error_action="ignore"
            )
            fit_time = time.time() - start
            mlflow.log_metric("ARIMA_fit_time_sec", fit_time)

            # 2.2 Готовим чистый y_test и горизонт прогноза
            y_test_clean = pd.Series(y_test, copy=False).astype(float).replace([np.inf, -np.inf], np.nan).dropna()
            n_forecast = len(y_test_clean)

            if n_forecast <= 0:
                mlflow.log_param("ARIMA_note", "empty_test_after_cleaning")
                print("ARIMA: test series is empty after cleaning — skipping.")
            else:
                # 2.3 Прогноз и санитизация
                y_forecast = pd.Series(arima.predict(n_periods=n_forecast), index=y_test_clean.index)
                # fallback: если NaN в прогнозе — пробуем без сезонности
                if np.isnan(y_forecast.values).any():
                    arima_ns = auto_arima(
                        series_train, seasonal=False, stepwise=True,
                        suppress_warnings=True, trace=False,
                        max_p=3, max_q=3, d=None, error_action="ignore"
                    )
                    y_forecast = pd.Series(arima_ns.predict(n_periods=n_forecast), index=y_test_clean.index)

                # финальная чистка прогноза
                y_forecast = y_forecast.replace([np.inf, -np.inf], np.nan)

                # 2.4 Метрики (через устойчивую regression_report)
                mets = regression_report(y_test_clean, y_forecast)

                if mets is None:
                    mlflow.log_param("ARIMA_note", "no_pairs_after_alignment")
                    print("ARIMA: no valid pairs after alignment — skipping metrics.")
                else:
                    for k, v in mets.items():
                        mlflow.log_metric(f"ARIMA_{k}", float(v))
                    print("ARIMA:", mets)
                    log_regression_plots(y_test_clean, y_forecast.values, "ARIMA")

                joblib.dump(arima, "arima_model.pkl")
                mlflow.log_artifact("arima_model.pkl")

ARIMA: no valid pairs after alignment — skipping metrics.
🏃 View run ARIMA_SARIMA at: http://84.201.144.227:8000/#/experiments/8/runs/4ff41088ed76426bab44d7fff38c9da4
🧪 View experiment at: http://84.201.144.227:8000/#/experiments/8
