In [1]:
import os
import warnings
import tempfile
import traceback
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
load_dotenv()

import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import optuna
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

import matplotlib.pyplot as plt
plt.switch_backend("Agg")  # на всякий случай для headless

In [2]:
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://84.201.144.227:8000")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
EXPERIMENT_NAME = "financial_timeseries_regression"
mlflow.set_experiment(EXPERIMENT_NAME)

print("MLflow URI:", mlflow.get_tracking_uri())
print("Experiment:", EXPERIMENT_NAME)

MLflow URI: http://84.201.144.227:8000
Experiment: financial_timeseries_regression


In [3]:
df = pd.read_csv("data/financial_regression.csv")
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
df.dropna(subset=["gold close"], inplace=True)

# календарные признаки
df["year"] = df.index.year
df["month"] = df.index.month
df["dayofweek"] = df.index.dayofweek


In [4]:
df_fe = df.copy()
key_features = [c for c in ["silver close", "oil close", "dxy close"] if c in df_fe.columns]
for col in key_features:
    df_fe[f"{col}_lag1"] = df_fe[col].shift(1)
    df_fe[f"{col}_roll_mean3"] = df_fe[col].rolling(window=3).mean()

df_fe["gold_close_lag1"] = df_fe["gold close"].shift(1)

y = df_fe["gold close"]
X = df_fe.drop(columns=["gold close"])

mask = y.notna()
X = X[mask]
y = y[mask]

# train/test по времени (80/20)
split_idx = int(len(X) * 0.8)
X_train_raw, X_test_raw = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

feature_names = X_train_raw.columns.tolist()


In [5]:
def calc_metrics(y_true, y_pred):
    m = {
        "mae":  mean_absolute_error(y_true, y_pred),
        "mse":  mean_squared_error(y_true, y_pred),
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
        "r2":   r2_score(y_true, y_pred),
        "mape": mean_absolute_percentage_error(y_true, y_pred),
    }
    return m

def log_metrics_dict(mdict):
    for k, v in mdict.items():
        mlflow.log_metric(k, float(v))

# =========================
#  Вспомогательные артефакты
# =========================
def save_predictions(y_true, y_pred, model_name):
    out = pd.DataFrame({"y_true": y_true.values, "y_pred": np.asarray(y_pred)}, index=y_true.index)
    fname = f"{model_name}_predictions.csv"
    out.to_csv(fname)
    mlflow.log_artifact(fname)

def log_feature_importances(estimator_or_pipeline, model_name, names):
    # принимает и pipeline, и «голую» модель
    if hasattr(estimator_or_pipeline, "named_steps"):
        model = estimator_or_pipeline.named_steps.get("model", estimator_or_pipeline)
    else:
        model = estimator_or_pipeline

    importances = None
    # CatBoost
    if hasattr(model, "get_feature_importance"):
        try:
            importances = np.asarray(model.get_feature_importance(), dtype=float)
        except Exception:
            importances = None
    # XGB/LGBM/sklearn
    if importances is None and hasattr(model, "feature_importances_"):
        importances = np.asarray(model.feature_importances_, dtype=float)

    if importances is None:
        print(f"[{model_name}] feature importances not available — skipped.")
        return

    fi = pd.DataFrame({"feature": names, "importance": importances})
    fi = fi.sort_values("importance", ascending=False)

    csv_name = f"feature_importances_{model_name}.csv"
    fi.to_csv(csv_name, index=False)
    mlflow.log_artifact(csv_name)

    top = fi.head(30).iloc[::-1]
    plt.figure(figsize=(10, max(6, len(top)*0.3)))
    plt.barh(top["feature"], top["importance"])
    plt.title(f"Feature importances — {model_name}")
    plt.tight_layout()
    png_name = f"feature_importances_{model_name}.png"
    plt.savefig(png_name, dpi=150)
    plt.close()
    mlflow.log_artifact(png_name)

def log_summary_text(model_name, params, metrics):
    lines = [f"Model: {model_name}", "Params:"]
    for k, v in (params or {}).items():
        lines.append(f"  {k}: {v}")
    lines.append("Metrics:")
    for k, v in metrics.items():
        lines.append(f"  {k}: {v}")
    txt = "\n".join(lines)
    fname = f"{model_name}_summary.txt"
    with open(fname, "w", encoding="utf-8") as f:
        f.write(txt)
    mlflow.log_artifact(fname)

In [6]:
def tune_xgb(n_trials=10, timeout=600):
    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 300),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 6),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
            "objective": "reg:squarederror",
            "n_jobs": -1,
            "random_state": 42,
        }
        model = XGBRegressor(**params)
        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("model", model),
        ])
        tscv = TimeSeriesSplit(n_splits=3)
        rmses = []
        for tr_idx, val_idx in tscv.split(X_train_raw):
            pipe.fit(X_train_raw.iloc[tr_idx], y_train.iloc[tr_idx])
            pred = pipe.predict(X_train_raw.iloc[val_idx])
            rmse = np.sqrt(mean_squared_error(y_train.iloc[val_idx], pred))
            rmses.append(rmse)
        return float(np.mean(rmses))
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, timeout=timeout)
    return study.best_params

In [7]:
def tune_lgbm(n_trials=10, timeout=600):
    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 300),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 20, 50),
            "max_depth": trial.suggest_int("max_depth", 3, 6),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "random_state": 42,
            "verbose": -1,
        }
        model = LGBMRegressor(**params)
        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("model", model),
        ])
        tscv = TimeSeriesSplit(n_splits=3)
        rmses = []
        for tr_idx, val_idx in tscv.split(X_train_raw):
            pipe.fit(X_train_raw.iloc[tr_idx], y_train.iloc[tr_idx])
            pred = pipe.predict(X_train_raw.iloc[val_idx])
            rmse = np.sqrt(mean_squared_error(y_train.iloc[val_idx], pred))
            rmses.append(rmse)
        return float(np.mean(rmses))
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, timeout=timeout)
    return study.best_params

In [8]:
def tune_cat(n_trials=10, timeout=600):
    def objective(trial):
        params = {
            "iterations": trial.suggest_int("iterations", 150, 400),
            "depth": trial.suggest_int("depth", 3, 8),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0, log=True),
            "random_state": 42,
            "verbose": 0,
        }
        model = CatBoostRegressor(**params)
        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("model", model),
        ])
        tscv = TimeSeriesSplit(n_splits=3)
        rmses = []
        for tr_idx, val_idx in tscv.split(X_train_raw):
            pipe.fit(X_train_raw.iloc[tr_idx], y_train.iloc[tr_idx])
            pred = pipe.predict(X_train_raw.iloc[val_idx])
            rmse = np.sqrt(mean_squared_error(y_train.iloc[val_idx], pred))
            rmses.append(rmse)
        return float(np.mean(rmses))
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, timeout=timeout)
    return study.best_params

In [9]:
def final_run(model_name, pipeline, params):
    """
    Отдельный run на каждую модель:
      - логируем params/metrics/артефакты (predictions/FI/summary)
      - и ГЛАВНОЕ: создаём в артефактах папку model_pipeline с MLmodel/conda/reqs/model.pkl
        либо через log_model, либо через save_model + log_artifacts (fallback).
    """
    with mlflow.start_run(run_name=f"{model_name}_Final"):
        if params:
            mlflow.log_params(params)

        # fit → predict
        pipeline.fit(X_train_raw, y_train)
        y_pred = pipeline.predict(X_test_raw)

        # метрики + артефакты
        metrics = calc_metrics(y_test, y_pred)
        log_metrics_dict(metrics)
        save_predictions(y_test, y_pred, model_name)
        log_feature_importances(pipeline, model_name, feature_names)
        log_summary_text(model_name, params, metrics)

        # подпись и пример входа
        input_example = X_test_raw.iloc[:2].copy()
        try:
            signature = infer_signature(X_train_raw, pipeline.predict(X_train_raw.iloc[:2]))
        except Exception:
            signature = None

        # --- 1) Обычный путь (у твоего клиента работает только artifact_path) ---
        log_ok = False
        try:
            mlflow.sklearn.log_model(
                sk_model=pipeline,
                artifact_path="model_pipeline",  # ← ключевая правка
                input_example=input_example,
                signature=signature,
                pip_requirements=[
                    "mlflow",
                    "scikit-learn",
                    "pandas",
                    "numpy",
                    "xgboost",
                    "lightgbm",
                    "catboost",
                ],
            )
            log_ok = True
        except Exception:
            # чтобы видеть причину прямо в UI
            err_txt = "[log_model] failed:\n" + traceback.format_exc()
            with open("model_log_error.txt", "w", encoding="utf-8") as f:
                f.write(err_txt)
            mlflow.log_artifact("model_log_error.txt")
            print(err_txt)

        # --- 2) Надёжный fallback: сохраняем локально и загружаем папку как артефакты ---
        if not log_ok:
            try:
                with tempfile.TemporaryDirectory() as tmpdir:
                    local_dir = os.path.join(tmpdir, "model_pipeline")
                    mlflow.sklearn.save_model(
                        sk_model=pipeline,
                        path=local_dir,
                        input_example=input_example,
                        signature=signature,
                        pip_requirements=[
                            "mlflow",
                            "scikit-learn",
                            "pandas",
                            "numpy",
                            "xgboost",
                            "lightgbm",
                            "catboost",
                        ],
                    )
                    mlflow.log_artifacts(local_dir, artifact_path="model_pipeline")
                log_ok = True
            except Exception:
                err_txt = "[save_model/log_artifacts] fallback failed:\n" + traceback.format_exc()
                with open("model_save_fallback_error.txt", "w", encoding="utf-8") as f:
                    f.write(err_txt)
                mlflow.log_artifact("model_save_fallback_error.txt")
                print(err_txt)

        print(f"[{model_name}] run finished. model_pipeline logged: {log_ok}")
        print(
            f"🏃 View run {model_name}_Final at: "
            f"{mlflow.get_tracking_uri()}/#/experiments/{mlflow.active_run().info.experiment_id}"
            f"/runs/{mlflow.active_run().info.run_id}"
        )

In [10]:
best_xgb = tune_xgb(n_trials=3, timeout=300)
xgb_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("model", XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42, **best_xgb))
])
final_run("XGBoost_Optuna", xgb_pipe, best_xgb)

# LightGBM
best_lgbm = tune_lgbm(n_trials=3, timeout=300)
lgbm_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("model", LGBMRegressor(random_state=42, **best_lgbm))
])
final_run("LightGBM_Optuna", lgbm_pipe, best_lgbm)

# CatBoost
best_cat = tune_cat(n_trials=3, timeout=300)
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("model", CatBoostRegressor(verbose=0, random_state=42, **best_cat))
])
final_run("CatBoost_Optuna", cat_pipe, best_cat)

print("=== All done ===")

[I 2025-09-01 22:04:15,574] A new study created in memory with name: no-name-f363309f-c933-4bc2-b66d-be285f2034a4
[I 2025-09-01 22:04:16,116] Trial 0 finished with value: 2.78359871190345 and parameters: {'n_estimators': 179, 'learning_rate': 0.014747928982506812, 'max_depth': 3, 'subsample': 0.6660004190962207, 'colsample_bytree': 0.9846679329735166, 'reg_alpha': 0.4562000844424491, 'reg_lambda': 0.12740576177106844}. Best is trial 0 with value: 2.78359871190345.
[I 2025-09-01 22:04:16,734] Trial 1 finished with value: 1.7049174154956528 and parameters: {'n_estimators': 121, 'learning_rate': 0.05701492261656593, 'max_depth': 4, 'subsample': 0.9572066117333473, 'colsample_bytree': 0.8331538236564447, 'reg_alpha': 0.6457936851028607, 'reg_lambda': 0.08881548573217546}. Best is trial 1 with value: 1.7049174154956528.
[I 2025-09-01 22:04:17,391] Trial 2 finished with value: 1.6915009615019787 and parameters: {'n_estimators': 180, 'learning_rate': 0.08266511135663787, 'max_depth': 4, 'subs

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-09-01 22:04:23,462] A new study created in memory with name: no-name-5fee5e83-3043-4e1b-8126-156c48ea25b7


[XGBoost_Optuna] run finished. model_pipeline logged: True
🏃 View run XGBoost_Optuna_Final at: http://84.201.144.227:8000/#/experiments/8/runs/94ae8c757e82422c82f11493c3644ab3
🏃 View run XGBoost_Optuna_Final at: http://84.201.144.227:8000/#/experiments/8/runs/94ae8c757e82422c82f11493c3644ab3
🧪 View experiment at: http://84.201.144.227:8000/#/experiments/8


[I 2025-09-01 22:04:25,508] Trial 0 finished with value: 1.6293655599354573 and parameters: {'n_estimators': 274, 'learning_rate': 0.06687613950735954, 'num_leaves': 47, 'max_depth': 4, 'subsample': 0.840243536795265, 'colsample_bytree': 0.7563153100182392}. Best is trial 0 with value: 1.6293655599354573.
[I 2025-09-01 22:04:25,656] Trial 1 finished with value: 2.9201953340479414 and parameters: {'n_estimators': 102, 'learning_rate': 0.027638259217635384, 'num_leaves': 30, 'max_depth': 3, 'subsample': 0.6780897125215873, 'colsample_bytree': 0.6129877320478468}. Best is trial 0 with value: 1.6293655599354573.
[I 2025-09-01 22:04:26,032] Trial 2 finished with value: 2.2452132780586918 and parameters: {'n_estimators': 228, 'learning_rate': 0.05529916447594308, 'num_leaves': 47, 'max_depth': 5, 'subsample': 0.7577887640787003, 'colsample_bytree': 0.6209573317083622}. Best is trial 0 with value: 1.6293655599354573.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[I 2025-09-01 22:04:28,169] A new study created in memory with name: no-name-fd24c5aa-4589-44d1-a628-2918d16b9025


[LightGBM_Optuna] run finished. model_pipeline logged: True
🏃 View run LightGBM_Optuna_Final at: http://84.201.144.227:8000/#/experiments/8/runs/36bd89f46a6b43d4b5c649a6fb47c8ec
🏃 View run LightGBM_Optuna_Final at: http://84.201.144.227:8000/#/experiments/8/runs/36bd89f46a6b43d4b5c649a6fb47c8ec
🧪 View experiment at: http://84.201.144.227:8000/#/experiments/8


[I 2025-09-01 22:04:30,614] Trial 0 finished with value: 9.357086148482727 and parameters: {'iterations': 238, 'depth': 5, 'learning_rate': 0.043580966555352575, 'l2_leaf_reg': 3.1040296902047966}. Best is trial 0 with value: 9.357086148482727.
[I 2025-09-01 22:04:45,621] Trial 1 finished with value: 15.331509658344203 and parameters: {'iterations': 310, 'depth': 8, 'learning_rate': 0.0930312128731187, 'l2_leaf_reg': 4.518059146868752}. Best is trial 0 with value: 9.357086148482727.
[I 2025-09-01 22:04:50,407] Trial 2 finished with value: 12.597224599878032 and parameters: {'iterations': 339, 'depth': 6, 'learning_rate': 0.045993458187334996, 'l2_leaf_reg': 2.042574731619601}. Best is trial 0 with value: 9.357086148482727.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[CatBoost_Optuna] run finished. model_pipeline logged: True
🏃 View run CatBoost_Optuna_Final at: http://84.201.144.227:8000/#/experiments/8/runs/eb8a537b0c4e4d298883e790b2dac034
🏃 View run CatBoost_Optuna_Final at: http://84.201.144.227:8000/#/experiments/8/runs/eb8a537b0c4e4d298883e790b2dac034
🧪 View experiment at: http://84.201.144.227:8000/#/experiments/8
=== All done ===
