In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import optuna
import mlflow
import mlflow.sklearn
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
try:
    from pmdarima import auto_arima
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    ARIMA_AVAILABLE = True
except ImportError:
    ARIMA_AVAILABLE = False
    print("Библиотеки для ARIMA/SARIMA (pmdarima, statsmodels) не найдены. Модели будут пропущены.")

In [3]:
MLFLOW_EXPERIMENT_NAME = "Financial_Gold_Prediction_With_ARIMA"
mlflow.set_tracking_uri("http://84.201.144.227:8000") 
print("MLflow URI установлен.")

MLflow URI установлен.


In [4]:
df = pd.read_csv('data/financial_regression.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df.dropna(subset=['gold close'], inplace=True)

# Создание временных признаков
df['year'] = df.index.year
df['month'] = df.index.month
df['dayofweek'] = df.index.dayofweek

In [5]:
df_fe = df.copy()
key_features = ['silver close', 'oil close', 'dxy close']
key_features = [col for col in key_features if col in df_fe.columns]

for col in key_features:
    df_fe[f'{col}_lag1'] = df_fe[col].shift(1)
    df_fe[f'{col}_roll_mean3'] = df_fe[col].rolling(window=3).mean()

df_fe[f'gold_close_lag1'] = df_fe['gold close'].shift(1)

In [6]:
y_with_lags = df_fe['gold close']
X_with_lags = df_fe.drop(columns=['gold close'])

mask_y_notna = y_with_lags.notna()
y_clean_for_split = y_with_lags[mask_y_notna]
X_clean_for_split = X_with_lags[mask_y_notna]

# Импутация
imputer = SimpleImputer(strategy='mean')
split_index = int(len(X_clean_for_split) * 0.8)
X_train_raw, X_test_raw = X_clean_for_split.iloc[:split_index], X_clean_for_split.iloc[split_index:]
y_train_raw, y_test_raw = y_clean_for_split.iloc[:split_index], y_clean_for_split.iloc[split_index:]

X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index
)

In [7]:
def calc_metrics(y_true, y_pred):
    mask = pd.notna(y_true) & pd.notna(y_pred)
    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]
    if len(y_true_clean) == 0:
        return {'mae': np.nan, 'mse': np.nan, 'rmse': np.nan, 'r2': np.nan, 'mape': np.nan}

    return {
        'mae': mean_absolute_error(y_true_clean, y_pred_clean),
        'mse': mean_squared_error(y_true_clean, y_pred_clean),
        'rmse': np.sqrt(mean_squared_error(y_true_clean, y_pred_clean)),
        'r2': r2_score(y_true_clean, y_pred_clean),
        'mape': mean_absolute_percentage_error(y_true_clean, y_pred_clean)
    }

def report_metrics(mlflow_obj, y_true, y_pred, model_name):
    """Адаптированная функция для логирования метрик регрессии в MLflow."""
    metrics = calc_metrics(y_true, y_pred)
    print(f"\n--- Метрики для {model_name} ---")
    for k, v in metrics.items():
        print(f"  {k.upper()}: {v:.4f}")
        if mlflow_obj: # Проверка, доступен ли MLflow
             mlflow_obj.log_metric(f"{k}_{model_name}", v)
    return metrics

def run_experiment(model, model_name, params, X_train, X_test, y_train, y_test):
    """Запуск эксперимента с логированием в MLflow."""
    print(f"\n--- Запуск эксперимента: {model_name} ---")
    
    try:
        mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
        with mlflow.start_run(run_name=model_name):
            # Логирование параметров
            if params:
                mlflow.log_params(params)
            
            # Обучение модели
            model.fit(X_train, y_train)
            
            # Предсказание
            preds = model.predict(X_test)
            
            # Вычисление и логирование метрик
            metrics = report_metrics(mlflow, y_test, preds, model_name)
            
            # Логирование модели (если это scikit-learn модель)
            if hasattr(model, 'predict'): # Простая проверка
                mlflow.sklearn.log_model(model, "model") 
            
            print(f"- {model_name}: Эксперимент завершен. -")
            return model, metrics

    except Exception as e:
        print(f"Ошибка при запуске эксперимента {model_name}: {e}")
        return None, None


In [8]:
y_full_series = df['gold close']
y_train_series = y_full_series.loc[X_train_imputed.index]
y_test_series = y_full_series.loc[X_test_imputed.index]


In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train_imputed.columns, index=X_train_imputed.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test_imputed.columns, index=X_test_imputed.index)

results = {} # Для хранения результатов

In [10]:
lr_model = LinearRegression()
lr_params = {"model": "LinearRegression"}
model_lr, metrics_lr = run_experiment(lr_model, "LinearRegression", lr_params, X_train_scaled_df, X_test_scaled_df, y_train_raw, y_test_raw)
if model_lr is not None:
    results["LinearRegression"] = metrics_lr


--- Запуск эксперимента: LinearRegression ---
Ошибка при запуске эксперимента LinearRegression: Cannot set a deleted experiment 'Financial_Gold_Prediction_With_ARIMA' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.


In [11]:
def objective_lgbm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "verbose": -1
    }
    rmses = []
    tscv = TimeSeriesSplit(n_splits=3)
    for train_idx, val_idx in tscv.split(X_train_imputed):
        model = LGBMRegressor(**params)
        model.fit(X_train_imputed.iloc[train_idx], y_train_raw.iloc[train_idx])
        preds = model.predict(X_train_imputed.iloc[val_idx])
        metrics = calc_metrics(y_train_raw.iloc[val_idx], preds)
        rmses.append(metrics['rmse'] if not np.isnan(metrics['rmse']) else np.inf)
    return np.mean(rmses) if rmses else np.inf

print("\nLightGBM: Подбор параметров с Optuna...")
study_lgbm = optuna.create_study(direction="minimize")
study_lgbm.optimize(objective_lgbm, n_trials=3, timeout=300) 
print("Лучшие параметры LightGBM:", study_lgbm.best_params)

best_params_lgbm = {**study_lgbm.best_params, 'verbose': -1}
final_model_lgbm = LGBMRegressor(**best_params_lgbm)
model_lgbm, metrics_lgbm = run_experiment(final_model_lgbm, "LightGBM_Optuna", best_params_lgbm, X_train_imputed, X_test_imputed, y_train_raw, y_test_raw)
if model_lgbm is not None:
    results["LightGBM"] = metrics_lgbm


[I 2025-08-31 11:58:53,758] A new study created in memory with name: no-name-04b96c27-375b-409e-89ef-afe980371220



LightGBM: Подбор параметров с Optuna...


[I 2025-08-31 11:58:55,951] Trial 0 finished with value: 1.5111584604978583 and parameters: {'n_estimators': 274, 'learning_rate': 0.040466462020454015, 'num_leaves': 23, 'max_depth': 6, 'subsample': 0.9942585153864425}. Best is trial 0 with value: 1.5111584604978583.
[I 2025-08-31 11:58:56,305] Trial 1 finished with value: 1.526522090060098 and parameters: {'n_estimators': 257, 'learning_rate': 0.0690512301528945, 'num_leaves': 45, 'max_depth': 4, 'subsample': 0.7587872380081007}. Best is trial 0 with value: 1.5111584604978583.
[I 2025-08-31 11:58:56,719] Trial 2 finished with value: 1.4602117709232356 and parameters: {'n_estimators': 230, 'learning_rate': 0.05120683943019833, 'num_leaves': 48, 'max_depth': 5, 'subsample': 0.8372404812707501}. Best is trial 2 with value: 1.4602117709232356.


Лучшие параметры LightGBM: {'n_estimators': 230, 'learning_rate': 0.05120683943019833, 'num_leaves': 48, 'max_depth': 5, 'subsample': 0.8372404812707501}

--- Запуск эксперимента: LightGBM_Optuna ---
Ошибка при запуске эксперимента LightGBM_Optuna: Cannot set a deleted experiment 'Financial_Gold_Prediction_With_ARIMA' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.


In [12]:
def objective_catboost(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 3, 6),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 5, log=True),
        "verbose": 0
    }
    rmses = []
    tscv = TimeSeriesSplit(n_splits=3)
    for train_idx, val_idx in tscv.split(X_train_imputed):
        model = CatBoostRegressor(**params)
        model.fit(X_train_imputed.iloc[train_idx], y_train_raw.iloc[train_idx], early_stopping_rounds=10, verbose=0)
        preds = model.predict(X_train_imputed.iloc[val_idx])
        metrics = calc_metrics(y_train_raw.iloc[val_idx], preds)
        rmses.append(metrics['rmse'] if not np.isnan(metrics['rmse']) else np.inf)
    return np.mean(rmses) if rmses else np.inf

print("\nCatBoost: Подбор параметров с Optuna...")
study_catboost = optuna.create_study(direction="minimize")
study_catboost.optimize(objective_catboost, n_trials=3, timeout=300)
print("Лучшие параметры CatBoost:", study_catboost.best_params)

best_params_catboost = {**study_catboost.best_params, 'verbose': 0}
final_model_catboost = CatBoostRegressor(**best_params_catboost)
model_catboost, metrics_catboost = run_experiment(final_model_catboost, "CatBoost_Optuna", best_params_catboost, X_train_imputed, X_test_imputed, y_train_raw, y_test_raw)
if model_catboost is not None:
    results["CatBoost"] = metrics_catboost

[I 2025-08-31 11:58:56,763] A new study created in memory with name: no-name-11afd2dd-f9e1-4152-aae2-afebb2493f55



CatBoost: Подбор параметров с Optuna...


[I 2025-08-31 11:58:58,182] Trial 0 finished with value: 5.580786524688123 and parameters: {'iterations': 292, 'learning_rate': 0.06692921931385888, 'depth': 3, 'l2_leaf_reg': 1.0572825647650832}. Best is trial 0 with value: 5.580786524688123.
[I 2025-08-31 11:59:01,465] Trial 1 finished with value: 13.895908802792235 and parameters: {'iterations': 266, 'learning_rate': 0.012859902538182161, 'depth': 6, 'l2_leaf_reg': 1.9764200268490992}. Best is trial 0 with value: 5.580786524688123.
[I 2025-08-31 11:59:04,985] Trial 2 finished with value: 12.262269610327635 and parameters: {'iterations': 280, 'learning_rate': 0.06479247378011803, 'depth': 6, 'l2_leaf_reg': 2.0977369268718897}. Best is trial 0 with value: 5.580786524688123.


Лучшие параметры CatBoost: {'iterations': 292, 'learning_rate': 0.06692921931385888, 'depth': 3, 'l2_leaf_reg': 1.0572825647650832}

--- Запуск эксперимента: CatBoost_Optuna ---
Ошибка при запуске эксперимента CatBoost_Optuna: Cannot set a deleted experiment 'Financial_Gold_Prediction_With_ARIMA' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.


In [13]:
def objective_xgb(trial):
    params = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }
    rmses = []
    tscv = TimeSeriesSplit(n_splits=3)
    for train_idx, val_idx in tscv.split(X_train_imputed):
        model = XGBRegressor(**params)
        model.fit(X_train_imputed.iloc[train_idx], y_train_raw.iloc[train_idx], verbose=0)
        preds = model.predict(X_train_imputed.iloc[val_idx])
        metrics = calc_metrics(y_train_raw.iloc[val_idx], preds)
        rmses.append(metrics['rmse'] if not np.isnan(metrics['rmse']) else np.inf)
    return np.mean(rmses) if rmses else np.inf

print("\nXGBoost: Подбор параметров с Optuna...")
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=3, timeout=300)
print("Лучшие параметры XGBoost:", study_xgb.best_params)

best_params_xgb = study_xgb.best_params
final_model_xgb = XGBRegressor(**best_params_xgb)
model_xgb, metrics_xgb = run_experiment(final_model_xgb, "XGBoost_Optuna", best_params_xgb, X_train_imputed, X_test_imputed, y_train_raw, y_test_raw)
if model_xgb is not None:
    results["XGBoost"] = metrics_xgb

[I 2025-08-31 11:59:05,027] A new study created in memory with name: no-name-ce6c4c35-8449-4b59-beef-9c4995d3a25a



XGBoost: Подбор параметров с Optuna...


[I 2025-08-31 11:59:06,219] Trial 0 finished with value: 5.282350953162805 and parameters: {'n_estimators': 153, 'learning_rate': 0.011435449395281815, 'max_depth': 6, 'subsample': 0.7943108257636885, 'colsample_bytree': 0.9404098003082748}. Best is trial 0 with value: 5.282350953162805.
[I 2025-08-31 11:59:07,200] Trial 1 finished with value: 1.8420445479454288 and parameters: {'n_estimators': 176, 'learning_rate': 0.04534625006623747, 'max_depth': 5, 'subsample': 0.7267727366557938, 'colsample_bytree': 0.7033346165251708}. Best is trial 1 with value: 1.8420445479454288.
[I 2025-08-31 11:59:08,779] Trial 2 finished with value: 2.204995355715873 and parameters: {'n_estimators': 196, 'learning_rate': 0.031974950721968785, 'max_depth': 6, 'subsample': 0.6782038886975436, 'colsample_bytree': 0.6162734478962333}. Best is trial 1 with value: 1.8420445479454288.


Лучшие параметры XGBoost: {'n_estimators': 176, 'learning_rate': 0.04534625006623747, 'max_depth': 5, 'subsample': 0.7267727366557938, 'colsample_bytree': 0.7033346165251708}

--- Запуск эксперимента: XGBoost_Optuna ---
Ошибка при запуске эксперимента XGBoost_Optuna: Cannot set a deleted experiment 'Financial_Gold_Prediction_With_ARIMA' as the active experiment. You can restore the experiment, or permanently delete the experiment to create a new one.


In [14]:
# print(f"\n--- Подготовка данных для ARIMA/SARIMA ---")
# print(f"Размер обучающего ряда: {y_train_series.shape}")
# print(f"Размер тестового ряда: {y_test_series.shape}")

# if ARIMA_AVAILABLE:
#     # --- Модель 5: ARIMA ---
#     print("\n--- Обучение ARIMA ---")
#     try:
#         # Используем фиксированные параметры для демонстрации
#         order = (1, 1, 1) # (p, d, q) - пример, подберите под свои данные
#         print(f"Используем фиксированные параметры ARIMA: {order}")
        
#         # Прогноз на тестовую выборку (walk-forward)
#         forecast_arima = []
#         history = list(y_train_series) # История начинается с обучающих данных
#         for t in range(len(y_test_series)):
#             model_temp = ARIMA(history, order=order).fit()
#             yhat = model_temp.forecast(steps=1)[0]
#             forecast_arima.append(yhat)
#             # Добавляем фактическое значение из теста в историю (walk-forward)
#             history.append(y_test_series.iloc[t]) 
        
#         forecast_arima = pd.Series(forecast_arima, index=y_test_series.index)
        
#         # Оценка ARIMA и логирование в MLflow
#         print(f"\n--- Запуск эксперимента: ARIMA ---")
#         try:
#             mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
#             with mlflow.start_run(run_name="ARIMA"):
#                 mlflow.log_params({'order': str(order)}) # Преобразуем tuple в строку для логирования
#                 metrics_arima = report_metrics(mlflow, y_test_series, forecast_arima, "ARIMA")
#                 # Модель ARIMA не сохраняется как sklearn, поэтому пропускаем log_model
#                 print(f"- ARIMA: Эксперимент завершен. -")
#                 results["ARIMA"] = metrics_arima
#         except Exception as e:
#             print(f"Ошибка при логировании ARIMA в MLflow: {e}")
#             # В случае ошибки MLflow, все равно сохраняем результаты
#             metrics_arima = calc_metrics(y_test_series, forecast_arima)
#             print(f"\n--- Метрики для ARIMA ---")
#             for k, v in metrics_arima.items():
#                 print(f"  {k.upper()}: {v:.4f}")
#             results["ARIMA"] = metrics_arima
            
#     except Exception as e:
#         print(f"Ошибка при обучении/оценке ARIMA: {e}")
#         results["ARIMA"] = {"mae": np.nan, "mse": np.nan, "rmse": np.nan, "r2": np.nan, "mape": np.nan}


#     # --- Модель 6: SARIMA ---
#     print("\n--- Обучение SARIMA ---")
#     try:
#         # Используем фиксированные параметры для демонстрации
#         order_s = (1, 1, 1) # (p, d, q)
#         seasonal_order = (1, 1, 1, 12) # (P, D, Q, s) - s=12 для годовой сезонности
#         print(f"Используем фиксированные параметры SARIMA: {order_s} x {seasonal_order}")
        
#         # Прогноз на тестовую выборку (walk-forward)
#         forecast_sarima = []
#         history_s = list(y_train_series)
#         for t in range(len(y_test_series)):
#             model_temp_s = SARIMAX(history_s, order=order_s, seasonal_order=seasonal_order).fit(disp=False)
#             yhat_s = model_temp_s.forecast(steps=1)[0]
#             forecast_sarima.append(yhat_s)
#             history_s.append(y_test_series.iloc[t]) 
        
#         forecast_sarima = pd.Series(forecast_sarima, index=y_test_series.index)
        
#         # Оценка SARIMA и логирование в MLflow
#         print(f"\n--- Запуск эксперимента: SARIMA ---")
#         try:
#             mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
#             with mlflow.start_run(run_name="SARIMA"):
#                 mlflow.log_params({
#                     'order': str(order_s),
#                     'seasonal_order': str(seasonal_order)
#                 })
#                 metrics_sarima = report_metrics(mlflow, y_test_series, forecast_sarima, "SARIMA")
#                 # Модель SARIMA не сохраняется как sklearn, поэтому пропускаем log_model
#                 print(f"- SARIMA: Эксперимент завершен. -")
#                 results["SARIMA"] = metrics_sarima
#         except Exception as e:
#             print(f"Ошибка при логировании SARIMA в MLflow: {e}")
#             # В случае ошибки MLflow, все равно сохраняем результаты
#             metrics_sarima = calc_metrics(y_test_series, forecast_sarima)
#             print(f"\n--- Метрики для SARIMA ---")
#             for k, v in metrics_sarima.items():
#                 print(f"  {k.upper()}: {v:.4f}")
#             results["SARIMA"] = metrics_sarima
            
#     except Exception as e:
#         print(f"Ошибка при обучении/оценке SARIMA: {e}")
#         results["SARIMA"] = {"mae": np.nan, "mse": np.nan, "rmse": np.nan, "r2": np.nan, "mape": np.nan}

# else:
#     print("\nARIMA/SARIMA модели пропущены из-за отсутствия необходимых библиотек.")


In [15]:
print("\n=========================================")
print("          СВОДКА РЕЗУЛЬТАТОВ             ")
print("=========================================")
if results:
    results_df = pd.DataFrame(results).T
    print(results_df.round(4))
    if not results_df.empty:
        best_model_name = results_df['rmse'].idxmin()
        best_rmse = results_df.loc[best_model_name, 'rmse']
        print(f"\nЛучшая модель по RMSE: {best_model_name} (RMSE = {best_rmse:.4f})")
else:
    print("Нет результатов для отображения.")

print("\n--- Все эксперименты завершены ---")
print("Проверьте MLflow UI по адресу http://84.201.144.227:8000")


          СВОДКА РЕЗУЛЬТАТОВ             
Нет результатов для отображения.

--- Все эксперименты завершены ---
Проверьте MLflow UI по адресу http://84.201.144.227:8000
