In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline ### –ò–ó–ú–ï–ù–ï–ù–ò–ï 1: –î–æ–±–∞–≤–ª–µ–Ω –∏–º–ø–æ—Ä—Ç Pipeline
import optuna
import mlflow
import mlflow.sklearn
import mlflow.pyfunc ### –ò–ó–ú–ï–ù–ï–ù–ò–ï 2: –î–æ–±–∞–≤–ª–µ–Ω –∏–º–ø–æ—Ä—Ç pyfunc
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

warnings.filterwarnings('ignore')

In [2]:
MLFLOW_EXPERIMENT_NAME = "Financial_Gold_Prediction_v2" # –ù–æ–≤–æ–µ –∏–º—è —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
mlflow.set_tracking_uri("http://84.201.144.227:8000") 
print("MLflow URI —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω.")

MLflow URI —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω.


In [3]:
df = pd.read_csv('data/financial_regression.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df.dropna(subset=['gold close'], inplace=True)
df['year'] = df.index.year
df['month'] = df.index.month
df['dayofweek'] = df.index.dayofweek

In [4]:
df_fe = df.copy()
key_features = ['silver close', 'oil close', 'dxy close']
key_features = [col for col in key_features if col in df_fe.columns]
for col in key_features:
    df_fe[f'{col}_lag1'] = df_fe[col].shift(1)
    df_fe[f'{col}_roll_mean3'] = df_fe[col].rolling(window=3).mean()
df_fe[f'gold_close_lag1'] = df_fe['gold close'].shift(1)
y_with_lags = df_fe['gold close']
X_with_lags = df_fe.drop(columns=['gold close'])
mask_y_notna = y_with_lags.notna()
y_clean_for_split = y_with_lags[mask_y_notna]
X_clean_for_split = X_with_lags[mask_y_notna]

In [5]:
imputer_global = SimpleImputer(strategy='mean') # –ò—Å–ø–æ–ª—å–∑—É–µ–º –æ–¥–∏–Ω –∏–º–ø—å—é—Ç–µ—Ä
split_index = int(len(X_clean_for_split) * 0.8)
X_train_raw, X_test_raw = X_clean_for_split.iloc[:split_index], X_clean_for_split.iloc[split_index:]
y_train_raw, y_test_raw = y_clean_for_split.iloc[:split_index], y_clean_for_split.iloc[split_index:]
X_train_imputed = pd.DataFrame(imputer_global.fit_transform(X_train_raw), columns=X_train_raw.columns, index=X_train_raw.index)
X_test_imputed = pd.DataFrame(imputer_global.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index)

In [6]:
def calc_metrics(y_true, y_pred):
    mask = pd.notna(y_true) & pd.notna(y_pred)
    y_true_clean, y_pred_clean = y_true[mask], y_pred[mask]
    if len(y_true_clean) == 0: return {'mae': np.nan, 'mse': np.nan, 'rmse': np.nan, 'r2': np.nan, 'mape': np.nan}
    return {'mae': mean_absolute_error(y_true_clean, y_pred_clean), 'mse': mean_squared_error(y_true_clean, y_pred_clean),
            'rmse': np.sqrt(mean_squared_error(y_true_clean, y_pred_clean)), 'r2': r2_score(y_true_clean, y_pred_clean),
            'mape': mean_absolute_percentage_error(y_true_clean, y_pred_clean)}
def report_metrics(mlflow_obj, y_true, y_pred, model_name):
    metrics = calc_metrics(y_true, y_pred)
    print(f"\n--- –ú–µ—Ç—Ä–∏–∫–∏ –¥–ª—è {model_name} ---")
    for k, v in metrics.items():
        print(f"  {k.upper()}: {v:.4f}")
        if mlflow_obj: mlflow_obj.log_metric(f"{k}_{model_name}", v)
    return metrics


In [7]:
class ModelPipelineWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, pipeline):
        self.pipeline = pipeline
    def predict(self, context, model_input):
        return self.pipeline.predict(model_input)

def run_experiment(model, model_name, params, X_train, y_train, X_test, y_test):
    """–ó–∞–ø—É—Å–∫ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞ —Å –ì–ê–†–ê–ù–¢–ò–†–û–í–ê–ù–ù–´–ú –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ–º –º–æ–¥–µ–ª–∏ —á–µ—Ä–µ–∑ pyfunc."""
    print(f"\n--- –ó–∞–ø—É—Å–∫ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞: {model_name} ---")
    
    # –ú—ã –±—É–¥–µ–º –ª–æ–≥–∏—Ä–æ–≤–∞—Ç—å –ø–∞–π–ø–ª–∞–π–Ω (imputer + model)
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')), # –°–æ–∑–¥–∞–µ–º –ø–∞–π–ø–ª–∞–π–Ω –∑–¥–µ—Å—å
        ('model', model)
    ])
    
    try:
        mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
        with mlflow.start_run(run_name=model_name) as run:
            if params:
                mlflow.log_params(params)
            
            # –û–±—É—á–∞–µ–º –ø–∞–π–ø–ª–∞–π–Ω
            pipeline.fit(X_train, y_train)
            
            preds = pipeline.predict(X_test)
            metrics = report_metrics(mlflow, y_test, preds, model_name)
            
            # --- –ì–ê–†–ê–ù–¢–ò–†–û–í–ê–ù–ù–û–ï –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –ú–û–î–ï–õ–ò ---
            print("–õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ —á–µ—Ä–µ–∑ mlflow.pyfunc...")
            wrapped_model = ModelPipelineWrapper(pipeline)
            
            # –°–æ–±–∏—Ä–∞–µ–º –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏
            pip_requirements = [
                'mlflow', 'scikit-learn', 'pandas', 'numpy',
                'xgboost', 'lightgbm', 'catboost', 'cloudpickle'
            ]
            
            mlflow.pyfunc.log_model(
                artifact_path="model_pipeline", # <- –í–∞–∂–Ω–æ–µ –∏–º—è –ø–∞–ø–∫–∏
                python_model=wrapped_model,
                pip_requirements=pip_requirements,
                input_example=X_train.head(5) # –î–æ–±–∞–≤–ª—è–µ–º –ø—Ä–∏–º–µ—Ä –¥–ª—è —Å—Ö–µ–º—ã
            )
            print(f"‚úÖ –ú–æ–¥–µ–ª—å '{model_name}' —É—Å–ø–µ—à–Ω–æ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞.")
            
            # –°–æ—Ö—Ä–∞–Ω—è–µ–º RUN_ID –¥–ª—è –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è –≤ API
            run_id = run.info.run_id
            with open(f"{model_name}_run_id.txt", "w") as f:
                f.write(run_id)
            mlflow.log_artifact(f"{model_name}_run_id.txt")
            print(f"RUN_ID –¥–ª—è —ç—Ç–æ–π –º–æ–¥–µ–ª–∏: {run_id}")

            return pipeline, metrics

    except Exception as e:
        print(f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –∑–∞–ø—É—Å–∫–µ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞ {model_name}: {e}")
        import traceback
        traceback.print_exc()
        return None, None



In [8]:
y_full_series = df['gold close']
y_train_series = y_full_series.loc[X_train_imputed.index]
y_test_series = y_full_series.loc[X_test_imputed.index]


In [9]:
results = {} # –î–ª—è —Ö—Ä–∞–Ω–µ–Ω–∏—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [10]:
def objective_lgbm(trial):
    params = {"n_estimators": trial.suggest_int("n_estimators", 100, 300), "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
              "num_leaves": trial.suggest_int("num_leaves", 20, 50), "max_depth": trial.suggest_int("max_depth", 3, 6),
              "subsample": trial.suggest_float("subsample", 0.6, 1.0), "verbose": -1}
    model = LGBMRegressor(**params)
    pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')), ('model', model)])
    pipe.fit(X_train_raw, y_train_raw)
    preds = pipe.predict(X_test_raw)
    metrics = calc_metrics(y_test_raw, preds)
    return metrics['rmse'] if not np.isnan(metrics['rmse']) else np.inf

print("\nLightGBM: –ü–æ–¥–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ —Å Optuna...")
study_lgbm = optuna.create_study(direction="minimize")
study_lgbm.optimize(objective_lgbm, n_trials=3, timeout=300) 
best_params_lgbm = {**study_lgbm.best_params, 'verbose': -1}
final_model_lgbm = LGBMRegressor(**best_params_lgbm)
model_lgbm, metrics_lgbm = run_experiment(final_model_lgbm, "LightGBM_Optuna", best_params_lgbm, X_train_raw, y_train_raw, X_test_raw, y_test_raw)
if model_lgbm is not None:
    results["LightGBM"] = metrics_lgbm



[I 2025-09-01 23:58:24,490] A new study created in memory with name: no-name-caccfb5f-ae17-42c5-97ae-43ea3337056c



LightGBM: –ü–æ–¥–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ —Å Optuna...


[I 2025-09-01 23:58:26,406] Trial 0 finished with value: 18.594644030876733 and parameters: {'n_estimators': 282, 'learning_rate': 0.029169811614400677, 'num_leaves': 33, 'max_depth': 6, 'subsample': 0.9412549345775878}. Best is trial 0 with value: 18.594644030876733.
[I 2025-09-01 23:58:26,501] Trial 1 finished with value: 18.64718688619543 and parameters: {'n_estimators': 155, 'learning_rate': 0.03250242018391508, 'num_leaves': 44, 'max_depth': 3, 'subsample': 0.867472560212041}. Best is trial 0 with value: 18.594644030876733.
[I 2025-09-01 23:58:26,635] Trial 2 finished with value: 18.51851138688786 and parameters: {'n_estimators': 254, 'learning_rate': 0.026215681949906058, 'num_leaves': 35, 'max_depth': 3, 'subsample': 0.8144085423729818}. Best is trial 2 with value: 18.51851138688786.
2025/09/01 23:58:26 INFO mlflow.tracking.fluent: Experiment with name 'Financial_Gold_Prediction_v2' does not exist. Creating a new experiment.



--- –ó–∞–ø—É—Å–∫ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞: LightGBM_Optuna ---

--- –ú–µ—Ç—Ä–∏–∫–∏ –¥–ª—è LightGBM_Optuna ---
  MAE: 8.8641
  MSE: 342.9353
  RMSE: 18.5185
  R2: 0.3207
  MAPE: 0.0399
–õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ —á–µ—Ä–µ–∑ mlflow.pyfunc...


2025/09/01 23:58:27 INFO mlflow.pyfunc: Inferring model signature from input example


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

‚úÖ –ú–æ–¥–µ–ª—å 'LightGBM_Optuna' —É—Å–ø–µ—à–Ω–æ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞.
RUN_ID –¥–ª—è —ç—Ç–æ–π –º–æ–¥–µ–ª–∏: e71ac7626f8548edab1a342a26e15766
üèÉ View run LightGBM_Optuna at: http://84.201.144.227:8000/#/experiments/10/runs/e71ac7626f8548edab1a342a26e15766
üß™ View experiment at: http://84.201.144.227:8000/#/experiments/10


In [11]:
def objective_catboost(trial):
    params = {"iterations": trial.suggest_int("iterations", 100, 300), "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
              "depth": trial.suggest_int("depth", 3, 6), "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 5, log=True), "verbose": 0}
    model = CatBoostRegressor(**params)
    pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')), ('model', model)])
    pipe.fit(X_train_raw, y_train_raw)
    preds = pipe.predict(X_test_raw)
    metrics = calc_metrics(y_test_raw, preds)
    return metrics['rmse'] if not np.isnan(metrics['rmse']) else np.inf

print("\nCatBoost: –ü–æ–¥–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ —Å Optuna...")
study_catboost = optuna.create_study(direction="minimize")
study_catboost.optimize(objective_catboost, n_trials=3, timeout=300)
best_params_catboost = {**study_catboost.best_params, 'verbose': 0}
final_model_catboost = CatBoostRegressor(**best_params_catboost)
model_catboost, metrics_catboost = run_experiment(final_model_catboost, "CatBoost_Optuna", best_params_catboost, X_train_raw, y_train_raw, X_test_raw, y_test_raw)
if model_catboost is not None:
    results["CatBoost"] = metrics_catboost

[I 2025-09-01 23:58:31,917] A new study created in memory with name: no-name-d6ad6c1c-1927-41ab-9be3-575f41c33f38



CatBoost: –ü–æ–¥–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ —Å Optuna...


[I 2025-09-01 23:58:32,866] Trial 0 finished with value: 21.809602048170145 and parameters: {'iterations': 252, 'learning_rate': 0.020214553356790094, 'depth': 5, 'l2_leaf_reg': 3.423987473922204}. Best is trial 0 with value: 21.809602048170145.
[I 2025-09-01 23:58:34,276] Trial 1 finished with value: 19.905133854386545 and parameters: {'iterations': 232, 'learning_rate': 0.09029065797789809, 'depth': 6, 'l2_leaf_reg': 1.9058785297010918}. Best is trial 1 with value: 19.905133854386545.
[I 2025-09-01 23:58:34,779] Trial 2 finished with value: 20.03023612502932 and parameters: {'iterations': 201, 'learning_rate': 0.056452081921453984, 'depth': 4, 'l2_leaf_reg': 1.099557678619075}. Best is trial 1 with value: 19.905133854386545.



--- –ó–∞–ø—É—Å–∫ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞: CatBoost_Optuna ---


2025/09/01 23:58:36 INFO mlflow.pyfunc: Inferring model signature from input example



--- –ú–µ—Ç—Ä–∏–∫–∏ –¥–ª—è CatBoost_Optuna ---
  MAE: 10.8037
  MSE: 396.2144
  RMSE: 19.9051
  R2: 0.2152
  MAPE: 0.0501
–õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ —á–µ—Ä–µ–∑ mlflow.pyfunc...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

‚úÖ –ú–æ–¥–µ–ª—å 'CatBoost_Optuna' —É—Å–ø–µ—à–Ω–æ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞.
RUN_ID –¥–ª—è —ç—Ç–æ–π –º–æ–¥–µ–ª–∏: f1262bc2f6414c729dbd57a885dc033f
üèÉ View run CatBoost_Optuna at: http://84.201.144.227:8000/#/experiments/10/runs/f1262bc2f6414c729dbd57a885dc033f
üß™ View experiment at: http://84.201.144.227:8000/#/experiments/10


In [12]:
def objective_xgb(trial):
    params = {'objective': 'reg:squarederror', 'n_estimators': trial.suggest_int('n_estimators', 100, 300),
              'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True), 'max_depth': trial.suggest_int('max_depth', 3, 6),
              'subsample': trial.suggest_float('subsample', 0.6, 1.0), 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)}
    model = XGBRegressor(**params)
    pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')), ('model', model)])
    pipe.fit(X_train_raw, y_train_raw)
    preds = pipe.predict(X_test_raw)
    metrics = calc_metrics(y_test_raw, preds)
    return metrics['rmse'] if not np.isnan(metrics['rmse']) else np.inf

print("\nXGBoost: –ü–æ–¥–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ —Å Optuna...")
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=3, timeout=300)
best_params_xgb = study_xgb.best_params
final_model_xgb = XGBRegressor(**best_params_xgb)
model_xgb, metrics_xgb = run_experiment(final_model_xgb, "XGBoost_Optuna", best_params_xgb, X_train_raw, y_train_raw, X_test_raw, y_test_raw)
if model_xgb is not None:
    results["XGBoost"] = metrics_xgb

[I 2025-09-01 23:58:37,161] A new study created in memory with name: no-name-752189bf-f621-4909-9018-fcbd4786fe55



XGBoost: –ü–æ–¥–±–æ—Ä –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ —Å Optuna...


[I 2025-09-01 23:58:37,382] Trial 0 finished with value: 18.18820210617311 and parameters: {'n_estimators': 123, 'learning_rate': 0.043587289889896776, 'max_depth': 4, 'subsample': 0.8392521026834094, 'colsample_bytree': 0.614211066500788}. Best is trial 0 with value: 18.18820210617311.
[I 2025-09-01 23:58:37,705] Trial 1 finished with value: 18.011829838104674 and parameters: {'n_estimators': 138, 'learning_rate': 0.05535073387760788, 'max_depth': 5, 'subsample': 0.6614423904239255, 'colsample_bytree': 0.844667413729015}. Best is trial 1 with value: 18.011829838104674.
[I 2025-09-01 23:58:38,337] Trial 2 finished with value: 18.436301173614442 and parameters: {'n_estimators': 145, 'learning_rate': 0.0315634562791395, 'max_depth': 6, 'subsample': 0.9621398688619633, 'colsample_bytree': 0.9057907436643964}. Best is trial 1 with value: 18.011829838104674.



--- –ó–∞–ø—É—Å–∫ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞: XGBoost_Optuna ---

--- –ú–µ—Ç—Ä–∏–∫–∏ –¥–ª—è XGBoost_Optuna ---
  MAE: 8.5694
  MSE: 324.4260
  RMSE: 18.0118
  R2: 0.3574
  MAPE: 0.0386
–õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ —á–µ—Ä–µ–∑ mlflow.pyfunc...


2025/09/01 23:58:39 INFO mlflow.pyfunc: Inferring model signature from input example


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

‚úÖ –ú–æ–¥–µ–ª—å 'XGBoost_Optuna' —É—Å–ø–µ—à–Ω–æ –∑–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∞.
RUN_ID –¥–ª—è —ç—Ç–æ–π –º–æ–¥–µ–ª–∏: 82d0a09af0d144f3bdc3f7111ea5b099
üèÉ View run XGBoost_Optuna at: http://84.201.144.227:8000/#/experiments/10/runs/82d0a09af0d144f3bdc3f7111ea5b099
üß™ View experiment at: http://84.201.144.227:8000/#/experiments/10


In [13]:
print("\n=========================================\n          –°–í–û–î–ö–ê –†–ï–ó–£–õ–¨–¢–ê–¢–û–í\n=========================================")
if results:
    results_df = pd.DataFrame(results).T
    print(results_df.round(4))
    if not results_df.empty:
        best_model_name = results_df['rmse'].idxmin()
        best_rmse = results_df.loc[best_model_name, 'rmse']
        print(f"\n–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ RMSE: {best_model_name} (RMSE = {best_rmse:.4f})")
else:
    print("–ù–µ—Ç —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ –¥–ª—è –æ—Ç–æ–±—Ä–∞–∂–µ–Ω–∏—è.")
print("\n--- –í—Å–µ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç—ã –∑–∞–≤–µ—Ä—à–µ–Ω—ã ---")
print("–ü—Ä–æ–≤–µ—Ä—å—Ç–µ MLflow UI –ø–æ –∞–¥—Ä–µ—Å—É http://84.201.144.227:8000")


          –°–í–û–î–ö–ê –†–ï–ó–£–õ–¨–¢–ê–¢–û–í
              mae       mse     rmse      r2    mape
LightGBM   8.8641  342.9353  18.5185  0.3207  0.0399
CatBoost  10.8037  396.2144  19.9051  0.2152  0.0501
XGBoost    8.5694  324.4260  18.0118  0.3574  0.0386

–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ RMSE: XGBoost (RMSE = 18.0118)

--- –í—Å–µ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç—ã –∑–∞–≤–µ—Ä—à–µ–Ω—ã ---
–ü—Ä–æ–≤–µ—Ä—å—Ç–µ MLflow UI –ø–æ –∞–¥—Ä–µ—Å—É http://84.201.144.227:8000
