In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load cleaned feature dataset
df = pd.read_csv("../data/processed/modeling_dataset.csv", index_col=0, parse_dates=True)

# Show preview
print(df.head())

                gold    eurusd  treasury_10y       spy       vix       dxy  \
Date                                                                         
2004-01-13 -0.006099  0.003529         4.028 -0.005830  0.072533 -0.000468   
2004-01-14 -0.004720 -0.010551         3.986  0.008351 -0.071508  0.006081   
2004-01-15 -0.031539 -0.003957         3.971  0.002468 -0.071045  0.006161   
2004-01-16 -0.004163 -0.016105         4.014  0.003955 -0.035990  0.013517   
2004-01-19  0.000000 -0.003546         4.014  0.000000  0.000000 -0.000798   

                 oil    target  gold_lag1  gold_lag2  ...  vix_lag3  dxy_lag1  \
Date                                                  ...                       
2004-01-13 -0.008353 -0.004720  -0.000235   0.005660  ...  0.007097  0.005170   
2004-01-14  0.002033 -0.031539  -0.006099  -0.000235  ...  0.073030 -0.000468   
2004-01-15 -0.030725 -0.004163  -0.004720  -0.006099  ...  0.004179  0.006081   
2004-01-16  0.048744  0.000000  -0.031539  -0.00

# Baseline Models & Evaluation Metrics

Simple train-test split: 2003-2015 train, 2016 test (first true out-of-sample year)

In [None]:
# Train-test split: 2003-2015 train, 2016 test
# 2016 is the first true out-of-sample year (all models will be tested on this year)
split_date = '2016-01-01'

train = df[df.index < split_date].copy()
test = df[(df.index >= split_date) & (df.index < '2017-01-01')].copy()

print(f"Training set: {train.index[0].date()} to {train.index[-1].date()} ({len(train)} rows)")
print(f"Test set: {test.index[0].date()} to {test.index[-1].date()} ({len(test)} rows)")

# Separate features and target
X_train = train.drop('target', axis=1)
y_train = train['target']
X_test = test.drop('target', axis=1)
y_test = test['target']

print(f"\nFeatures: {X_train.shape[1]}, Train samples: {len(X_train)}, Test samples: {len(X_test)}")


Training set: 2004-01-13 to 2015-12-31 (3123 rows)
Test set: 2016-01-01 to 2016-12-30 (261 rows)

Features: 42, Train samples: 3123, Test samples: 261


# Baseline 1: Naive Forecast

Predicts tomorrow's return = today's return (simplest possible baseline)

In [22]:
# Naive baseline: use gold_lag1 feature (yesterday's return)
# This properly uses the feature without leaking future data
y_pred_naive = X_test['gold_lag1'].copy()
y_test_aligned = y_test

rmse_naive = np.sqrt(mean_squared_error(y_test_aligned, y_pred_naive))
mae_naive = mean_absolute_error(y_test_aligned, y_pred_naive)

# Directional accuracy: % of time sign matches (up/down prediction)
correct_dir = (np.sign(y_pred_naive) == np.sign(y_test_aligned)).sum()
dir_acc_naive = correct_dir / len(y_test_aligned) * 100

print("=" * 60)
print("NAIVE BASELINE (using gold_lag1 feature)")
print("=" * 60)
print(f"RMSE: {rmse_naive:.6f}")
print(f"MAE:  {mae_naive:.6f}")
print(f"Directional Accuracy: {dir_acc_naive:.2f}%")
print(f"Test samples: {len(y_test_aligned)}")


NAIVE BASELINE (using gold_lag1 feature)
RMSE: 0.013904
MAE:  0.010316
Directional Accuracy: 45.98%
Test samples: 261


# Baseline 2: Simple Moving Average (SMA)

Predicts using 20-day moving average of returns (simple trend-following)

In [None]:
# Simple Moving Average baseline: use static 20-day SMA from training set
# Every test prediction equals the last 20-day SMA calculated on training data

sma_window = 20
last_sma = y_train.rolling(window=sma_window).mean().iloc[-1]

# Predict constant SMA value for all test samples
y_pred_sma = pd.Series([last_sma] * len(y_test), index=y_test.index)

rmse_sma = np.sqrt(mean_squared_error(y_test, y_pred_sma))
mae_sma = mean_absolute_error(y_test, y_pred_sma)

# Directional accuracy
correct_dir_sma = (np.sign(y_pred_sma) == np.sign(y_test)).sum()
dir_acc_sma = correct_dir_sma / len(y_test) * 100

print("=" * 60)
print("SIMPLE MOVING AVERAGE (20-day, static from training)")
print("=" * 60)
print(f"SMA Value: {last_sma:.6f}")
print(f"RMSE: {rmse_sma:.6f}")
print(f"MAE:  {mae_sma:.6f}")
print(f"Directional Accuracy: {dir_acc_sma:.2f}%")
print(f"Test samples: {len(y_test)}")


SIMPLE MOVING AVERAGE (20-day, updated daily)
RMSE: 0.010212
MAE:  0.007377
Directional Accuracy: 45.59%
Test samples: 261


# Baseline 3: ARIMA Model

Univariate autoregressive integrated moving average - statistical time-series forecasting

In [24]:
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')

# ARIMA(1,0,1) baseline: fit on training data, forecast on test set
try:
    model_arima = ARIMA(y_train, order=(1, 0, 1))
    results_arima = model_arima.fit()
    
    # Forecast on test set
    y_pred_arima = results_arima.get_forecast(steps=len(y_test)).predicted_mean
    
    rmse_arima = np.sqrt(mean_squared_error(y_test, y_pred_arima))
    mae_arima = mean_absolute_error(y_test, y_pred_arima)
    
    # Directional accuracy
    correct_dir_arima = (np.sign(y_pred_arima) == np.sign(y_test)).sum()
    dir_acc_arima = correct_dir_arima / len(y_test) * 100
    
    print("=" * 60)
    print("ARIMA(1,0,1)")
    print("=" * 60)
    print(f"RMSE: {rmse_arima:.6f}")
    print(f"MAE:  {mae_arima:.6f}")
    print(f"Directional Accuracy: {dir_acc_arima:.2f}%")
    print(f"Test samples: {len(y_test)}")
    
except Exception as e:
    print(f"ARIMA fit error: {e}")
    print("Skipping ARIMA - may need parameter tuning")


ARIMA(1,0,1)
RMSE: 0.010013
MAE:  0.006982
Directional Accuracy: 46.74%
Test samples: 261


In [26]:
# Compile baseline results
results_baselines = pd.DataFrame({
    'Model': ['Naive (gold_lag1)', 'SMA (20-day)', 'ARIMA(1,0,1)'],
    'RMSE': [rmse_naive, rmse_sma, rmse_arima],
    'MAE': [mae_naive, mae_sma, mae_arima],
    'Dir. Accuracy %': [dir_acc_naive, dir_acc_sma, dir_acc_arima]
})

print("\n" + "=" * 80)
print("BASELINE MODELS EVALUATION METRICS (2016 Test Set - First Out-of-Sample Year)")
print("=" * 80)
print(results_baselines.to_string(index=False))
print("=" * 80)

# Summary stats
print("\nMetric Definitions:")
print("  • RMSE (Root Mean Squared Error): Lower is better. Penalizes large errors.")
print("  • MAE (Mean Absolute Error): Lower is better. Robust to outliers.")
print("  • Dir. Accuracy: % of time model correctly predicts up/down movement (50% = random)")
print("\nNote: 2016 is the first true out-of-sample year (trained on 2004-2015).")
print("All future ML and DL models will be evaluated on this same year for comparability.")



BASELINE MODELS EVALUATION METRICS (2016 Test Set - First Out-of-Sample Year)
            Model     RMSE      MAE  Dir. Accuracy %
Naive (gold_lag1) 0.013904 0.010316        45.977011
     SMA (20-day) 0.010212 0.007377        45.593870
     ARIMA(1,0,1) 0.010013 0.006982        46.743295

Metric Definitions:
  • RMSE (Root Mean Squared Error): Lower is better. Penalizes large errors.
  • MAE (Mean Absolute Error): Lower is better. Robust to outliers.
  • Dir. Accuracy: % of time model correctly predicts up/down movement (50% = random)

Note: 2016 is the first true out-of-sample year (trained on 2004-2015).
All future ML and DL models will be evaluated on this same year for comparability.
