[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/EMQA/blob/main/EMQA_ml_rf/EMQA_ml_rf.ipynb)

# EMQA_ml_rf
Random Forest rolling 1-step-ahead forecast with bootstrap confidence intervals for Brent crude oil.
**Output:** `ml_rf_importance.pdf`

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide matplotlib defaults, set group by group: see-through
# figure/axes/savefig backgrounds, no grid, no top/right spines,
# font size 11 and a (12, 6) default figure size.
plt.rc('figure', facecolor='none', figsize=(12, 6))
plt.rc('axes', facecolor='none', grid=False)
plt.rc('axes.spines', top=False, right=False)
plt.rc('savefig', facecolor='none', transparent=True)
plt.rc('font', size=11)

# Named hex colours used by the plotting cells below.
COLORS = dict(
    blue='#1A3A6E',
    red='#CD0000',
    green='#2E7D32',
    orange='#E67E22',
    purple='#8E44AD',
    gray='#808080',
    cyan='#00BCD4',
    amber='#B5853F',
)

def save_fig(fig, name):
    """Save *fig* under the path *name* and print a confirmation line.

    The figure is written with a tight bounding box, a transparent
    background and dpi=300. Returns None.
    """
    export_options = {
        'bbox_inches': 'tight',
        'transparent': True,
        'dpi': 300,
    }
    fig.savefig(name, **export_options)
    print(f"Saved: {name}")

In [None]:
import yfinance as yf

def fetch(ticker, start='2020-01-01', end='2025-12-31'):
    """Download the 'Close' prices for *ticker* via yfinance.

    Parameters
    ----------
    ticker : ticker symbol passed straight to ``yf.download``.
    start, end : date bounds passed straight to ``yf.download``.

    Returns
    -------
    The 'Close' data with missing values dropped. For a single ticker
    this is a pandas Series, whether or not the download came back with
    MultiIndex columns.
    """
    d = yf.download(ticker, start=start, end=end, progress=False)
    close = d['Close']
    if isinstance(close, pd.DataFrame):
        # MultiIndex columns yield a one-column frame per ticker. Squeeze
        # only the column axis: a bare .squeeze() would also collapse a
        # single-row download to a scalar, which has no .dropna().
        close = close.squeeze(axis='columns')
    return close.dropna()

# Modelling frame: Brent close, its log return, the predictors and the
# one-row-ahead return used as the target.
brent = fetch('BZ=F', start='2018-01-01')
df = pd.DataFrame({'price': brent})
previous_price = df['price'].shift(1)
df['return'] = np.log(df['price'] / previous_price)
log_ret = df['return']

# Predictors are inserted in a fixed column order: returns lagged by
# 1..5 rows, then rolling std (windows 5, 20), then rolling mean (5, 20).
for n_back in (1, 2, 3, 4, 5):
    df[f'ret_lag_{n_back}'] = log_ret.shift(n_back)

for window in (5, 20):
    df[f'roll_vol_{window}'] = log_ret.rolling(window).std()
for window in (5, 20):
    df[f'roll_mean_{window}'] = log_ret.rolling(window).mean()

# Target: the return of the following row. Rows with any missing value
# (lag/rolling warm-up at the start, the final row's target) are dropped.
df['target'] = log_ret.shift(-1)
df = df.dropna()

non_feature_cols = ('price', 'return', 'target')
feature_cols = [col for col in df.columns if col not in non_feature_cols]
print(f"Features: {feature_cols}")
print(f"Total observations: {len(df)}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Expanding-window, 1-step-ahead forecast of the next return.
# The forest is refit every `retrain_every` rows on all rows before the
# refit point, then scores the rows up to the next refit. Each scored row
# only uses a model trained on strictly earlier rows.
init_train = int(len(df) * 0.7)
retrain_every = 20

# Pull the arrays out of the frame once instead of re-slicing it per step.
X_all = df[feature_cols].values
y_all = df['target'].values
n_obs = len(df)

if init_train < 1:
    raise ValueError(
        f"Need at least 2 observations to train and forecast; got {n_obs}"
    )

predictions = []
ci_lower_list = []
ci_upper_list = []

rf = None
all_importances = []

for start in range(init_train, n_obs, retrain_every):
    # Refit on everything observed before `start`.
    X_train = X_all[:start]
    y_train = y_all[:start]
    rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    all_importances.append(rf.feature_importances_)

    # Score the whole block served by this model in one call; predictions
    # are computed row by row, so this matches scoring each row separately.
    X_block = X_all[start:start + retrain_every]
    predictions.extend(rf.predict(X_block))

    # Band = 2.5th/97.5th percentile of the individual trees' predictions
    # for each row (ensemble disagreement; no residual noise is added).
    tree_preds = np.array([tree.predict(X_block) for tree in rf.estimators_])
    ci_lo, ci_hi = np.percentile(tree_preds, [2.5, 97.5], axis=0)
    ci_lower_list.extend(ci_lo)
    ci_upper_list.extend(ci_hi)

# Convert to arrays
predictions = np.array(predictions)
ci_lower_arr = np.array(ci_lower_list)
ci_upper_arr = np.array(ci_upper_list)
actuals = y_all[init_train:].copy()
prices_prev = df['price'].values[init_train:].copy()  # price at forecast origin

# Each forecast made at row t targets the return realised on the NEXT
# trading day, so the price-level series below belong to that next date.
# Look it up in the full price index (the last one was dropped from `df`).
next_trading_day = pd.Series(brent.index[1:], index=brent.index[:-1])
dates = pd.DatetimeIndex(next_trading_day.loc[df.index[init_train:]])

# Convert return forecasts + CI back to price levels
price_pred   = prices_prev * np.exp(predictions)
price_ci_lo  = prices_prev * np.exp(ci_lower_arr)
price_ci_hi  = prices_prev * np.exp(ci_upper_arr)
price_actual = prices_prev * np.exp(actuals)

# Average feature importances across all retraining rounds
avg_importance = np.mean(all_importances, axis=0)

# Metrics (all computed on returns, not on price levels)
r2  = r2_score(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
rmse = np.sqrt(mean_squared_error(actuals, predictions))
direction_acc = np.mean(np.sign(predictions) == np.sign(actuals))

print(f"R-squared:          {r2:.4f}")
print(f"MAE:                {mae:.6f}")
print(f"RMSE:               {rmse:.6f}")
print(f"Direction Accuracy: {direction_acc:.2%}")

In [None]:
# Plot 1: price implied by the actual return vs. price implied by the RF
# forecast, with the band spanned by price_ci_lo / price_ci_hi.
fig, ax = plt.subplots(figsize=(12, 6))

# Artists are added in the same order as before so the legend order holds.
actual_style = dict(color=COLORS['blue'], linewidth=1.5, label='Actual Price')
forecast_style = dict(color=COLORS['red'], linewidth=1.5, linestyle='--', label='RF Forecast')
band_style = dict(color=COLORS['red'], alpha=0.12, label='95% CI (tree bootstrap)')

ax.plot(dates, price_actual, **actual_style)
ax.plot(dates, price_pred, **forecast_style)
ax.fill_between(dates, price_ci_lo, price_ci_hi, **band_style)

headline = 'Brent Crude: Rolling RF 1-Step-Ahead Forecast\n'
stats_line = f'R$^2$={r2:.3f}, MAE={mae:.5f}, Dir. Acc.={direction_acc:.1%}'
ax.set_xlabel('Date')
ax.set_ylabel('Price (USD/bbl)')
ax.set_title(headline + stats_line)

# Legend sits below the axes, three entries in a row, without a frame.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.10), frameon=False, ncol=3)

fig.tight_layout()
plt.show()

In [None]:
# Plot 2: horizontal bar chart of the averaged feature importances,
# sorted ascending so the largest bar ends up at the top.
feat_imp = pd.Series(avg_importance, index=feature_cols).sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(10, 7))

# Bars at or above the 75th percentile of importance are drawn in red.
# The cut-off is computed once rather than once per bar.
top_quartile_cut = feat_imp.quantile(0.75)
colors = [COLORS['blue'] if v < top_quartile_cut else COLORS['red'] for v in feat_imp.values]
ax.barh(feat_imp.index, feat_imp.values, color=colors, edgecolor='white', height=0.6)
ax.set_xlabel('Feature Importance (avg. across retraining rounds)')
ax.set_title(f'Random Forest Feature Importance\n'
             f'Rolling forecast: R$^2$={r2:.3f}, MAE={mae:.5f}')

# Print each value just to the right of its bar.
for bar_pos, val in enumerate(feat_imp.values):
    ax.text(val + 0.002, bar_pos, f'{val:.3f}', va='center', fontsize=9)

plt.tight_layout()
save_fig(fig, 'ml_rf_importance.pdf')
plt.show()