[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/EMQA/blob/main/EMQA_ml_rf/EMQA_ml_rf.ipynb)

# EMQA_ml_rf
Random Forest for energy price prediction: predicts next-day Brent crude log returns from lagged and rolling return features, followed by a feature importance analysis.
**Output:** `ml_rf_importance.pdf`

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plot defaults.
# All three background colours are 'none' so that figures stay transparent
# both on screen and when written to disk.
plt.rcParams.update(dict.fromkeys(
    ('figure.facecolor', 'axes.facecolor', 'savefig.facecolor'),
    'none',
))
# Remaining defaults: transparent export, no grid, no top/right spines,
# base font size and default figure size.
plt.rcParams.update({
    'savefig.transparent': True,
    'axes.grid': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'font.size': 11,
    'figure.figsize': (12, 6),
})

# Named hex colours shared by the plots in this notebook.
COLORS = dict(
    blue='#1A3A6E',
    red='#CD0000',
    green='#2E7D32',
    orange='#E67E22',
    purple='#8E44AD',
    gray='#808080',
    cyan='#00BCD4',
    amber='#B5853F',
)

def save_fig(fig, name):
    """Save ``fig`` to the path ``name`` and print a confirmation line.

    The figure is written with a tight bounding box, a transparent
    background and 300 dpi. Returns ``None``.
    """
    export_options = {'bbox_inches': 'tight', 'transparent': True, 'dpi': 300}
    fig.savefig(name, **export_options)
    print(f"Saved: {name}")


In [None]:
import yfinance as yf

def fetch(ticker, start='2020-01-01', end='2025-12-31'):
    """Download closing prices for ``ticker`` via ``yf.download``.

    Parameters
    ----------
    ticker
        Symbol passed straight through to ``yf.download`` (e.g. ``'BZ=F'``).
    start, end
        Date bounds passed straight through to ``yf.download``.

    Returns
    -------
    The ``'Close'`` data of the download with missing values dropped. When
    the downloaded frame has MultiIndex columns and ``'Close'`` holds a
    single column, that column is returned as a Series.
    """
    d = yf.download(ticker, start=start, end=end, progress=False)
    close = d['Close']
    if isinstance(d.columns, pd.MultiIndex):
        # Collapse the column axis only. A bare ``squeeze()`` would turn a
        # one-row, one-column frame into a scalar, and the ``dropna()``
        # below would then fail.
        close = close.squeeze(axis='columns')
    return close.dropna()


In [None]:
# Build the modelling table: Brent closes -> log returns -> lagged and
# rolling return features -> next-observation target.
brent = fetch('BZ=F', start='2018-01-01')
df = pd.DataFrame({'price': brent})
df['return'] = np.log(df['price'].div(df['price'].shift(1)))

# Past returns as predictors (lags count rows, i.e. observations)
for lag in (1, 2, 3, 7, 14):
    df[f'ret_lag_{lag}'] = df['return'].shift(lag)

# Rolling moments of the return over 5- and 20-observation windows
df['roll_mean_5'] = df['return'].rolling(window=5).mean()
df['roll_std_5'] = df['return'].rolling(window=5).std()
df['roll_mean_20'] = df['return'].rolling(window=20).mean()
df['roll_std_20'] = df['return'].rolling(window=20).std()
df['roll_skew_20'] = df['return'].rolling(window=20).skew()

# Target is the following row's return; rows with any missing value
# (warm-up rows at the start, the final row without a target) are dropped.
df = df.assign(target=df['return'].shift(-1)).dropna()

# Every column except the raw price, the current return and the target
# is used as a feature, in column order.
feature_cols = df.columns.drop(['price', 'return', 'target']).tolist()
X = df[feature_cols].to_numpy()
y = df['target'].to_numpy()

# Chronological split: first 80% of rows for training, last 20% for testing
split = int(0.8 * len(X))
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

print(f"Features: {feature_cols}")
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Fit a 200-tree forest with depth capped at 10 on the training slice.
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the held-out slice
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"MAE: {mae:.6f}")
print(f"R-squared: {r2:.4f}")

# Importances labelled by feature name, smallest first
importances = rf.feature_importances_
feat_imp = pd.Series(data=importances, index=feature_cols).sort_values()

In [None]:
# Horizontal bar chart of the feature importances held in `feat_imp`,
# saved to 'ml_rf_importance.pdf'.
fig, ax = plt.subplots(figsize=(10, 7))

# The 75th-percentile cut-off is the same for every bar, so compute it once
# instead of once per bar. Bars at or above it are drawn red, the rest blue.
top_quartile = feat_imp.quantile(0.75)
colors = [COLORS['blue'] if v < top_quartile else COLORS['red'] for v in feat_imp.values]
ax.barh(feat_imp.index, feat_imp.values, color=colors, edgecolor='white', height=0.6)
ax.set_xlabel('Feature Importance')
ax.set_title(f'Random Forest Feature Importance (MAE={mae:.5f}, $R^2$={r2:.3f})')

# Print each value just to the right of its bar; bar i sits at y-position i.
for i, val in enumerate(feat_imp.values):
    ax.text(val + 0.002, i, f'{val:.3f}', va='center', fontsize=9)

plt.tight_layout()
save_fig(fig, 'ml_rf_importance.pdf')
plt.show()