[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/EMQA/blob/main/EMQA_model_comparison/EMQA_model_comparison.ipynb)

# EMQA_model_comparison

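Compares forecasting models (a naive lag-1 baseline, Random Forest, Gradient Boosting, and a simple-average ensemble of the two tree models) on Romanian electricity price data, reporting MAE and R² on a chronological 70/30 train/test split.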

**Output:** `ml_model_comparison.pdf`


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session so cell output stays clean.
warnings.filterwarnings('ignore')

# Global matplotlib styling, grouped by purpose.
# Transparent backgrounds, both on screen and in saved files.
plt.rcParams.update({
    'figure.facecolor': 'none',
    'axes.facecolor': 'none',
    'savefig.facecolor': 'none',
    'savefig.transparent': True,
})
# Minimal axes: no grid, no top/right spines.
plt.rcParams.update({
    'axes.grid': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
})
# Typography and default canvas size.
plt.rcParams.update({
    'font.size': 11,
    'figure.figsize': (12, 6),
})

# Named hex colours shared by the notebook's charts.
COLORS = dict(
    blue='#1A3A6E',
    red='#CD0000',
    green='#2E7D32',
    orange='#E67E22',
    purple='#8E44AD',
    gray='#808080',
    cyan='#00BCD4',
    amber='#B5853F',
)

def save_fig(fig, name):
    """Write *fig* to the file *name* and print a confirmation line.

    The figure is saved with a tight bounding box, a transparent
    background and 300 dpi. Returns None.
    """
    export_options = {'bbox_inches': 'tight', 'transparent': True, 'dpi': 300}
    fig.savefig(name, **export_options)
    print(f"Saved: {name}")


In [None]:
# Download the price CSV from the QuantLet repository; the 'date' column is
# parsed as datetimes and becomes the index.
url = (
    'https://raw.githubusercontent.com/QuantLet/EMQA/main/'
    'EMQA_model_comparison/ro_de_prices_full.csv'
)
ro = pd.read_csv(url, index_col='date', parse_dates=['date'])
print(f'Loaded {len(ro)} observations')


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Build the modelling table from the price frame `ro` loaded from the CSV in
# the previous cell. The earlier call to load_ro_de() referenced a helper that
# is not defined anywhere in this notebook and raised NameError.
df = ro
data = df[['ro_price', 'de_price']].dropna().copy()
data['target'] = data['ro_price']

# Lagged features: past prices only. shift() counts rows, so a lag of k means
# k days only if the index is a gap-free daily series (assumed, not checked).
for lag in [1, 2, 7, 14, 30]:
    data[f'ro_lag_{lag}'] = data['ro_price'].shift(lag)
for lag in [1, 7]:
    data[f'de_lag_{lag}'] = data['de_price'].shift(lag)

# Rolling stats: shift(1) first so each window ends at the previous row and
# the current row's price never leaks into its own features.
for w in [7, 14, 30]:
    data[f'ro_ma_{w}'] = data['ro_price'].shift(1).rolling(w).mean()
    data[f'ro_std_{w}'] = data['ro_price'].shift(1).rolling(w).std()

# Temporal features from the datetime index (dayofweek: Monday=0 ... Sunday=6,
# so >= 5 flags Saturday/Sunday).
data['dow'] = data.index.dayofweek
data['month'] = data.index.month
data['weekend'] = (data.index.dayofweek >= 5).astype(int)

# Drop the warm-up rows whose lags/rolling windows are still incomplete.
data = data.dropna()
# Everything except the target and the raw same-row prices is a model input.
feature_cols = [c for c in data.columns if c not in ['target', 'ro_price', 'de_price']]

print(f"Dataset: {len(data)} rows, {len(feature_cols)} features")
print(f"Features: {feature_cols}")


In [None]:
# --- Train/test split (time series: 70/30) ---
# Chronological cut: the first 70% of rows train the models, the rest is held
# out, so no shuffling takes place.
split = int(len(data) * 0.7)
features_all = data[feature_cols]
target_all = data['target']
X_train, y_train = features_all.iloc[:split], target_all.iloc[:split]
X_test, y_test = features_all.iloc[split:], target_all.iloc[split:]

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# --- Models ---
# Naive baseline: the previous row's price is the forecast for the current row.
lagged_price = data['ro_price'].shift(1)
naive_pred = lagged_price.iloc[split:].reindex(y_test.index)

# Random Forest (fit() returns the fitted estimator, so it can be chained).
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
rf_pred = pd.Series(rf.predict(X_test), index=y_test.index)

# Gradient Boosting
gb = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)
gb_pred = pd.Series(gb.predict(X_test), index=y_test.index)

# Simple average ensemble: equal-weight mean of the two tree-model forecasts.
ens_pred = (rf_pred + gb_pred) * 0.5

# --- Metrics ---
# Display label -> forecast series; insertion order is the order used for
# printing and, later, for plotting.
models = dict(zip(
    ('Naive (lag-1)', 'Random Forest', 'GradientBoosting', 'Ensemble (Avg)'),
    (naive_pred, rf_pred, gb_pred, ens_pred),
))


def _score(pred):
    """Return (MAE, R2) of *pred* against y_test on rows where both are present."""
    mask = pred.notna() & y_test.notna()
    actual, forecast = y_test[mask], pred[mask]
    return mean_absolute_error(actual, forecast), r2_score(actual, forecast)


results = {}
for name, pred in models.items():
    mae, r2 = _score(pred)
    results[name] = {'MAE': mae, 'R2': r2}
    print(f"{name:20s}  MAE={mae:.2f}  R2={r2:.4f}")


In [None]:
# --- Chart: 1x2 MAE and R2 bars ---
model_names = list(results)
maes = [scores['MAE'] for scores in results.values()]
r2s = [scores['R2'] for scores in results.values()]
bar_colors = [COLORS[key] for key in ('gray', 'green', 'purple', 'red')]


def _labelled_bars(axis, heights, labels, lift):
    """Draw one bar per model on *axis* and write each label *lift* above its bar."""
    rects = axis.bar(model_names, heights, color=bar_colors, alpha=0.8,
                     edgecolor='white', lw=1.5)
    for rect, label in zip(rects, labels):
        axis.text(rect.get_x() + rect.get_width()/2, rect.get_height() + lift,
                  label, ha='center', va='bottom', fontsize=11, fontweight='bold')
    return rects


fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax, ax2 = axes

# (A) MAE
bars = _labelled_bars(ax, maes, [f'{val:.1f}' for val in maes], 0.3)
ax.set_title('(A) Mean Absolute Error (lower is better)', fontsize=13, fontweight='bold')
ax.set_ylabel('MAE (EUR/MWh)')
ax.tick_params(axis='x', rotation=20)

# (B) R2
bars2 = _labelled_bars(ax2, r2s, [f'{val:.3f}' for val in r2s], 0.005)
ax2.set_title('(B) R-squared (higher is better)', fontsize=13, fontweight='bold')
ax2.set_ylabel('R$^2$')
ax2.tick_params(axis='x', rotation=20)

# y > 1 lifts the overall title above the panel titles.
fig.suptitle('Model Comparison: Romanian Electricity Price Forecasting',
             fontsize=15, fontweight='bold', y=1.02)
fig.tight_layout()
save_fig(fig, 'ml_model_comparison.pdf')
plt.show()
