[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/EMQA/blob/main/EMQA_actual_vs_predicted/EMQA_actual_vs_predicted.ipynb)

# EMQA_actual_vs_predicted

Actual vs. predicted evaluation of an ensemble price forecast: a time-series overlay with a 95% confidence band, and an actual-vs-predicted scatter plot.

**Output:** `ml_actual_vs_predicted.pdf`


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Transparent canvas, both on screen and in saved files.
plt.rcParams.update({
    'figure.facecolor': 'none',
    'axes.facecolor': 'none',
    'savefig.facecolor': 'none',
    'savefig.transparent': True,
})

# Minimal axes: no grid, top and right spines hidden.
plt.rcParams.update({
    'axes.grid': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Default text size and figure dimensions.
plt.rcParams.update({
    'font.size': 11,
    'figure.figsize': (12, 6),
})

# Named hex colours shared by the plots in this notebook.
COLORS = {
    'blue': '#1A3A6E',
    'red': '#CD0000',
    'green': '#2E7D32',
    'orange': '#E67E22',
    'purple': '#8E44AD',
    'gray': '#808080',
    'cyan': '#00BCD4',
    'amber': '#B5853F',
}

def save_fig(fig, name, dpi=300):
    """Write ``fig`` to ``name`` with a tight bounding box and transparent background.

    Parameters
    ----------
    fig : object
        Anything exposing ``savefig`` (a matplotlib figure in this notebook).
    name : str or path-like
        Destination passed straight through to ``fig.savefig``.
    dpi : int or float, optional
        Output resolution forwarded to ``fig.savefig``. Defaults to 300, the
        value that was previously hard-coded.

    Returns
    -------
    None
        The function only saves the figure and prints a confirmation line.
    """
    fig.savefig(name, bbox_inches='tight', transparent=True, dpi=dpi)
    print(f"Saved: {name}")


In [None]:
import os

def load_ro_de(paths=None):
    """Load the RO/DE price table from the first candidate CSV that exists.

    Parameters
    ----------
    paths : str, path-like, or iterable of those, optional
        Candidate file locations, tried in order. When omitted, the original
        two locations are used: the repository-relative path first, then the
        author's local absolute path.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV with the ``date`` column parsed as dates and used
        as the index.

    Raises
    ------
    FileNotFoundError
        If none of the candidates is an existing file. The message lists every
        location that was tried.
    """
    if paths is None:
        paths = [
            '../../charts/ro_de_prices_full.csv',
            # Author-machine fallback kept for backward compatibility; it will
            # not exist elsewhere, so pass `paths` to point at your own copy.
            '/Users/danielpele/Documents/Energy MBA/charts/ro_de_prices_full.csv',
        ]
    elif isinstance(paths, (str, os.PathLike)):
        # A single path would otherwise be iterated character by character.
        paths = [paths]

    candidates = [os.fspath(p) for p in paths]
    for p in candidates:
        # isfile (not exists) so a directory with a matching name is skipped
        # instead of failing inside read_csv.
        if os.path.isfile(p):
            return pd.read_csv(p, parse_dates=['date'], index_col='date')
    raise FileNotFoundError(
        "ro_de_prices_full.csv not found; searched: " + ", ".join(candidates)
    )

# Read the price table into `df` (date-indexed DataFrame returned by load_ro_de).
df = load_ro_de()


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Build the modelling table from the `df` loaded in the previous cell
# (no second read of the CSV is needed; `df` itself is left untouched).
# Rows with a missing RO or DE price are dropped, then the frame is sorted by
# date with a stable sort: the shifts/rolling windows below are row-based, so
# they are only true lags when rows are in ascending chronological order.
data = df[['ro_price', 'de_price']].dropna().sort_index(kind='mergesort').copy()
data['target'] = data['ro_price']

# Lagged features: the RO and DE prices from `lag` rows earlier
for lag in [1, 2, 7, 14, 30]:
    data[f'ro_lag_{lag}'] = data['ro_price'].shift(lag)
for lag in [1, 7]:
    data[f'de_lag_{lag}'] = data['de_price'].shift(lag)

# Rolling stats: shift(1) first so each window ends at the previous row and
# never includes the price being predicted
for w in [7, 14, 30]:
    data[f'ro_ma_{w}'] = data['ro_price'].shift(1).rolling(w).mean()
    data[f'ro_std_{w}'] = data['ro_price'].shift(1).rolling(w).std()

# Temporal: calendar features derived from the date index
data['dow'] = data.index.dayofweek
data['month'] = data.index.month
data['weekend'] = (data.index.dayofweek >= 5).astype(int)

# The first rows have incomplete lags/windows (NaN) and are discarded
data = data.dropna()
# Every column except the target and the raw same-day prices is a feature
feature_cols = [c for c in data.columns if c not in ['target', 'ro_price', 'de_price']]

print(f"Dataset: {len(data)} rows, {len(feature_cols)} features")
print(f"Features: {feature_cols}")


In [None]:
# --- Fit both tree ensembles on the first 70% of rows and average their forecasts ---
split = int(len(data) * 0.7)
train_part, test_part = data.iloc[:split], data.iloc[split:]
X_train, y_train = train_part[feature_cols], train_part['target']
X_test, y_test = test_part[feature_cols], test_part['target']

# Random forest
rf = RandomForestRegressor(
    n_estimators=200, max_depth=10, random_state=42, n_jobs=-1
).fit(X_train, y_train)
rf_pred = pd.Series(rf.predict(X_test), index=X_test.index)

# Gradient boosting
gb = GradientBoostingRegressor(
    n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42
).fit(X_train, y_train)
gb_pred = pd.Series(gb.predict(X_test), index=X_test.index)

# Equal-weight ensemble of the two models
ens_pred = rf_pred.add(gb_pred).div(2)

# Hold-out error summary
residuals = y_test.sub(ens_pred)
std_resid = residuals.std()
mae = mean_absolute_error(y_test, ens_pred)
r2 = r2_score(y_test, ens_pred)
print(f"Ensemble MAE: {mae:.2f}, R2: {r2:.4f}, Residual Std: {std_resid:.2f}")


In [None]:
# --- 2-panel chart: (A) recent time series with error band, (B) actual-vs-predicted scatter ---
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (A) Time series of the last `last_n` test days (fewer if the test set is shorter)
last_n = 60
y_last = y_test.iloc[-last_n:]
pred_last = ens_pred.iloc[-last_n:]

# 95% CI: prediction +/- 1.96 standard deviations of the test-set residuals
ci_upper = pred_last + 1.96 * std_resid
ci_lower = pred_last - 1.96 * std_resid

ax = axes[0]
ax.plot(y_last.index, y_last.values, color=COLORS['blue'], lw=1.8, label='Actual')
ax.plot(pred_last.index, pred_last.values, color=COLORS['red'], lw=1.8, ls='--', label='Ensemble Prediction')
ax.fill_between(pred_last.index, ci_lower, ci_upper,
                color=COLORS['red'], alpha=0.12, label='95% CI')
# The title reports the number of days actually drawn, so it cannot drift from
# `last_n` or overstate the window when the test set is shorter than requested.
ax.set_title(f'(A) Last {len(y_last)} Test Days: Actual vs Predicted', fontsize=13, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Price (EUR/MWh)')
ax.tick_params(axis='x', rotation=30)
# Legend sits below the axes so it does not cover the curves
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), frameon=False, ncol=3)

# (B) Scatter plot of every test observation
ax2 = axes[1]
ax2.scatter(y_test.values, ens_pred.values, color=COLORS['blue'], alpha=0.3, s=15, edgecolors='none')

# Perfect prediction line: the diagonal spanning the joint range of actuals and predictions
lims = [min(y_test.min(), ens_pred.min()), max(y_test.max(), ens_pred.max())]
ax2.plot(lims, lims, color=COLORS['red'], ls='--', lw=1.5, label='Perfect Prediction')

# Stats box with the ensemble's hold-out R^2 and MAE, anchored in axes coordinates
textstr = f'R$^2$ = {r2:.3f}\nMAE = {mae:.1f} EUR/MWh'
props = dict(boxstyle='round,pad=0.4', facecolor='white', alpha=0.8, edgecolor=COLORS['gray'])
ax2.text(0.05, 0.95, textstr, transform=ax2.transAxes, fontsize=11,
         verticalalignment='top', bbox=props)

ax2.set_title('(B) Scatter: Actual vs Predicted', fontsize=13, fontweight='bold')
ax2.set_xlabel('Actual Price (EUR/MWh)')
ax2.set_ylabel('Predicted Price (EUR/MWh)')
ax2.legend(loc='upper center', bbox_to_anchor=(0.5, -0.10), frameon=False)

fig.suptitle('Ensemble Model Evaluation', fontsize=15, fontweight='bold', y=1.02)
fig.tight_layout()
save_fig(fig, 'ml_actual_vs_predicted.pdf')
plt.show()
