[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/EMQA/blob/main/EMQA_feature_importance/EMQA_feature_importance.ipynb)

# EMQA_feature_importance

Feature importance from a Random Forest model trained on Romanian electricity
price **changes**. Includes a comparison of feature sets: DE (German) prices
vs. gas prices (and both combined) as the cross-market signal.

**Output:** `ml_feature_importance.pdf`

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress every warning category so the notebook output stays uncluttered.
warnings.filterwarnings('ignore')

# Shared matplotlib defaults: transparent backgrounds, no grid, and only the
# left and bottom axes spines drawn.
PLOT_STYLE = {
    'figure.facecolor': 'none',
    'axes.facecolor': 'none',
    'savefig.facecolor': 'none',
    'savefig.transparent': True,
    'axes.grid': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'font.size': 11,
    'figure.figsize': (12, 6),
}
plt.rcParams.update(PLOT_STYLE)

# Named hex colours reused by the plots in this notebook.
COLORS = {
    'blue': '#1A3A6E',
    'red': '#CD0000',
    'green': '#2E7D32',
    'orange': '#E67E22',
    'purple': '#8E44AD',
    'gray': '#808080',
    'cyan': '#00BCD4',
    'amber': '#B5853F',
}

def save_fig(fig, name):
    """Write ``fig`` to the path ``name`` and print a confirmation line.

    The figure is saved with a tight bounding box, a transparent background
    and 300 dpi.
    """
    export_options = {'bbox_inches': 'tight', 'transparent': True, 'dpi': 300}
    fig.savefig(name, **export_options)
    print(f"Saved: {name}")


In [None]:
# CSV with the price data, hosted next to this notebook in the QuantLet repo.
url = 'https://raw.githubusercontent.com/QuantLet/EMQA/main/EMQA_feature_importance/ro_de_prices_full.csv'
# Parse 'date' as datetimes and use it as the index: later cells read
# `index.dayofweek` / `index.month` and rely on row order for shift/diff/rolling.
# Columns read further down include ro_price, de_price, gas_price,
# ro_temp_mean, ro_consumption and ro_residual_load.
ro = pd.read_csv(url, parse_dates=['date'], index_col='date')
print(f'Loaded {len(ro)} observations')


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# --- Assemble the feature matrix; the target is the price CHANGE ---
data = ro.copy()

# Series that several features below are derived from.
price_series = data['ro_price']
price_change = price_series.diff()
price_prev = price_series.shift(1)

data['target'] = price_change  # predict daily price change

# Lagged RO price levels and lagged RO price changes, interleaved per lag
for lag in [1, 2, 3, 5, 7, 14, 30]:
    data[f'ro_lag_{lag}'] = price_series.shift(lag)
    data[f'ro_diff_lag_{lag}'] = price_change.shift(lag)

# Rolling mean / std over windows that end at the previous row
for w in [7, 14, 30]:
    prev_window = price_prev.rolling(w)
    data[f'ro_ma_{w}'] = prev_window.mean()
    data[f'ro_std_{w}'] = prev_window.std()

# German (DE) price lags and the previous row's RO-DE spread
de_series = data['de_price']
for lag in [1, 7]:
    data[f'de_lag_{lag}'] = de_series.shift(lag)
data['spread_lag1'] = price_prev - de_series.shift(1)

# Previous row's temperature plus heating / cooling degree terms around 18
temp_prev = data['ro_temp_mean'].shift(1)
data['ro_temp_lag1'] = temp_prev
data['hdd'] = (18 - temp_prev).clip(lower=0)
data['cdd'] = (temp_prev - 18).clip(lower=0)

# Lagged consumption and residual load
consumption = data['ro_consumption']
data['consumption_lag1'] = consumption.shift(1)
data['consumption_lag7'] = consumption.shift(7)
data['residual_load_lag1'] = data['ro_residual_load'].shift(1)

# Calendar features taken from the datetime index
day_of_week = data.index.dayofweek
data['dow'] = day_of_week
data['month'] = data.index.month
data['weekend'] = (day_of_week >= 5).astype(int)

# Keep only rows where every column (raw and derived) is present.
data = data.dropna()

# The target and the raw same-row columns are not predictors; every other
# column is used as a feature.
exclude = ['target', 'ro_price', 'de_price', 'gas_price',
           'de_temp_mean', 'de_temp_max', 'de_temp_min',
           'ro_temp_mean', 'ro_temp_max', 'ro_temp_min',
           'ro_nuclear', 'ro_hydro', 'ro_coal', 'ro_gas',
           'ro_wind', 'ro_solar', 'ro_consumption', 'ro_residual_load']
excluded_names = set(exclude)
feature_cols = [col for col in data.columns if col not in excluded_names]

print(f"Dataset: {len(data)} rows, {len(feature_cols)} features")
print(f"Features: {feature_cols}")

In [None]:
# --- Train RF on price changes and extract feature importances ---
# Positional split: the first 70% of rows form the training set. The remaining
# rows are not used in this cell.
split = int(len(data) * 0.7)
X_train = data[feature_cols].iloc[:split]
y_train = data['target'].iloc[:split]

# Fixed random_state keeps the fit (and the printed importances) reproducible.
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Sorted ascending so that tail(12) holds the 12 largest importances with the
# largest one last; the bar chart in the next cell plots them in this order.
importance = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=True)
top12 = importance.tail(12)

# Print exact percentages (referenced in lecture slides)
print("=== Feature Importances (top 12) ===")
# Re-sorted descending here so the printout lists the most important feature first.
for name, val in importance.sort_values(ascending=False).head(12).items():
    print(f"  {name:20s} {val*100:.1f}%")

# Map each feature name to the colour of its category
def feature_color(name):
    """Return the ``COLORS`` entry for the category that ``name`` belongs to.

    Cross-market names (containing ``'de_'`` or ``'spread'``) are checked
    first, then the temporal, fundamentals and weather name lists; anything
    else is treated as a Romanian price lag/statistic.
    """
    if 'spread' in name.lower() or 'de_' in name:
        return COLORS['green']   # Cross-market

    named_groups = (
        (('dow', 'month', 'weekend'), 'orange'),                                     # Temporal
        (('consumption_lag1', 'consumption_lag7', 'residual_load_lag1'), 'purple'),  # Fundamentals
        (('ro_temp_lag1', 'hdd', 'cdd'), 'cyan'),                                    # Weather
    )
    for members, colour_key in named_groups:
        if name in members:
            return COLORS[colour_key]

    return COLORS['blue']        # Romanian lags/stats

# One colour per bar, chosen by feature category.
bar_colors = [feature_color(f) for f in top12.index]

# Horizontal bars: top12 is sorted ascending, so the most important feature
# ends up at the top of the chart.
fig, ax = plt.subplots(figsize=(12, 7))
bars = ax.barh(range(len(top12)), top12.values, color=bar_colors, alpha=0.85,
               edgecolor='white', lw=1)

ax.set_yticks(range(len(top12)))
ax.set_yticklabels(top12.index, fontsize=11)
ax.set_xlabel('Importance', fontsize=12)
# The dash is written as an escape (U+2014, em dash) so the title cannot be
# corrupted by a file-encoding round trip again.
ax.set_title('Top 12 Feature Importances \u2014 Price Change Prediction (Random Forest)',
             fontsize=14, fontweight='bold')

# Add value labels just to the right of each bar
for i, val in enumerate(top12.values):
    ax.text(val + 0.002, i, f'{val:.3f}', va='center', fontsize=10)

# Legend for color coding (proxy patches, since bars are coloured individually)
import matplotlib.patches as mpatches
legend_items = [
    mpatches.Patch(color=COLORS['blue'], label='RO price lags/stats'),
    mpatches.Patch(color=COLORS['green'], label='Cross-market (DE)'),
    mpatches.Patch(color=COLORS['cyan'], label='Weather'),
    mpatches.Patch(color=COLORS['purple'], label='Fundamentals'),
    mpatches.Patch(color=COLORS['orange'], label='Temporal'),
]
# Place the legend below the axes so it does not cover any bars.
ax.legend(handles=legend_items, loc='upper center', bbox_to_anchor=(0.5, -0.10),
          frameon=False, ncol=5, fontsize=10)

fig.tight_layout()
save_fig(fig, 'ml_feature_importance.pdf')
plt.show()

In [None]:
# === Feature set comparison: DE vs Gas vs both ===
# Build alternative feature sets with gas instead of DE

def build_variant(ro, cross_market='de'):
    """Build the modelling frame and feature list for one cross-market choice.

    Parameters
    ----------
    ro : DataFrame with a datetime index and at least the columns
        ``ro_price``, ``de_price``, ``gas_price``, ``ro_temp_mean``,
        ``ro_consumption`` and ``ro_residual_load``. It is not modified.
    cross_market : ``'de'``, ``'gas'`` or ``'both'`` to select which
        cross-market price lags and spread are added. Any other value adds
        no cross-market features.

    Returns
    -------
    (frame, feats) : the NaN-free frame (target, raw and derived columns) and
        the list of derived feature column names in creation order.
    """
    frame = ro.copy()

    # Series reused by several derived columns.
    price = frame['ro_price']
    change = price.diff()
    price_prev = price.shift(1)

    frame['target'] = change

    # Lagged price levels and lagged price changes, interleaved per lag.
    for lag in (1, 2, 3, 5, 7, 14, 30):
        frame[f'ro_lag_{lag}'] = price.shift(lag)
        frame[f'ro_diff_lag_{lag}'] = change.shift(lag)

    # Rolling mean / std over windows ending at the previous row.
    for window in (7, 14, 30):
        rolled = price_prev.rolling(window)
        frame[f'ro_ma_{window}'] = rolled.mean()
        frame[f'ro_std_{window}'] = rolled.std()

    # Cross-market block(s): DE columns are always created before gas columns.
    wants_de = cross_market in ('de', 'both')
    wants_gas = cross_market in ('gas', 'both')
    if wants_de:
        de_price = frame['de_price']
        for lag in (1, 7):
            frame[f'de_lag_{lag}'] = de_price.shift(lag)
        frame['de_spread_lag1'] = price_prev - de_price.shift(1)
    if wants_gas:
        gas_price = frame['gas_price']
        for lag in (1, 7):
            frame[f'gas_lag_{lag}'] = gas_price.shift(lag)
        frame['gas_spread_lag1'] = price_prev - gas_price.shift(1)

    # Previous row's temperature and degree terms around 18.
    temp_prev = frame['ro_temp_mean'].shift(1)
    frame['ro_temp_lag1'] = temp_prev
    frame['hdd'] = (18 - temp_prev).clip(lower=0)
    frame['cdd'] = (temp_prev - 18).clip(lower=0)

    # Lagged consumption and residual load.
    consumption = frame['ro_consumption']
    frame['consumption_lag1'] = consumption.shift(1)
    frame['consumption_lag7'] = consumption.shift(7)
    frame['residual_load_lag1'] = frame['ro_residual_load'].shift(1)

    # Calendar features from the index.
    weekday = frame.index.dayofweek
    frame['dow'] = weekday
    frame['month'] = frame.index.month
    frame['weekend'] = (weekday >= 5).astype(int)

    frame = frame.dropna()

    # Target and raw same-row columns are never used as predictors.
    non_features = {
        'target', 'ro_price', 'de_price', 'gas_price',
        'de_temp_mean', 'de_temp_max', 'de_temp_min',
        'ro_temp_mean', 'ro_temp_max', 'ro_temp_min',
        'ro_nuclear', 'ro_hydro', 'ro_coal', 'ro_gas',
        'ro_wind', 'ro_solar', 'ro_consumption', 'ro_residual_load',
    }
    feats = [col for col in frame.columns if col not in non_features]
    return frame, feats

def rolling_evaluate(data, feats, label, init_frac=0.6, retrain_every=30):
    """Expanding-window Random Forest evaluation on price changes.

    The model is refit on all rows before the current position every
    ``retrain_every`` rows and predicts the price change for the rows up to
    the next refit. Predicted changes are converted to price levels and
    compared with the realised level and with a naive "no change" forecast.

    Parameters
    ----------
    data : DataFrame containing ``feats``, ``'target'`` (the price change of
        each row) and ``'ro_price'`` (the price level of each row), without
        NaNs in those columns.
    feats : list of feature column names.
    label : text shown at the start of the printed result line.
    init_frac : fraction of rows used for the first training window.
    retrain_every : number of evaluated rows between refits.

    Returns
    -------
    dict with ``'MAE'`` (level MAE of the model), ``'R2_OOS'``
    (1 - MSE_model / MSE_naive; NaN if the naive MSE is zero) and
    ``'Direction'`` (share of rows, in percent, where the sign test
    ``change > 0`` agrees between prediction and outcome).

    Raises
    ------
    ValueError
        If ``retrain_every`` is smaller than 1, or if the split leaves no
        training rows or no evaluation rows.
    """
    n_rows = len(data)
    init_train = int(n_rows * init_frac)
    if retrain_every < 1:
        raise ValueError("retrain_every must be at least 1")
    if init_train < 1 or init_train >= n_rows:
        raise ValueError(
            f"Need at least one training and one evaluation row "
            f"(rows={n_rows}, initial training rows={init_train})"
        )

    # Pull the arrays once instead of slicing the DataFrame on every step.
    X = data[feats].values
    y = data['target'].values
    prices = data['ro_price'].values

    # Between two refits the model is unchanged, so the whole window is
    # predicted in a single call rather than one row (and one thread-pool
    # dispatch, given n_jobs=-1) at a time.
    pred_chunks = []
    for start in range(init_train, n_rows, retrain_every):
        rf = RandomForestRegressor(n_estimators=200, max_depth=10,
                                   random_state=42, n_jobs=-1)
        rf.fit(X[:start], y[:start])
        pred_chunks.append(rf.predict(X[start:start + retrain_every]))
    pred_chg = np.concatenate(pred_chunks)
    act_chg = y[init_train:]

    # The previous price is derived from each row's own level and change, so
    # it stays correct even if the row that preceded it in the raw data is no
    # longer present in `data` (e.g. removed by dropna()).
    a = prices[init_train:]   # realised levels
    n = a - act_chg           # naive forecast: previous level
    p = n + pred_chg          # model forecast: previous level + predicted change

    mae = mean_absolute_error(a, p)
    naive_mae = mean_absolute_error(a, n)
    naive_mse = mean_squared_error(a, n)
    # A constant price over the evaluation window makes the naive MSE zero;
    # report NaN instead of dividing by zero.
    r2_oos = 1 - mean_squared_error(a, p) / naive_mse if naive_mse > 0 else float('nan')
    direc = float(np.mean((pred_chg > 0) == (act_chg > 0))) * 100

    print(f"  {label:25s}  MAE={mae:.1f}  R2_OOS={r2_oos*100:.1f}%  Dir={direc:.0f}%  (Naive={naive_mae:.1f})")
    return {'MAE': mae, 'R2_OOS': r2_oos, 'Direction': direc}

# Level correlations of the RO price with each candidate cross-market signal.
print("Correlations:")
print(f"  RO-DE:  {ro['ro_price'].corr(ro['de_price']):.3f}")
print(f"  RO-Gas: {ro['ro_price'].corr(ro['gas_price']):.3f}")

print("\n=== Feature Set Comparison (RF, price-change, expanding window) ===")
# One frame / feature list per cross-market choice.
d_de, f_de = build_variant(ro, 'de')
d_gas, f_gas = build_variant(ro, 'gas')
d_both, f_both = build_variant(ro, 'both')

# Each call prints its own result line and returns the metrics dict.
res_de = rolling_evaluate(d_de, f_de, "DE features")
res_gas = rolling_evaluate(d_gas, f_gas, "Gas features")
res_both = rolling_evaluate(d_both, f_both, "DE + Gas features")

print("\n=== Summary ===")
print(f"  DE only:    R2_OOS = {res_de['R2_OOS']*100:.1f}%, MAE = {res_de['MAE']:.1f}")
print(f"  Gas only:   R2_OOS = {res_gas['R2_OOS']*100:.1f}%, MAE = {res_gas['MAE']:.1f}")
print(f"  DE + Gas:   R2_OOS = {res_both['R2_OOS']*100:.1f}%, MAE = {res_both['MAE']:.1f}")
# NOTE(review): the two statements below are fixed text, not derived from the
# numbers above; re-check them whenever the data or the model changes.
print("\nGas features give comparable performance to DE features.")
print("Combining both adds little: DE prices already incorporate gas costs.")