[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/EMQA/blob/main/EMQA_ml_monitor/EMQA_ml_monitor.ipynb)

# EMQA_ml_monitor

Model monitoring and drift detection for ML forecasting in electricity markets.
Demonstrates techniques to detect when a deployed model needs retraining.

**Key Concepts:**
1. **Performance Monitoring**: Track error metrics over time
2. **Data Drift**: Detect changes in input feature distributions
3. **Concept Drift**: Detect changes in the target-feature relationship
4. **Recalibration Triggers**: When to retrain the model

**Output:** `ml_monitor.pdf`

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide matplotlib defaults: transparent figure/axes/savefig
# backgrounds, no grid, no top/right spines, 11pt font, 12x6 inch figures.
_background = 'none'
_plot_style = {
    'figure.facecolor': _background,
    'axes.facecolor': _background,
    'savefig.facecolor': _background,
    'savefig.transparent': True,
    'axes.grid': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'font.size': 11,
    'figure.figsize': (12, 6),
}
plt.rcParams.update(_plot_style)

# Named hex colours shared by the figures in this notebook.
COLORS = dict(
    blue='#1A3A6E',
    red='#CD0000',
    green='#2E7D32',
    orange='#E67E22',
    purple='#8E44AD',
    gray='#808080',
    cyan='#00BCD4',
    amber='#B5853F',
)

def save_fig(fig, name):
    """Save ``fig`` to ``name`` and print a confirmation line.

    The figure is written through ``fig.savefig`` with a tight bounding
    box, a transparent background and 300 dpi.
    """
    export_options = dict(bbox_inches='tight', transparent=True, dpi=300)
    fig.savefig(name, **export_options)
    print(f"Saved: {name}")

In [None]:
# Load data and train a model for monitoring demonstration
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fetch the price history; the 'date' column becomes the (parsed) index.
url = 'https://raw.githubusercontent.com/QuantLet/EMQA/main/EMQA_actual_vs_predicted/ro_de_prices_full.csv'
df = pd.read_csv(url, parse_dates=['date'], index_col='date')
print(f'Loaded {len(df)} observations')

# Feature engineering.
# The target is the price found one row ahead of the current row.
data = df[['ro_price']].copy()
data['target'] = data['ro_price'].shift(-1)

# Price lags (in rows).
for lag in (1, 2, 3, 7, 14):
    data[f'ro_lag_{lag}'] = data['ro_price'].shift(lag)

# Rolling mean / std, computed on the price shifted by one row so the
# current row's own price never enters its window.
prev_price = data['ro_price'].shift(1)
for w in (7, 14, 30):
    trailing = prev_price.rolling(w)
    data[f'ro_ma_{w}'] = trailing.mean()
    data[f'ro_std_{w}'] = trailing.std()

# Calendar features taken from the datetime index.
data['day_of_week'] = data.index.dayofweek
data['month'] = data.index.month

# Optional cross-market feature: one-row lag of the DE price, if present.
if 'de_price' in df:
    data['de_lag_1'] = df['de_price'].shift(1)

# Drop rows left incomplete by the shifts and rolling windows.
data = data.dropna()

# Everything except the target and the raw price is a model input.
_non_features = {'target', 'ro_price'}
feature_cols = [c for c in data.columns if c not in _non_features]
print(f"Features: {len(feature_cols)}")

## 1. Performance Monitoring

Track model performance over rolling windows to detect degradation.
A model trained on historical data may lose predictive power as market conditions change.

In [None]:
# Chronological 50/50 split: the first half fits the model, the second
# half is the monitoring period on which it is evaluated.
train_end = len(data) // 2
monitor_data = data.iloc[train_end:]

X_train, y_train = data[feature_cols].iloc[:train_end], data['target'].iloc[:train_end]
X_monitor, y_monitor = monitor_data[feature_cols], monitor_data['target']

# Reference model, fitted once and never refreshed during monitoring.
model = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Error convention used throughout the notebook: actual minus predicted.
predictions = model.predict(X_monitor)
errors = y_monitor.to_numpy() - predictions

print(f"Training period: {data.index[0].date()} to {data.index[train_end-1].date()}")
print(f"Monitoring period: {monitor_data.index[0].date()} to {monitor_data.index[-1].date()}")
print(f"Monitoring observations: {len(monitor_data)}")

In [None]:
# Rolling error metrics over the monitoring period (30-observation windows).
window = 30

# One date-indexed error series feeds all three rolling metrics.
error_series = pd.Series(errors, index=monitor_data.index)
rolling_mae = error_series.abs().rolling(window).mean()
rolling_rmse = np.sqrt((error_series ** 2).rolling(window).mean())
rolling_bias = error_series.rolling(window).mean()  # Mean error (bias)

# Baseline: the model's error on the data it was fitted on (in-sample).
train_pred = model.predict(X_train)
baseline_mae = mean_absolute_error(y_train, train_pred)
baseline_rmse = np.sqrt(mean_squared_error(y_train, train_pred))

print(f"Baseline MAE (training): {baseline_mae:.2f} EUR/MWh")
print(f"Baseline RMSE (training): {baseline_rmse:.2f} EUR/MWh")

In [None]:
# Visualize performance monitoring: three stacked panels sharing the date axis.
fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

# Panel 1: Rolling MAE with baseline and alert threshold (1.5x baseline)
ax1 = axes[0]
ax1.plot(rolling_mae.index, rolling_mae.values, color=COLORS['blue'], lw=1.5, label='Rolling MAE (30d)')
ax1.axhline(baseline_mae, color=COLORS['green'], ls='--', lw=1.5, label=f'Baseline MAE = {baseline_mae:.1f}')
ax1.axhline(baseline_mae * 1.5, color=COLORS['red'], ls='--', lw=1.5, label='Alert Threshold (1.5x)')

# Shade the area under the curve wherever the rolling MAE exceeds the threshold
alert_mask = rolling_mae > baseline_mae * 1.5
ax1.fill_between(rolling_mae.index, 0, rolling_mae.values,
                 where=alert_mask, color=COLORS['red'], alpha=0.2)

ax1.set_ylabel('MAE (EUR/MWh)')
ax1.set_title('Performance Monitoring: Rolling MAE', fontsize=13, fontweight='bold')
ax1.legend(loc='upper right', frameon=False)

# Panel 2: Rolling RMSE against the training baseline
ax2 = axes[1]
ax2.plot(rolling_rmse.index, rolling_rmse.values, color=COLORS['purple'], lw=1.5, label='Rolling RMSE (30d)')
ax2.axhline(baseline_rmse, color=COLORS['green'], ls='--', lw=1.5, label=f'Baseline RMSE = {baseline_rmse:.1f}')
ax2.set_ylabel('RMSE (EUR/MWh)')
ax2.set_title('Performance Monitoring: Rolling RMSE', fontsize=13, fontweight='bold')
ax2.legend(loc='upper right', frameon=False)

# Panel 3: Rolling bias (systematic under/over prediction).
# Errors are defined as actual minus predicted, so a positive mean error
# means the model predicts too low (under-prediction) and a negative one
# means it predicts too high (over-prediction).
ax3 = axes[2]
ax3.fill_between(rolling_bias.index, 0, rolling_bias.values,
                 where=rolling_bias >= 0, color=COLORS['green'], alpha=0.5, label='Under-prediction')
ax3.fill_between(rolling_bias.index, 0, rolling_bias.values,
                 where=rolling_bias < 0, color=COLORS['red'], alpha=0.5, label='Over-prediction')
ax3.axhline(0, color=COLORS['gray'], ls='-', lw=1)

ax3.set_xlabel('Date')
ax3.set_ylabel('Bias (EUR/MWh)')
ax3.set_title('Performance Monitoring: Rolling Bias (Mean Error)', fontsize=13, fontweight='bold')
ax3.legend(loc='upper right', frameon=False)

fig.suptitle('Model Performance Monitoring Dashboard', fontsize=15, fontweight='bold', y=1.01)
fig.tight_layout()
plt.show()

## 2. Data Drift Detection

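Data drift occurs when the distribution of input features changes over time.
We quantify it with the Population Stability Index (PSI) and the two-sample Kolmogorov-Smirnov (KS) test, comparing each monitoring window against the training period.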

In [None]:
def calculate_psi(reference, current, bins=10):
    """
    Calculate Population Stability Index (PSI) for drift detection.
    PSI < 0.1: No significant drift
    PSI 0.1-0.2: Moderate drift (monitor)
    PSI > 0.2: Significant drift (action required)

    Args:
        reference: 1-D array-like of baseline values; its percentiles
            define the bin edges.
        current: 1-D array-like of values to compare against the baseline.
        bins: Number of percentile bins requested (fewer are used when the
            reference percentiles contain duplicates).

    Returns:
        The PSI as a float: sum over bins of
        (cur_pct - ref_pct) * ln(cur_pct / ref_pct).
    """
    reference = np.asarray(reference, dtype=float)
    current = np.asarray(current, dtype=float)

    # Bin edges come from the reference percentiles; duplicates are removed.
    breakpoints = np.percentile(reference, np.linspace(0, 100, bins + 1))
    breakpoints = np.unique(breakpoints)

    # Replace the outermost edges (reference min/max) with -inf/+inf so that
    # current values falling outside the reference range are counted in the
    # first/last bin instead of being dropped by np.histogram. Reference
    # counts are unaffected, since every reference value lies inside its own
    # min/max. A constant reference collapses to a single (-inf, inf) bin.
    edges = np.concatenate(([-np.inf], breakpoints[1:-1], [np.inf]))

    # Add 1 to every bin count to avoid log(0) and division by zero.
    ref_counts = np.histogram(reference, bins=edges)[0] + 1
    cur_counts = np.histogram(current, bins=edges)[0] + 1

    ref_pct = ref_counts / ref_counts.sum()
    cur_pct = cur_counts / cur_counts.sum()

    # PSI formula
    psi = np.sum((cur_pct - ref_pct) * np.log(cur_pct / ref_pct))
    return psi

def ks_test_drift(reference, current):
    """
    Two-sample Kolmogorov-Smirnov comparison of ``reference`` and ``current``.
    Returns: the ``scipy.stats.ks_2samp`` result, which unpacks as
    (KS statistic, p-value)
    """
    ks_result = stats.ks_2samp(reference, current)
    return ks_result

In [None]:
# Drift of key features: training period vs. each quarter of the monitoring
# period. The monitoring rows are cut into four consecutive slices; the
# last slice also takes any remainder rows.
quarter_size = len(monitor_data) // 4
quarters = []
for k in range(4):
    start = k * quarter_size
    stop = start + quarter_size if k < 3 else None
    quarters.append((f'Q{k + 1}', monitor_data.iloc[start:stop]))

# Features whose distributions are compared
key_features = ['ro_lag_1', 'ro_ma_7', 'ro_std_7', 'day_of_week']

drift_results = []
for feat in key_features:
    ref_data = X_train[feat].values
    for q_name, q_data in quarters:
        cur_data = q_data[feat].values
        psi = calculate_psi(ref_data, cur_data)
        ks_stat, ks_pval = ks_test_drift(ref_data, cur_data)

        # 'Yes' on a large PSI or a significant KS test, 'Monitor' on a
        # moderate PSI, otherwise 'No'.
        if psi > 0.2 or ks_pval < 0.01:
            verdict = 'Yes'
        elif psi > 0.1:
            verdict = 'Monitor'
        else:
            verdict = 'No'

        drift_results.append(dict(
            Feature=feat,
            Period=q_name,
            PSI=psi,
            KS_Stat=ks_stat,
            KS_pval=ks_pval,
            Drift=verdict,
        ))

drift_df = pd.DataFrame(drift_results)
print("Data Drift Analysis:")
print(drift_df.pivot(index='Feature', columns='Period', values='Drift'))

In [None]:
# Visualize data drift using PSI heatmap
# Two side-by-side panels: a feature-by-quarter PSI heatmap (left) and a
# training-vs-monitoring histogram for one feature (right).
# Rows of psi_pivot are features, columns are the monitoring quarters.
psi_pivot = drift_df.pivot(index='Feature', columns='Period', values='PSI')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Panel 1: PSI Heatmap
ax1 = axes[0]
# Reversed red-yellow-green map: low PSI is green, high PSI is red.
# The color scale is clipped to the range [0, 0.3].
im = ax1.imshow(psi_pivot.values, cmap='RdYlGn_r', aspect='auto', vmin=0, vmax=0.3)

# One tick per quarter (x) and per feature (y), labelled from the pivot.
ax1.set_xticks(range(len(psi_pivot.columns)))
ax1.set_xticklabels(psi_pivot.columns)
ax1.set_yticks(range(len(psi_pivot.index)))
ax1.set_yticklabels(psi_pivot.index)

# Add value annotations
# Text is white on cells with PSI above 0.15 and black otherwise.
for i in range(len(psi_pivot.index)):
    for j in range(len(psi_pivot.columns)):
        val = psi_pivot.values[i, j]
        color = 'white' if val > 0.15 else 'black'
        ax1.text(j, i, f'{val:.2f}', ha='center', va='center', color=color, fontsize=10)

ax1.set_title('Population Stability Index (PSI) by Feature', fontsize=13, fontweight='bold')

# Colorbar
cbar = plt.colorbar(im, ax=ax1, shrink=0.8)
cbar.set_label('PSI')

# Panel 2: Distribution comparison for one feature
# Overlaid density histograms: training set vs. the full monitoring period.
ax2 = axes[1]
feat_to_show = 'ro_lag_1'

ax2.hist(X_train[feat_to_show], bins=30, alpha=0.6, color=COLORS['blue'], 
         density=True, label='Training', edgecolor='white')
ax2.hist(X_monitor[feat_to_show], bins=30, alpha=0.6, color=COLORS['red'],
         density=True, label='Monitoring', edgecolor='white')

ax2.set_xlabel(f'{feat_to_show} (EUR/MWh)')
ax2.set_ylabel('Density')
ax2.set_title(f'Distribution Shift: {feat_to_show}', fontsize=13, fontweight='bold')
ax2.legend(loc='upper right', frameon=False)

# Add KS test result
# The test uses the whole monitoring period (not a single quarter); the
# text box is positioned in axes coordinates at the top-left corner.
ks_stat, ks_pval = ks_test_drift(X_train[feat_to_show].values, X_monitor[feat_to_show].values)
ax2.text(0.05, 0.95, f'KS stat: {ks_stat:.3f}\np-value: {ks_pval:.4f}', 
         transform=ax2.transAxes, fontsize=10, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

fig.suptitle('Data Drift Detection', fontsize=15, fontweight='bold', y=1.02)
fig.tight_layout()
plt.show()

## 3. Concept Drift Detection

Concept drift occurs when the relationship between features and target changes.
We detect this by monitoring:
- The distribution of prediction residuals
- The stability of feature importances
- CUSUM control charts on the prediction errors

In [None]:
def cusum_control(errors, threshold=5.0, drift_k=0.5, baseline_n=100):
    """
    CUSUM (Cumulative Sum) control chart for drift detection.
    Detects systematic shifts in error distribution.

    Errors are standardized with the mean and standard deviation of the
    first ``baseline_n`` observations. Index 0 is the chart's starting
    state (both sums are 0); accumulation begins at index 1.

    Args:
        errors: 1-D array-like of prediction errors in time order.
        threshold: An alert is raised wherever either sum exceeds this value.
        drift_k: Slack subtracted at every step, in baseline standard
            deviations.
        baseline_n: Number of leading observations used to estimate the
            baseline mean and standard deviation (must be >= 1).

    Returns:
        cusum_pos: Upper CUSUM (detects positive shift)
        cusum_neg: Lower CUSUM (detects negative shift)
        alerts: Boolean array of drift alerts

    Raises:
        ValueError: If ``baseline_n`` is smaller than 1.
    """
    if baseline_n < 1:
        raise ValueError("baseline_n must be at least 1")

    # Work on a plain float array so positional indexing below is safe even
    # when a pandas Series with a non-integer index is passed in.
    errors = np.asarray(errors, dtype=float)
    n = len(errors)

    cusum_pos = np.zeros(n)
    cusum_neg = np.zeros(n)
    if n == 0:
        return cusum_pos, cusum_neg, np.zeros(0, dtype=bool)

    # Standardize using initial period
    baseline = errors[:baseline_n]
    center = np.mean(baseline)
    scale = np.std(baseline)
    if not np.isfinite(scale) or scale == 0:
        # A constant baseline would otherwise divide by zero and produce
        # inf/NaN sums; fall back to unit scale (raw deviations).
        scale = 1.0
    errors_standardized = (errors - center) / scale

    for i in range(1, n):
        cusum_pos[i] = max(0, cusum_pos[i-1] + errors_standardized[i] - drift_k)
        cusum_neg[i] = max(0, cusum_neg[i-1] - errors_standardized[i] - drift_k)

    alerts = (cusum_pos > threshold) | (cusum_neg > threshold)
    return cusum_pos, cusum_neg, alerts

# Run the CUSUM chart on the monitoring-period errors (alert level 4.0)
cusum_pos, cusum_neg, drift_alerts = cusum_control(errors, threshold=4.0)

print(f"Drift alerts detected: {drift_alerts.sum()} out of {len(drift_alerts)} observations")

# Report the date of the earliest alert, if any alert fired
alert_positions = np.flatnonzero(drift_alerts)
if alert_positions.size > 0:
    first_alert = monitor_data.index[alert_positions[0]]
    print(f"First drift alert: {first_alert.date()}")

In [None]:
# Visualize CUSUM control chart
# Two stacked panels sharing the date axis: raw prediction errors with
# control limits (top) and the two CUSUM statistics (bottom).
fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

# Panel 1: Prediction errors
ax1 = axes[0]
ax1.plot(monitor_data.index, errors, color=COLORS['blue'], lw=0.8, alpha=0.7)
ax1.axhline(0, color=COLORS['gray'], ls='-', lw=1)
# Control limits: mean +/- 2 standard deviations of the first 100 errors.
ax1.axhline(np.mean(errors[:100]) + 2*np.std(errors[:100]), color=COLORS['orange'], ls='--', lw=1, label='+2 std')
ax1.axhline(np.mean(errors[:100]) - 2*np.std(errors[:100]), color=COLORS['orange'], ls='--', lw=1, label='-2 std')

# Highlight drift periods
# The band spans the y-limits as they are at this point, i.e. after the
# error line and the control limits have been drawn.
ax1.fill_between(monitor_data.index, ax1.get_ylim()[0], ax1.get_ylim()[1],
                 where=drift_alerts, color=COLORS['red'], alpha=0.2, label='Drift Alert')

ax1.set_ylabel('Prediction Error (EUR/MWh)')
ax1.set_title('Prediction Errors with Control Limits', fontsize=13, fontweight='bold')
ax1.legend(loc='upper right', frameon=False)

# Panel 2: CUSUM chart
ax2 = axes[1]
ax2.plot(monitor_data.index, cusum_pos, color=COLORS['green'], lw=1.5, label='CUSUM+ (positive shift)')
ax2.plot(monitor_data.index, cusum_neg, color=COLORS['red'], lw=1.5, label='CUSUM- (negative shift)')
# NOTE: 4.0 is hard-coded here; keep it equal to the threshold that was
# used when drift_alerts was computed.
ax2.axhline(4.0, color=COLORS['gray'], ls='--', lw=1.5, label='Alert Threshold')

ax2.set_xlabel('Date')
ax2.set_ylabel('CUSUM Value')
ax2.set_title('CUSUM Control Chart for Concept Drift', fontsize=13, fontweight='bold')
ax2.legend(loc='upper left', frameon=False)

fig.suptitle('Concept Drift Detection using CUSUM', fontsize=15, fontweight='bold', y=1.01)
fig.tight_layout()
plt.show()

## 4. Feature Importance Stability

Monitor how feature importances change over time.
Large shifts indicate the model may be relying on unstable relationships.

In [None]:
# Calculate feature importance for different time periods
def get_importance_for_period(X, y):
    """Fit a random forest on (X, y) and return its feature importances.

    The result is a pandas Series indexed by the column names of ``X``.
    """
    forest_params = dict(n_estimators=50, max_depth=6, random_state=42, n_jobs=-1)
    forest = RandomForestRegressor(**forest_params)
    forest.fit(X, y)
    importances = forest.feature_importances_
    return pd.Series(importances, index=X.columns)

# Importances from a forest fitted on the training period
imp_train = get_importance_for_period(X_train, y_train)

# Importances from a separate forest fitted on each monitoring quarter
imp_quarters = {
    q_name: get_importance_for_period(q_data[feature_cols], q_data['target'])
    for q_name, q_data in quarters
}

# One column per period: 'Training' first, then the quarters in order
imp_df = imp_train.to_frame('Training')
for period_name, period_imp in imp_quarters.items():
    imp_df[period_name] = period_imp

# Rank 1 = most important; compare training ranks with the last quarter
period_ranks = imp_df[['Training', 'Q4']].rank(ascending=False)
rank_train = period_ranks['Training']
rank_q4 = period_ranks['Q4']
rank_change = (rank_train - rank_q4).abs()

print("Feature Importance Stability (rank change from Training to Q4):")
print(rank_change.sort_values(ascending=False).head(10))

In [None]:
# Visualize feature importance evolution: grouped bars, one group per
# period and one bar per feature (the 8 most important in training).
top_features = imp_df['Training'].nlargest(8).index.tolist()
imp_subset = imp_df.loc[top_features]

fig, ax = plt.subplots(figsize=(12, 7))

x = np.arange(len(imp_subset.columns))
width = 0.10

for i, feat in enumerate(top_features):
    # Bars are centre-aligned, so offsets must run symmetrically around 0
    # (from -(n-1)/2 to +(n-1)/2 bar widths) for each group to sit centred
    # on its period tick.
    offset = (i - (len(top_features) - 1) / 2) * width
    ax.bar(x + offset, imp_subset.loc[feat].values, width, label=feat, alpha=0.85)

ax.set_xticks(x)
ax.set_xticklabels(imp_subset.columns)
ax.set_xlabel('Period')
ax.set_ylabel('Feature Importance')
ax.set_title('Feature Importance Evolution Over Time', fontsize=14, fontweight='bold')
# Legend goes below the axes, in four columns.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.12), frameon=False, ncol=4)

plt.tight_layout()
plt.show()

## 5. Recalibration Trigger Summary

Combine all monitoring signals into a dashboard for decision-making.

In [None]:
# Per-quarter roll-up of the three monitoring signals and the resulting action
summary_results = []

for q_name, q_data in quarters:
    # Select this quarter's rows inside the monitoring-period arrays
    q_idx = q_data.index
    q_mask = monitor_data.index.isin(q_idx)
    q_errors = errors[q_mask]

    # Performance: error level and its ratio to the training baseline
    q_mae = np.abs(q_errors).mean()
    q_rmse = np.sqrt(np.mean(q_errors ** 2))
    mae_ratio = q_mae / baseline_mae

    # Data drift: worst PSI over the analysed features in this quarter
    quarter_psi = drift_df.loc[drift_df['Period'] == q_name, 'PSI'].values
    max_psi = quarter_psi.max() if quarter_psi.size > 0 else 0

    # Concept drift: number of CUSUM alerts raised in this quarter
    q_alerts = drift_alerts[q_mask].sum()

    # A trigger fires when its signal crosses its threshold
    checks = (
        ('Performance', mae_ratio > 1.5),
        ('Data Drift', max_psi > 0.2),
        ('Concept Drift', q_alerts > len(q_errors) * 0.1),
    )
    triggers = [label for label, fired in checks if fired]

    summary_results.append({
        'Period': q_name,
        'Date Range': f"{q_idx.min().strftime('%Y-%m')} to {q_idx.max().strftime('%Y-%m')}",
        'MAE': q_mae,
        'MAE Ratio': mae_ratio,
        'Max PSI': max_psi,
        'CUSUM Alerts': q_alerts,
        'Triggers': ', '.join(triggers) if triggers else 'None',
        'Action': 'RETRAIN' if triggers else 'Monitor'
    })

summary_df = pd.DataFrame(summary_results)

_rule = "=" * 90
print("\n" + _rule)
print("   MODEL MONITORING SUMMARY")
print(_rule)
print(summary_df.to_string(index=False))
print(_rule)

In [None]:
# Final monitoring dashboard visualization
# 2x2 grid: MAE ratio, max PSI and CUSUM alert counts per quarter, plus a
# text panel listing the recommended action. The figure is saved to
# 'ml_monitor.pdf' before being shown.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

periods = summary_df['Period'].tolist()
x = np.arange(len(periods))

# Color by action
# Red bars for quarters flagged RETRAIN, green otherwise; the same colors
# are reused in all three bar panels.
action_colors = [COLORS['red'] if a == 'RETRAIN' else COLORS['green'] for a in summary_df['Action']]

# Panel 1: MAE Ratio
ax1 = axes[0, 0]
bars = ax1.bar(x, summary_df['MAE Ratio'], color=action_colors, alpha=0.85, edgecolor='white')
# Reference lines: 1.0 = same MAE as the training baseline, 1.5 = alert level.
ax1.axhline(1.0, color=COLORS['green'], ls='--', lw=1.5, label='Baseline')
ax1.axhline(1.5, color=COLORS['red'], ls='--', lw=1.5, label='Alert Threshold')
ax1.set_xticks(x)
ax1.set_xticklabels(periods)
ax1.set_ylabel('MAE Ratio (vs Training)')
ax1.set_title('Performance Degradation', fontsize=13, fontweight='bold')
ax1.legend(loc='upper right', frameon=False)

# Panel 2: Max PSI
ax2 = axes[0, 1]
bars = ax2.bar(x, summary_df['Max PSI'], color=action_colors, alpha=0.85, edgecolor='white')
# Reference lines at the PSI 'monitor' (0.1) and 'alert' (0.2) levels.
ax2.axhline(0.1, color=COLORS['orange'], ls='--', lw=1.5, label='Monitor')
ax2.axhline(0.2, color=COLORS['red'], ls='--', lw=1.5, label='Alert')
ax2.set_xticks(x)
ax2.set_xticklabels(periods)
ax2.set_ylabel('Maximum PSI')
ax2.set_title('Data Drift (PSI)', fontsize=13, fontweight='bold')
ax2.legend(loc='upper right', frameon=False)

# Panel 3: CUSUM Alerts
ax3 = axes[1, 0]
bars = ax3.bar(x, summary_df['CUSUM Alerts'], color=action_colors, alpha=0.85, edgecolor='white')
ax3.set_xticks(x)
ax3.set_xticklabels(periods)
ax3.set_ylabel('Number of Alerts')
ax3.set_title('Concept Drift Alerts', fontsize=13, fontweight='bold')

# Panel 4: Action Summary (traffic light)
# Text-only panel, so the axes frame and ticks are hidden.
ax4 = axes[1, 1]
ax4.axis('off')

# Create text summary
# One line per quarter ("[OK]" for Monitor, "[!!]" for RETRAIN), followed
# by the triggers that fired, if any.
summary_text = "MODEL HEALTH SUMMARY\n" + "="*35 + "\n\n"
for _, row in summary_df.iterrows():
    status = "OK" if row['Action'] == 'Monitor' else "ALERT"
    symbol = "[OK]" if status == "OK" else "[!!]"
    summary_text += f"{symbol} {row['Period']}: {row['Action']}\n"
    if row['Triggers'] != 'None':
        summary_text += f"    Triggers: {row['Triggers']}\n"

# Footer listing the thresholds; these strings are static text and are
# not derived from the values used in the trigger logic.
summary_text += "\n" + "="*35 + "\n"
summary_text += "\nRecalibration Thresholds:\n"
summary_text += "  - MAE Ratio > 1.5\n"
summary_text += "  - Max PSI > 0.2\n"
summary_text += "  - CUSUM Alerts > 10%\n"

# Monospace font keeps the status symbols and indentation aligned.
ax4.text(0.1, 0.95, summary_text, transform=ax4.transAxes, fontsize=11,
         verticalalignment='top', fontfamily='monospace',
         bbox=dict(boxstyle='round', facecolor='white', alpha=0.9, edgecolor=COLORS['gray']))

fig.suptitle('Model Monitoring Dashboard', fontsize=15, fontweight='bold', y=1.01)
fig.tight_layout()
save_fig(fig, 'ml_monitor.pdf')
plt.show()

In [None]:
# Print the closing recommendations between 70-character rules
_banner = "=" * 70
_recommendations = """
1. PERFORMANCE MONITORING
   - Track rolling MAE/RMSE with 30-day windows
   - Alert when MAE exceeds 1.5x training baseline
   - Monitor prediction bias for systematic errors

2. DATA DRIFT DETECTION
   - Calculate PSI for key features weekly
   - PSI > 0.2 indicates significant distribution shift
   - Use KS-test for additional validation

3. CONCEPT DRIFT DETECTION
   - Implement CUSUM control charts on prediction errors
   - Monitor feature importance stability
   - Track residual distribution changes

4. RECALIBRATION TRIGGERS
   - Automatic retrain when any threshold is breached
   - Minimum retrain frequency: monthly
   - Maximum retrain frequency: weekly (avoid overfitting to noise)

5. BEST PRACTICES FOR ENERGY MARKETS
   - Expect drift during seasonal transitions
   - Monitor market regime changes (high volatility periods)
   - Consider rolling retraining with expanding windows
"""

print("\n" + _banner)
print("   MONITORING RECOMMENDATIONS")
print(_banner)
print(_recommendations)
print(_banner)