# Backtesting Analysis

This notebook analyzes the results from rolling-origin backtesting on the validation set.

## Contents
1. Load backtesting results
2. Model performance comparison
3. Load fold-level results
4. Error analysis by horizon
5. Statistical significance tests
6. Probabilistic forecast evaluation (Chronos)

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

from src.utils import load_metrics, load_yaml
from src.plots import (
    plot_metrics_by_fold,
    plot_error_by_horizon
)

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

%matplotlib inline

## 1. Load Backtesting Results

In [None]:
# Collect the saved metrics of every candidate model.
# Loading is best-effort: a model whose metrics cannot be read is reported
# and left out of `all_metrics` rather than aborting the notebook.
models = ['seasonal_naive', 'ets', 'gradient_boosting', 'chronos']

all_metrics = {}
for model in models:
    try:
        loaded = load_metrics(model, '../artifacts/metrics')
    except Exception as err:
        print(f"[FAIL] Failed to load {model}: {err}")
    else:
        all_metrics[model] = loaded
        print(f"[OK] Loaded {model}")

## 2. Model Performance Comparison

In [None]:
# Create metrics comparison table: one row per model present in `all_metrics`
metrics_df = pd.DataFrame(all_metrics).T

# Select key metrics. reindex (rather than metrics_df[key_metrics]) turns a metric
# that no loaded model reports into a NaN column instead of raising KeyError, so the
# cell also runs when `all_metrics` is empty.
key_metrics = ['mae_mean', 'rmse_mean', 'smape_mean', 'mase_mean']
display_df = metrics_df.reindex(columns=key_metrics)

# Sort by MASE, lowest (best) first; models without a MASE value sort last
display_df = display_df.sort_values('mase_mean')

print("\n=== Validation Performance (Mean across folds) ===")
print(display_df.to_string())

# Only name a winner when at least one model actually has a MASE value;
# otherwise index[0] would either fail (empty table) or point at a NaN row.
if display_df['mase_mean'].notna().any():
    print("\nBest model (by MASE):", display_df.index[0])
else:
    print("\nBest model (by MASE): n/a (no MASE values available)")

In [None]:
# Visualize metrics comparison: one bar chart per key metric on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

metric_names = ['mae_mean', 'rmse_mean', 'smape_mean', 'mase_mean']
titles = ['MAE', 'RMSE', 'sMAPE (%)', 'MASE']

for idx, (metric, title) in enumerate(zip(metric_names, titles)):
    ax = axes[idx // 2, idx % 2]

    # Walk `models` to keep a stable bar order, but look each one up with .get():
    # a model missing from `all_metrics` must be skipped, not raise KeyError.
    # Models that lack this particular metric are skipped as well.
    reported = {
        model: all_metrics[model][metric]
        for model in models
        if metric in all_metrics.get(model, {})
    }
    model_names = list(reported)
    values = list(reported.values())

    ax.bar(model_names, values, edgecolor='black')
    ax.set_title(f'{title} Comparison', fontweight='bold')
    ax.set_ylabel(title)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 3. Load Fold-Level Results

In [None]:
# Read each model's backtest predictions from parquet.
# Best-effort: a model whose file cannot be read is reported and skipped,
# so `backtest_dfs` only holds the models that loaded successfully.
backtest_dfs = {}

for model in models:
    try:
        preds = pd.read_parquet(f'../artifacts/predictions/{model}_backtest.parquet')
    except Exception as err:
        print(f"[FAIL] Failed to load {model}: {err}")
    else:
        backtest_dfs[model] = preds
        print(f"[OK] Loaded {model} predictions: {len(preds)} records")

## 4. Error Analysis by Horizon

In [None]:
# Compute MAE by horizon for each model
error_by_horizon = {}

for model, df in backtest_dfs.items():
    # Vectorized: take absolute errors once, then average them per horizon.
    # This replaces groupby(...).apply(lambda ...), which runs a Python callback per
    # group, warns on recent pandas about operating on the grouping column, and
    # returns a DataFrame (not a Series) for an empty frame.
    abs_error = (df['y_true'] - df['y_pred']).abs()
    error_by_horizon[model] = abs_error.groupby(df['horizon']).mean().to_dict()

# Plot
plot_error_by_horizon(error_by_horizon, metric_name='MAE')

## 5. Statistical Significance Tests

In [None]:
# Load statistical test results
try:
    stats_df = pd.read_csv('../artifacts/metrics/statistical_tests.csv')
    print("\n=== Statistical Significance Tests ===")
    print(stats_df.to_string(index=False))
except Exception as e:
    print(f"Statistical tests not available: {e}")

## 6. Probabilistic Forecast Evaluation (Chronos)

In [None]:
# Check if quantile predictions are available
if 'chronos' in backtest_dfs:
    chronos_df = backtest_dfs['chronos']

    # Quantile columns are the ones prefixed with 'q_'; the isinstance guard keeps
    # non-string column labels from breaking the startswith() call.
    quantile_cols = [
        col for col in chronos_df.columns
        if isinstance(col, str) and col.startswith('q_')
    ]

    if quantile_cols:
        print(f"\nQuantile predictions available: {quantile_cols}")

        # Compute coverage of the interval bounded by the 0.1 and 0.9 quantiles
        lower_col, upper_col = 'q_0.1', 'q_0.9'
        if lower_col in chronos_df.columns and upper_col in chronos_df.columns:
            # Rows with a missing actual or bound cannot be judged. Comparisons
            # against NaN evaluate to False, so keeping them would count them as
            # misses and bias the coverage downwards.
            scored = chronos_df[['y_true', lower_col, upper_col]].dropna()

            if scored.empty:
                print("\nNo complete rows available to compute 80% interval coverage")
            else:
                inside = (
                    (scored['y_true'] >= scored[lower_col]) &
                    (scored['y_true'] <= scored[upper_col])
                )
                coverage_80 = inside.mean()
                print(f"\n80% Prediction Interval Coverage: {coverage_80*100:.2f}%")
                print(f"  Expected: 80%")
                # Within 5 percentage points of nominal is treated as calibrated
                print(f"  {'Well-calibrated!' if abs(coverage_80 - 0.8) < 0.05 else 'Needs calibration'}")
        else:
            # Say so explicitly instead of silently skipping the coverage check
            print(f"\nCannot compute 80% coverage: columns '{lower_col}' and '{upper_col}' are both required")
    else:
        print("\nNo quantile predictions found in backtest results")
else:
    print("\nChronos results not available")

## Summary

Key findings from backtesting:
1. Best performing model (by MASE) — see Section 2
2. Error increases with forecast horizon — see Section 4
3. Statistical significance of model differences — see Section 5
4. Calibration quality of probabilistic forecasts — see Section 6