# Test Set Evaluation

Final evaluation of all models on the held-out test set.

## Contents
1. Load test results
2. Validation vs test performance comparison
3. Test forecast visualizations
4. Calibration analysis (Chronos)
5. Model rankings
6. Library versions (reproducibility)
7. Additional visualizations

Followed by a summary of key findings and next steps.

In [None]:
# Setup
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from src.utils import load_yaml
from src.plots import (
    plot_forecasts_with_history,
    plot_calibration_curve,
    plot_residuals_analysis
)

# Notebook-wide plot styling: seaborn's "whitegrid" theme and a wide
# default figure size for every figure that does not set its own.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

%matplotlib inline

## 1. Load Test Results

In [None]:
# Read the pipeline's results summary; later cells reuse `results`.
results = load_yaml('../artifacts/results_summary.yaml')

# Echo the recorded run time, then each data-info entry on its own
# indented line.
print(f"Execution time: {results['execution_time']}")
print("\nData info:")
for info_name, info_value in results['data_info'].items():
    print(f"  {info_name}: {info_value}")

In [None]:
# Per-model test metrics from the results summary
test_metrics = results['test_metrics']

# One row per model, one column per metric (outer keys become rows after
# the transpose)
test_df = pd.DataFrame(test_metrics).transpose()

# Keep only the headline metrics that are actually present, preserving
# the column order of test_df
headline_metrics = ('mae', 'rmse', 'smape', 'mase')
key_metrics = [col for col in test_df.columns if col in headline_metrics]
display_df = test_df.loc[:, key_metrics].copy()

# Order rows by ascending MASE when that metric is available
if 'mase' in key_metrics:
    display_df = display_df.sort_values(by='mase')

print("\n=== Test Set Performance ===")
print(display_df.to_string())

## 2. Compare Validation vs Test Performance

In [None]:
# Side-by-side table of validation MASE ('mase_mean') and test MASE
# ('mase') for every model that has test metrics.
val_metrics = results['validation_metrics']


def _mase_row(model_name, test_scores):
    """Return one comparison row for *model_name*.

    A missing validation entry or a missing metric falls back to NaN, so
    the difference for that model is NaN as well.
    """
    validation_mase = val_metrics.get(model_name, {}).get('mase_mean', np.nan)
    held_out_mase = test_scores.get('mase', np.nan)
    return {
        'Model': model_name,
        'Validation MASE': validation_mase,
        'Test MASE': held_out_mase,
        'Difference': held_out_mase - validation_mase,
    }


comparison_data = [
    _mase_row(model_name, test_scores)
    for model_name, test_scores in test_metrics.items()
]

comparison_df = pd.DataFrame(comparison_data)
print("\n=== Validation vs Test MASE ===")
print(comparison_df.to_string(index=False))

In [None]:
# Grouped bar chart: one validation/test pair of MASE bars per model
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(comparison_df))
width = 0.35

# (horizontal offset, source column, legend label) for each bar series;
# the two series sit half a bar-width either side of each tick.
bar_series = (
    (-width / 2, 'Validation MASE', 'Validation'),
    (width / 2, 'Test MASE', 'Test'),
)
for offset, column, series_label in bar_series:
    ax.bar(x + offset, comparison_df[column], width,
           label=series_label, edgecolor='black')

ax.set_title('Validation vs Test Performance', fontweight='bold')
ax.set_ylabel('MASE')

# Tick positions must be fixed before the model names are attached
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'], rotation=45)

ax.grid(True, alpha=0.3, axis='y')
ax.legend()

plt.tight_layout()
plt.show()

## 3. Load and Visualize Test Forecasts

In [None]:
# Show the saved test-forecast figure, or say so if it is not on disk
forecast_plot = Path('../artifacts/figures/test_forecasts.png')

if not forecast_plot.exists():
    print("Forecast plot not found. Run the pipeline first.")
else:
    # Imported only on the branch that actually renders an image
    from IPython.display import Image, display
    display(Image(filename=str(forecast_plot)))

## 4. Calibration Analysis (Chronos)

In [None]:
# Show the saved calibration-curve figure, or say so if it is not on disk
calibration_plot = Path('../artifacts/figures/calibration_curve.png')

if not calibration_plot.exists():
    print("Calibration plot not found.")
else:
    # Imported only on the branch that actually renders an image
    from IPython.display import Image, display
    display(Image(filename=str(calibration_plot)))

## 5. Model Rankings

In [None]:
# Display model rankings, ordered by ascending score (lowest first).
# `or {}` also covers a `model_rankings:` key that is present but null in
# the YAML: dict.get() would then return None rather than the default,
# and `.items()` would raise AttributeError.
rankings = results.get('model_rankings') or {}

print("\n=== Model Rankings (by MASE) ===")
if not rankings:
    # Explain an empty section instead of printing a bare header
    print("No model rankings found. Run the pipeline first.")
for rank, (model, score) in enumerate(
        sorted(rankings.items(), key=lambda item: item[1]), start=1):
    print(f"{rank}. {model}: {score:.4f}")

## 6. Library Versions (Reproducibility)

In [None]:
# Display the library versions recorded in the results summary.
# `or {}` also covers a `library_versions:` key that is present but null
# in the YAML: dict.get() would then return None rather than the default,
# and `.items()` would raise AttributeError.
versions = results.get('library_versions') or {}

print("\n=== Library Versions ===")
if not versions:
    # Explain an empty section instead of printing a bare header
    print("No library versions recorded. Run the pipeline first.")
for lib, version in versions.items():
    print(f"{lib}: {version}")

## 7. Additional Visualizations

In [None]:
# Show the saved seasonality-decomposition figure under its own heading,
# or say so if it is not on disk
seasonality_plot = Path('../artifacts/figures/seasonality_decomposition.png')

if not seasonality_plot.exists():
    print("Seasonality plot not found.")
else:
    print("\n=== Seasonality Decomposition ===")
    # Imported only on the branch that actually renders an image
    from IPython.display import Image, display
    display(Image(filename=str(seasonality_plot)))

## Summary

### Key Findings:
1. **Best Model**: [Will be determined after running pipeline]
2. **Validation vs Test**: [Consistency analysis]
3. **Chronos Performance**: [Zero-shot vs baselines]
4. **Calibration**: [Prediction interval quality]

### Next Steps:
1. Document findings in report
2. Create presentation slides
3. Write model card
4. Consider model deployment