# üìà Model Evaluation for Traffic Flow Optimization

**Phase 3: Modeling, Analysis, and Evaluation**

## Overview
This notebook evaluates trained models using comprehensive metrics:
- MAE, RMSE, MAPE, R¬≤
- Cross-validation performance
- Model comparison
- Visualization of predictions vs actuals

**Target Metrics:**
- MAE < 5.0
- RMSE < 8.0
- MAPE < 15%
- R¬≤ > 0.75

---

**Author:** Data Science Team  
**Date:** November 2025  
**Project:** Bangkok Traffic Flow Optimization (CPE312 Capstone)

In [1]:
# Setup and Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

# Add T3 scripts to path
import sys
sys.path.append('../05_Scripts')

# Import custom modules
from evaluation import (
    calculate_all_metrics,
    compare_models,
    generate_evaluation_report,
    calculate_improvement_over_baseline
)
from model_utils import load_model

# Display settings
pd.set_option('display.max_columns', 50)
plt.style.use('seaborn-v0_8-whitegrid')

print("‚úÖ Setup complete!")

‚úÖ Setup complete!


## 1. Load Data and Models

In [2]:
# Define paths
DATA_PATH = Path('../02_Data/Processed/')
MODEL_PATH = Path('../02_Model_Development/Trained_Models/')
RESULTS_PATH = Path('../09_Results/')
RESULTS_PATH.mkdir(parents=True, exist_ok=True)
(RESULTS_PATH / 'Figures').mkdir(parents=True, exist_ok=True)

# Load test data
df = pd.read_csv(DATA_PATH / 'features_engineered.csv')
target_col = 'congestion_index'

# Select only numeric columns for features (exclude date and target)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [col for col in numeric_cols if col != target_col]

# Handle NaN values
df[feature_cols] = df[feature_cols].fillna(0)

# Split to get test set (last 20%)
n = len(df)
test_start = int(n * 0.8)
test_df = df.iloc[test_start:]

X_test = test_df[feature_cols].values
y_test = test_df[target_col].values

print(f"Test set size: {len(y_test)}")
print(f"Features: {len(feature_cols)}")

# Load trained models
models = {}
for model_file in MODEL_PATH.glob('*.pkl'):
    model_name = model_file.stem.replace('_model', '')
    models[model_name] = load_model(str(model_file))
    print(f"‚úÖ Loaded: {model_name}")

print(f"\nTotal models loaded: {len(models)}")

Test set size: 331
Features: 33


INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/extra_trees_tuned.pkl
INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/scaler.pkl
INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/random_forest_model.pkl
INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/xgboost_tuned.pkl
INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/gradient_boosting_tuned.pkl


‚úÖ Loaded: extra_trees_tuned
‚úÖ Loaded: scaler
‚úÖ Loaded: random_forest
‚úÖ Loaded: xgboost_tuned
‚úÖ Loaded: gradient_boosting_tuned


INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/arima_model.pkl
INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/random_forest_tuned.pkl
INFO:model_utils:Model loaded from ../02_Model_Development/Trained_Models/xgboost_model.pkl


‚úÖ Loaded: arima
‚úÖ Loaded: random_forest_tuned
‚úÖ Loaded: xgboost

Total models loaded: 8


## 2. Evaluate Models on Test Set

In [3]:
# Evaluate each model
results = {}
predictions = {}

for name, model in models.items():
    print(f"\nEvaluating: {name}")
    
    # Get predictions
    if hasattr(model, 'predict'):
        y_pred = model.predict(X_test)
    elif hasattr(model, 'forecast'):
        y_pred = model.forecast(len(y_test))
    else:
        print(f"  ‚ö†Ô∏è Model has no predict method")
        continue
    
    predictions[name] = y_pred
    
    # Calculate metrics
    metrics = calculate_all_metrics(y_test, y_pred)
    results[name] = metrics
    
    print(f"  MAE:  {metrics['MAE']:.4f}")
    print(f"  RMSE: {metrics['RMSE']:.4f}")
    print(f"  MAPE: {metrics['MAPE']:.2f}%")
    print(f"  R¬≤:   {metrics['R2']:.4f}")


Evaluating: extra_trees_tuned


ValueError: X has 33 features, but ExtraTreesRegressor is expecting 35 features as input.

## 3. Model Comparison

In [None]:
# Create comparison DataFrame
comparison_df = pd.DataFrame(results).T
comparison_df = comparison_df.round(4)
comparison_df = comparison_df.sort_values('RMSE')

print("=" * 70)
print("MODEL COMPARISON (sorted by RMSE)")
print("=" * 70)
print(comparison_df)

# Save to CSV
comparison_df.to_csv(RESULTS_PATH / 'model_comparison.csv')
print(f"\n‚úÖ Comparison saved to: {RESULTS_PATH / 'model_comparison.csv'}")

## 4. Visualize Results

In [None]:
# Visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. RMSE comparison bar chart
ax1 = axes[0, 0]
comparison_df['RMSE'].plot(kind='bar', ax=ax1, color='steelblue')
ax1.axhline(y=8.0, color='red', linestyle='--', label='Target (8.0)')
ax1.set_title('RMSE by Model')
ax1.set_ylabel('RMSE')
ax1.legend()
ax1.tick_params(axis='x', rotation=45)

# 2. R¬≤ comparison bar chart
ax2 = axes[0, 1]
comparison_df['R2'].plot(kind='bar', ax=ax2, color='forestgreen')
ax2.axhline(y=0.75, color='red', linestyle='--', label='Target (0.75)')
ax2.set_title('R¬≤ by Model')
ax2.set_ylabel('R¬≤')
ax2.legend()
ax2.tick_params(axis='x', rotation=45)

# 3. Predictions vs Actuals (best model)
ax3 = axes[1, 0]
best_model = comparison_df['RMSE'].idxmin()
if best_model in predictions:
    ax3.plot(y_test[:100], label='Actual', color='blue', alpha=0.7)
    ax3.plot(predictions[best_model][:100], label=f'{best_model} Prediction', color='orange', alpha=0.7)
    ax3.set_title(f'Predictions vs Actuals ({best_model})')
    ax3.set_xlabel('Time')
    ax3.set_ylabel('Value')
    ax3.legend()

# 4. Error distribution
ax4 = axes[1, 1]
if best_model in predictions:
    errors = y_test - predictions[best_model]
    ax4.hist(errors, bins=30, color='purple', alpha=0.7, edgecolor='black')
    ax4.axvline(x=0, color='red', linestyle='--')
    ax4.set_title(f'Error Distribution ({best_model})')
    ax4.set_xlabel('Prediction Error')
    ax4.set_ylabel('Frequency')

plt.tight_layout()
plt.savefig(RESULTS_PATH / 'Figures' / 'model_evaluation.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\n‚úÖ Visualization saved to: {RESULTS_PATH / 'Figures' / 'model_evaluation.png'}")

## 5. Evaluation Summary

In [None]:
# Final Summary
print("=" * 70)
print("MODEL EVALUATION COMPLETE")
print("=" * 70)

best_model = comparison_df['RMSE'].idxmin()
best_metrics = results[best_model]

print(f"\nüèÜ Best Model: {best_model}")
print(f"\nPerformance Metrics:")
print(f"  MAE:  {best_metrics['MAE']:.4f} {'‚úÖ' if best_metrics['MAE'] < 5.0 else '‚ö†Ô∏è'}")
print(f"  RMSE: {best_metrics['RMSE']:.4f} {'‚úÖ' if best_metrics['RMSE'] < 8.0 else '‚ö†Ô∏è'}")
print(f"  MAPE: {best_metrics['MAPE']:.2f}% {'‚úÖ' if best_metrics['MAPE'] < 15 else '‚ö†Ô∏è'}")
print(f"  R¬≤:   {best_metrics['R2']:.4f} {'‚úÖ' if best_metrics['R2'] > 0.75 else '‚ö†Ô∏è'}")

print("\n" + "=" * 70)
print("Next Step: Run 07_Model_Interpretation.ipynb")
print("=" * 70)