In [18]:
# Import libraries
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch.utils.data import DataLoader

warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('..')

from src.config import (
    SEQUENCES_DIR, BEST_MODEL_PATH, SCALER_PATH,
    RESULTS_DIR, EVAL_FIGURES_DIR, METRICS_PATH, PREDICTIONS_PATH,
    INPUT_SEQ_LEN, OUTPUT_SEQ_LEN,
    ENCODER_HIDDEN_SIZE, ENCODER_NUM_LAYERS, ENCODER_DROPOUT, ENCODER_BIDIRECTIONAL,
    BATCH_SIZE, DEVICE
)
from src.dataset import create_dataloaders
from src.dataset import TimeSeriesDataset
from src.model import build_model
from src.evaluate import (
    predict, calculate_metrics, calculate_metrics_per_step,
    evaluate_model, print_evaluation_report
)
from src.utils import load_pickle, load_json, save_json, save_figure, save_csv

# Confirm the import cell ran and report the device selected by src.config
print(
    "Libraries imported successfully!",
    f"Device: {DEVICE}",
    sep="\n",
)

Libraries imported successfully!
Device: cuda


## 7.1 Load Model and Data

In [19]:
# Load the saved test-split arrays (inputs and targets) from the sequences directory
X_test, y_test = (
    np.load(os.path.join(SEQUENCES_DIR, array_file))
    for array_file in ('X_test.npy', 'y_test.npy')
)

# Metadata written next to the arrays: feature count and the target's column index
metadata = load_json(os.path.join(SEQUENCES_DIR, 'metadata.json'))
n_features, target_idx = metadata['n_features'], metadata['target_idx']

# One summary line per quantity, in the same order as before
print(
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    f"Number of features: {n_features}",
    f"Target index: {target_idx}",
    sep="\n",
)

X_test shape: (4157, 24, 22)
y_test shape: (4157, 5)
Number of features: 22
Target index: 0


In [20]:
# Load scaler
# The loaded object is used further down via scaler.inverse_transform(...) on
# arrays of shape (n_samples, n_features) to map scaled targets back to
# original units.
# NOTE(review): load_pickle presumably unpickles SCALER_PATH; unpickling can
# execute arbitrary code, so only load scaler files produced by this project's
# own preprocessing step.
scaler = load_pickle(SCALER_PATH)
print("Scaler loaded!")

Scaler loaded!


In [21]:
# Build model architecture (same hyperparameters as training, taken from src.config)
model = build_model(
    input_size=n_features,
    hidden_size=ENCODER_HIDDEN_SIZE,
    num_layers=ENCODER_NUM_LAYERS,
    dropout=ENCODER_DROPOUT,
    bidirectional=ENCODER_BIDIRECTIONAL,
    output_seq_len=OUTPUT_SEQ_LEN,
    device=DEVICE
)

# Load trained weights.
# map_location lets a checkpoint saved on one device be restored on DEVICE.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load(BEST_MODEL_PATH, map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])

# Switch to inference mode so dropout is disabled and evaluation is
# deterministic. This is idempotent, so it is harmless if predict() also does it.
model.eval()

# 'epoch' is printed +1 (presumably stored zero-based — confirm against the training loop)
print(f"Model loaded from epoch {checkpoint['epoch'] + 1}")
print(f"Validation loss at checkpoint: {checkpoint['val_loss']:.6f}")

Model built on cuda
Total parameters: 816,001
Trainable parameters: 816,001
Model loaded from epoch 21
Validation loss at checkpoint: 0.002864


In [22]:
# Create test DataLoader
# TimeSeriesDataset and DataLoader are imported in the notebook's top import
# cell so that all dependencies are declared in one place.
test_dataset = TimeSeriesDataset(X_test, y_test)

# shuffle=False keeps the batches in the stored sample order, which the
# time-series comparison plots later in the notebook index by position.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Test batches: {len(test_loader)}")

Test batches: 65


## 7.2 Generate Predictions

In [23]:
# Run the model over the test loader; predict returns (targets, predictions),
# both still in the scaled space the model was trained on
y_true_scaled, y_pred_scaled = predict(model, test_loader, DEVICE)

print(
    f"y_true shape: {y_true_scaled.shape}",
    f"y_pred shape: {y_pred_scaled.shape}",
    sep="\n",
)

y_true shape: (4157, 5)
y_pred shape: (4157, 5)


In [24]:
# Inverse transform to original scale
def inverse_transform_target(values_scaled, scaler, n_features, target_idx):
    """Map scaled target values back to the original scale.

    The scaler's ``inverse_transform`` is called on arrays with ``n_features``
    columns, so the target values are placed in column ``target_idx`` of a
    zero-filled array, inverse-transformed, and read back from the same column.

    All forecast steps are stacked into one long column so the scaler is
    called once per array rather than once per step.

    Assumes the scaler transforms every column independently of the others
    (true for scikit-learn's MinMaxScaler / StandardScaler) — confirm against
    how the scaler at SCALER_PATH was fitted.

    Args:
        values_scaled: 2-D array of scaled target values (n_samples, n_steps).
        scaler: Fitted object exposing ``inverse_transform``.
        n_features: Number of columns the scaler expects.
        target_idx: Column index of the target within those features.

    Returns:
        Array with the same shape and dtype as ``values_scaled`` holding the
        values in original units.
    """
    values_scaled = np.asarray(values_scaled)
    n_rows, n_cols = values_scaled.shape

    # One row per (sample, step) pair; non-target columns stay zero and are discarded
    padded = np.zeros((n_rows * n_cols, n_features))
    padded[:, target_idx] = values_scaled.reshape(-1)

    restored = scaler.inverse_transform(padded)[:, target_idx]

    # Restore the (n_samples, n_steps) layout and keep the input dtype, matching
    # the previous np.zeros_like-based implementation
    return restored.reshape(n_rows, n_cols).astype(values_scaled.dtype, copy=False)


# Metrics and plots compare the two arrays element-wise, so the shapes must agree
assert y_true_scaled.shape == y_pred_scaled.shape, (
    f"Shape mismatch: y_true {y_true_scaled.shape} vs y_pred {y_pred_scaled.shape}"
)
n_samples, n_steps = y_true_scaled.shape

y_true_original = inverse_transform_target(y_true_scaled, scaler, n_features, target_idx)
y_pred_original = inverse_transform_target(y_pred_scaled, scaler, n_features, target_idx)

print("Inverse transform completed!")
print(f"Traffic volume range: {y_true_original.min():.0f} - {y_true_original.max():.0f}")

Inverse transform completed!
Traffic volume range: 151 - 7213


## 7.3 Calculate Metrics

In [25]:
# Label each forecast horizon t+1 ... t+OUTPUT_SEQ_LEN and compute the metrics
# table on the original (inverse-transformed) scale
step_names = [f't+{horizon}' for horizon in range(1, OUTPUT_SEQ_LEN + 1)]
metrics_df = calculate_metrics_per_step(y_true_original, y_pred_original, step_names)

print(
    "\nMetrics per Prediction Step:",
    metrics_df.to_string(index=False),
    sep="\n",
)


Metrics per Prediction Step:
   Step       R2      NSE        MAE       RMSE
    t+1 0.984139 0.984139 178.903900 249.597543
    t+2 0.978307 0.978307 200.402390 291.668985
    t+3 0.972042 0.972042 211.756439 330.979548
    t+4 0.966494 0.966494 220.733551 362.328602
    t+5 0.961454 0.961454 233.685226 388.839499
Average 0.972492 0.972492 209.096313 328.449482


In [26]:
# Print detailed evaluation report
# Passes the per-step metrics table (metrics_df, computed in section 7.3) to
# the project helper from src.evaluate, which prints the formatted report.
print_evaluation_report(metrics_df)


EVALUATION REPORT

Metrics per Prediction Step:
------------------------------------------------------------
   Step     R2    NSE      MAE     RMSE
    t+1 0.9841 0.9841 178.9039 249.5975
    t+2 0.9783 0.9783 200.4024 291.6690
    t+3 0.9720 0.9720 211.7564 330.9795
    t+4 0.9665 0.9665 220.7336 362.3286
    t+5 0.9615 0.9615 233.6852 388.8395
Average 0.9725 0.9725 209.0963 328.4495
------------------------------------------------------------

Interpretation:
  - Average R²: 0.9725 (Excellent)
  - Average NSE: 0.9725 (Very Good)


## 7.4 Visualization

In [27]:
# Create figures directory
# exist_ok=True makes this cell safe to re-run; the plotting cells below save
# their figures under EVAL_FIGURES_DIR.
os.makedirs(EVAL_FIGURES_DIR, exist_ok=True)

In [28]:
# 1. Metrics by forecast horizon
# One bar chart per metric in a 2x2 grid, each with a dashed line at the
# all-step average taken from the 'Average' row of metrics_df.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics_plot = metrics_df[metrics_df['Step'] != 'Average']
average_row = metrics_df[metrics_df['Step'] == 'Average'].iloc[0]
x = range(len(metrics_plot))

# (metrics_df column, axis/title label, bar colour) in row-major panel order
panel_specs = [
    ('R2', 'R²', 'steelblue'),
    ('NSE', 'NSE', 'coral'),
    ('MAE', 'MAE', 'seagreen'),
    ('RMSE', 'RMSE', 'purple'),
]

for ax, (column, label, color) in zip(axes.flat, panel_specs):
    ax.bar(x, metrics_plot[column], color=color)
    ax.set_xticks(x)
    ax.set_xticklabels(metrics_plot['Step'])
    ax.set_ylabel(label)
    ax.set_title(f'{label} by Forecast Horizon')
    ax.axhline(y=average_row[column], color='red', linestyle='--', label='Average')
    ax.legend()

plt.tight_layout()
save_figure(fig, os.path.join(EVAL_FIGURES_DIR, 'metrics_by_horizon.png'))
plt.show()

Saved: d:\DeepLearning_final\results\figures\evaluation\metrics_by_horizon.png


In [29]:
# 2. Actual vs Predicted scatter plots
# One panel per forecast step; the dashed diagonal marks perfect predictions.
fig, axes = plt.subplots(1, OUTPUT_SEQ_LEN, figsize=(4*OUTPUT_SEQ_LEN, 4))

# plt.subplots returns a bare Axes (not an array) when there is a single panel
panels = axes if OUTPUT_SEQ_LEN > 1 else [axes]

for step in range(OUTPUT_SEQ_LEN):
    panel = panels[step]
    actual = y_true_original[:, step]
    predicted = y_pred_original[:, step]

    panel.scatter(actual, predicted, alpha=0.3, s=10)

    # Diagonal spanning the combined range of actual and predicted values
    lower = min(actual.min(), predicted.min())
    upper = max(actual.max(), predicted.max())
    diagonal = [lower, upper]
    panel.plot(diagonal, diagonal, 'r--', label='Perfect')

    panel.set_xlabel('Actual')
    panel.set_ylabel('Predicted')
    panel.set_title(f't+{step + 1}')
    panel.legend()

plt.suptitle('Actual vs Predicted Traffic Volume', y=1.02)
plt.tight_layout()
save_figure(fig, os.path.join(EVAL_FIGURES_DIR, 'scatter_plots.png'))
plt.show()

Saved: d:\DeepLearning_final\results\figures\evaluation\scatter_plots.png


In [30]:
# 3. Time series comparison (sample)
# Overlay actual and predicted values for the first n_samples_plot test samples,
# one stacked panel per forecast step.
n_samples_plot = 200

fig, axes = plt.subplots(OUTPUT_SEQ_LEN, 1, figsize=(16, 3*OUTPUT_SEQ_LEN))

# plt.subplots returns a bare Axes (not an array) when there is a single panel
panels = axes if OUTPUT_SEQ_LEN > 1 else [axes]
window = slice(None, n_samples_plot)

# (values, line format, legend label), drawn in this order on every panel
series_specs = (
    (y_true_original, 'b-', 'Actual'),
    (y_pred_original, 'r-', 'Predicted'),
)

for step in range(OUTPUT_SEQ_LEN):
    panel = panels[step]
    for values, line_format, legend_label in series_specs:
        panel.plot(values[window, step], line_format, label=legend_label, alpha=0.7)
    panel.set_xlabel('Sample')
    panel.set_ylabel('Traffic Volume')
    panel.set_title(f'Prediction Step t+{step + 1}')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
save_figure(fig, os.path.join(EVAL_FIGURES_DIR, 'time_series_comparison.png'))
plt.show()

Saved: d:\DeepLearning_final\results\figures\evaluation\time_series_comparison.png


In [31]:
# 4. Error distribution
# Signed error: positive values mean the model over-predicted.
errors = y_pred_original - y_true_original

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Error histogram (all forecast steps pooled)
axes[0].hist(errors.flatten(), bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(x=0, color='red', linestyle='--')
axes[0].set_xlabel('Prediction Error')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Error Distribution (All Steps)')

# Error by step (boxplot)
# The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`), and the warning is hidden by warnings.filterwarnings('ignore').
# Setting the tick labels afterwards works on both old and new versions.
axes[1].boxplot([errors[:, i] for i in range(OUTPUT_SEQ_LEN)])
axes[1].set_xticklabels(step_names)
axes[1].axhline(y=0, color='red', linestyle='--')
axes[1].set_xlabel('Prediction Step')
axes[1].set_ylabel('Prediction Error')
axes[1].set_title('Error Distribution by Step')

plt.tight_layout()
save_figure(fig, os.path.join(EVAL_FIGURES_DIR, 'error_distribution.png'))
plt.show()

Saved: d:\DeepLearning_final\results\figures\evaluation\error_distribution.png


## 7.5 Save Results

In [32]:
# Save metrics to JSON
# Look up the 'Average' row once and reuse it for every summary entry
average_row = metrics_df[metrics_df['Step'] == 'Average'].iloc[0]

metrics_dict = {
    # Full per-step table (including the Average row) as a list of row dicts
    'per_step': metrics_df.to_dict('records'),
    # Plain-float copies of the averaged metrics
    'summary': {
        f'avg_{metric}': float(average_row[metric])
        for metric in ('R2', 'NSE', 'MAE', 'RMSE')
    },
}

save_json(metrics_dict, METRICS_PATH)
print(f"Metrics saved to: {METRICS_PATH}")

Saved: d:\DeepLearning_final\results\metrics.json
Metrics saved to: d:\DeepLearning_final\results\metrics.json


In [33]:
# Save predictions to CSV
# Columns are interleaved per forecast step: actual_t+1, predicted_t+1, actual_t+2, ...
prediction_columns = {}
for horizon in range(1, OUTPUT_SEQ_LEN + 1):
    column_index = horizon - 1
    prediction_columns[f'actual_t+{horizon}'] = y_true_original[:, column_index]
    prediction_columns[f'predicted_t+{horizon}'] = y_pred_original[:, column_index]

predictions_df = pd.DataFrame(prediction_columns)

save_csv(predictions_df, PREDICTIONS_PATH, index=False)
print(f"Predictions saved to: {PREDICTIONS_PATH}")

Saved: d:\DeepLearning_final\results\predictions.csv
Predictions saved to: d:\DeepLearning_final\results\predictions.csv


## 7.6 Final Summary

In [34]:
# Create final summary table
# Fixed-width text table of the per-step metrics followed by a short legend.
heavy_rule = "=" * 70
light_rule = "-" * 70

print("\n" + heavy_rule)
print("FINAL EVALUATION RESULTS")
print(heavy_rule)
print("\nMetrics Summary Table:")
print(light_rule)
print(f"{'Step':<10} {'R²':>10} {'NSE':>10} {'MAE':>12} {'RMSE':>12}")
print(light_rule)

# One line per row of metrics_df (the forecast steps plus the Average row)
table_columns = (metrics_df[name] for name in ('Step', 'R2', 'NSE', 'MAE', 'RMSE'))
for step_label, r2, nse, mae, rmse in zip(*table_columns):
    print(f"{step_label:<10} {r2:>10.4f} {nse:>10.4f} {mae:>12.2f} {rmse:>12.2f}")

print(light_rule)
print("\nInterpretation:")
legend_lines = (
    "  • R² (Coefficient of Determination): How well the model explains variance",
    "  • NSE (Nash-Sutcliffe Efficiency): Model performance relative to mean",
    "  • MAE (Mean Absolute Error): Average prediction error in traffic units",
    "  • RMSE (Root Mean Squared Error): Penalizes large errors more",
)
for legend_line in legend_lines:
    print(legend_line)
print(heavy_rule)


FINAL EVALUATION RESULTS

Metrics Summary Table:
----------------------------------------------------------------------
Step               R²        NSE          MAE         RMSE
----------------------------------------------------------------------
t+1            0.9841     0.9841       178.90       249.60
t+2            0.9783     0.9783       200.40       291.67
t+3            0.9720     0.9720       211.76       330.98
t+4            0.9665     0.9665       220.73       362.33
t+5            0.9615     0.9615       233.69       388.84
Average        0.9725     0.9725       209.10       328.45
----------------------------------------------------------------------

Interpretation:
  • R² (Coefficient of Determination): How well the model explains variance
  • NSE (Nash-Sutcliffe Efficiency): Model performance relative to mean
  • MAE (Mean Absolute Error): Average prediction error in traffic units
  • RMSE (Root Mean Squared Error): Penalizes large errors more


## Summary

**Evaluation completed:**
1. ✅ Loaded trained model
2. ✅ Generated predictions on test set
3. ✅ Inverse transformed to original scale
4. ✅ Calculated metrics (R², NSE, MAE, RMSE)
5. ✅ Visualized results
6. ✅ Saved metrics and predictions

**Model Performance:**
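- Metrics were calculated for each forecast step (t+1 to t+5) on the 4,157-sample test set.
- Averaged over all five steps: R² = 0.9725, NSE = 0.9725, MAE ≈ 209.10, RMSE ≈ 328.45 (MAE and RMSE in traffic-volume units; R² and NSE are numerically identical in this run).
- Accuracy degrades gradually over the forecast horizon: R² falls from 0.9841 at t+1 to 0.9615 at t+5, while RMSE rises from 249.60 to 388.84.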