# Temporal Generalization Analysis

This notebook evaluates how models trained on week T generalize to future weeks (T+1, T+2, ...).

We analyze models from `exps/cesnet_v3/` to understand temporal drift and model degradation over time.

In [1]:
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from temporal_generalization import (
    evaluate_temporal_generalization,
    results_to_dataframe,
    save_results,
    load_results,
    extract_week_number
)
from train_per_week_cesnet import load_label_mapping

# Notebook-wide plotting defaults: seaborn grid theme plus figure size / font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (16, 8),
    'font.size': 12,
})

## Configuration

In [2]:
# Paths
# Directory of the experiment whose trained models are evaluated.
EXPERIMENT_DIR = Path('exps/cesnet_v3')
# The dataset location is machine-specific: set the CESNET_DATASET_ROOT environment
# variable to override the default below instead of editing the notebook.
DATASET_ROOT = Path(
    os.environ.get('CESNET_DATASET_ROOT') or '/home/anatbr/dataset/CESNET-TLS-Year22'
)
# The cache file name is derived from EXPERIMENT_DIR so that pointing the notebook
# at another experiment cannot silently reuse this experiment's cached results.
RESULTS_PATH = EXPERIMENT_DIR.with_name(
    f'{EXPERIMENT_DIR.name}_temporal_generalization_results.json'
)

# Evaluation parameters
BATCH_SIZE = 64
NUM_WORKERS = 8
RESOLUTION = 256
DATA_SAMPLE_FRAC = 0.1  # Use 10% of each week's data for faster evaluation
SEED = 42
DEVICE = 'cuda:0'
CHECKPOINT_NAME = 'best_model.pth'

## Load Label Mapping

In [3]:
# load_label_mapping returns the label mapping together with the class count;
# both are passed to the evaluation step later in the notebook.
label_mapping_result = load_label_mapping(DATASET_ROOT)
label_indices_mapping, num_classes = label_mapping_result
print(f"Number of classes: {num_classes}")

Number of classes: 180


## Run Evaluation (or Load Cached Results)

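This step evaluates all models on all weeks. It may take a while, so results are cached at `RESULTS_PATH`. The cache is reused whenever that file exists, so delete it to force a fresh evaluation after changing the configuration.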

In [None]:
# Run the (slow) evaluation only when no cached results file is present;
# otherwise load the results saved by a previous run.
if not RESULTS_PATH.exists():
    print("Running temporal generalization evaluation...")
    print("This may take a while. Results will be cached for future use.\n")

    # All evaluation settings come from the configuration cell
    eval_kwargs = dict(
        experiment_dir=EXPERIMENT_DIR,
        dataset_root=DATASET_ROOT,
        label_indices_mapping=label_indices_mapping,
        num_classes=num_classes,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        resolution=RESOLUTION,
        data_sample_frac=DATA_SAMPLE_FRAC,
        seed=SEED,
        device=DEVICE,
        checkpoint_name=CHECKPOINT_NAME,
    )
    results = evaluate_temporal_generalization(**eval_kwargs)

    # Persist so the next run takes the cached branch
    save_results(results, RESULTS_PATH)
    print(f"\nCompleted {len(results)} evaluations")
else:
    print(f"Loading cached results from {RESULTS_PATH}")
    results = load_results(RESULTS_PATH)
    print(f"Loaded {len(results)} evaluation results")

Running temporal generalization evaluation...
This may take a while. Results will be cached for future use.


Found 53 weeks with data: WEEK-2022-00 to WEEK-2022-52
Found 53 trained models



Evaluating models:   0%|          | 0/53 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

## Convert to DataFrame for Analysis

In [None]:
# Convert the evaluation results into a DataFrame for the analysis cells below
df = results_to_dataframe(results)
print(f"Total evaluations: {len(df)}")
# .tolist() turns NumPy scalars into plain Python values, so the week lists print
# as "[0, 1, ...]" instead of "[np.int64(0), ...]" under NumPy >= 2.
print(f"\nTrain weeks: {sorted(df['train_week_num'].unique().tolist())}")
print(f"Test weeks: {sorted(df['test_week_num'].unique().tolist())}")
df.head(10)

## Temporal Generalization Plot

For each model trained on week T, plot its performance on all future weeks (T+1, T+2, ...).
- Validation accuracy on training week T is marked with a star (★)
- Test accuracy on week T+1 is highlighted with a circle marker that has a thick black border

In [None]:
def plot_temporal_generalization(df, figsize=(18, 10), y_range=(0.7, 1.0)):
    """
    Plot temporal generalization: accuracy vs. test week, one line per training week.

    Every row of ``df`` belonging to a training week is plotted, ordered by test
    week. Two points on each line are emphasized:

    * the test week equal to the training week T (star marker), and
    * the test week T+1 (circle marker with a thick black edge).

    Args:
        df: DataFrame with the columns ``train_week_num``, ``test_week_num`` and
            ``accuracy``. ``accuracy`` is divided by 100 before plotting, i.e. it
            is treated as a percentage.
        figsize: Figure size forwarded to ``plt.subplots``.
        y_range: ``(low, high)`` limits of the y axis, on the 0-1 accuracy scale.

    Returns:
        The matplotlib Figure that holds the plot.
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Get unique training weeks
    train_weeks = sorted(df['train_week_num'].unique())
    n_models = len(train_weeks)

    # tab20 only has 20 distinct entries, so sampling it for more than 20 models
    # hands the same colour to neighbouring training weeks. Beyond 20 models use a
    # continuous colormap instead: each line gets its own colour, ordered by
    # training week.
    cmap = plt.cm.tab20 if n_models <= 20 else plt.cm.viridis
    colors = cmap(np.linspace(0, 1, n_models))

    for i, train_week in enumerate(train_weeks):
        # Rows evaluated with the model trained on this week, in test-week order
        train_df = df[df['train_week_num'] == train_week].sort_values('test_week_num')

        # Can only be empty for a value that does not compare equal to itself (NaN)
        if train_df.empty:
            continue

        test_weeks = train_df['test_week_num'].values
        accuracies = train_df['accuracy'].values / 100.0  # Convert to 0-1 scale

        # Plot the line
        ax.plot(test_weeks, accuracies, '-o', color=colors[i],
                label=f'Trained on W{train_week}', alpha=0.7, linewidth=2, markersize=4)

        # Mark validation accuracy on the training week itself with a star
        train_week_idx = np.where(test_weeks == train_week)[0]
        if len(train_week_idx) > 0:
            idx = train_week_idx[0]
            ax.scatter(test_weeks[idx], accuracies[idx],
                       marker='*', s=300, color=colors[i],
                       edgecolors='black', linewidths=1.5, zorder=10)

        # Highlight test accuracy on week T+1 with a thick black marker edge
        next_week_idx = np.where(test_weeks == train_week + 1)[0]
        if len(next_week_idx) > 0:
            idx = next_week_idx[0]
            ax.scatter(test_weeks[idx], accuracies[idx],
                       marker='o', s=150, facecolors=colors[i],
                       edgecolors='black', linewidths=2.5, zorder=9)

    ax.set_xlabel('Test Week Number', fontsize=14, fontweight='bold')
    ax.set_ylabel('Classification Accuracy', fontsize=14, fontweight='bold')
    ax.set_title('Temporal Generalization: Model Performance on Future Weeks\n' +
                 '★ = Validation accuracy on training week | ● with thick border = Test accuracy on week T+1',
                 fontsize=16, fontweight='bold', pad=20)

    ax.set_ylim(y_range)
    ax.grid(True, alpha=0.3)

    # Legend: wrap into one column per 20 models (ceil division) so a long list of
    # training weeks does not run off the bottom of the figure.
    legend_cols = max(1, -(-n_models // 20))
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left',
              fontsize=10, framealpha=0.9, ncol=legend_cols)

    plt.tight_layout()

    return fig

fig = plot_temporal_generalization(df)

# Save through the Figure object rather than pyplot's implicit "current figure",
# and make sure the output directory exists before writing into it.
plot_path = Path('exps/temporal_generalization_plot.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

# Report the path actually used for saving (single source of truth)
print(f"Plot saved to {plot_path.as_posix()}")

## Performance Degradation Analysis

Analyze how much performance degrades over time (weeks after training).

In [None]:
# Lag, in weeks, between the week a model was trained on and the week it was tested on.
# Stored as a column on df because later cells read df['weeks_elapsed'].
df['weeks_elapsed'] = df['test_week_num'].sub(df['train_week_num'])

# Keep strictly-future evaluations only (lag 0 is the training week itself)
is_future = df['weeks_elapsed'].gt(0)
future_df = df.loc[is_future].copy()

# Mean, standard deviation and sample count of accuracy at every lag
degradation = (
    future_df
    .groupby('weeks_elapsed')['accuracy']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

print("Performance degradation over time:")
print(degradation.head(10))

# Mean accuracy per lag, with the standard deviation drawn as error bars
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
fig, ax = plt.subplots(figsize=(12, 6))
ax.errorbar(
    degradation['weeks_elapsed'],
    degradation['mean'],
    yerr=degradation['std'],
    marker='o',
    capsize=5,
    linewidth=2,
)
ax.set_xlabel('Weeks After Training', **axis_label_style)
ax.set_ylabel('Mean Accuracy (%)', **axis_label_style)
ax.set_title('Average Performance Degradation Over Time', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('exps/performance_degradation.png', dpi=300, bbox_inches='tight')
plt.show()

## Heatmap: Train Week vs Test Week

In [None]:
# Accuracy matrix: one row per training week, one column per test week
heatmap_data = df.pivot(
    index='train_week_num',
    columns='test_week_num',
    values='accuracy',
)

# Colour scale is fixed to the 70-100 range; cell annotations are disabled
heatmap_style = dict(
    annot=False,
    fmt='.1f',
    cmap='RdYlGn',
    vmin=70,
    vmax=100,
    cbar_kws={'label': 'Accuracy (%)'},
)
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(heatmap_data, ax=ax, **heatmap_style)
ax.set_xlabel('Test Week Number', **axis_label_style)
ax.set_ylabel('Train Week Number', **axis_label_style)
ax.set_title(
    'Model Accuracy Heatmap: Training Week vs Test Week',
    fontsize=14,
    fontweight='bold',
)
plt.tight_layout()
plt.savefig('exps/accuracy_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## Summary Statistics

In [None]:
def _print_accuracy_stats(title, accuracy, include_range=True):
    """Print the mean/std (and optionally min/max) of an accuracy Series under a title line."""
    print(title)
    print(f"  Mean: {accuracy.mean():.2f}%")
    print(f"  Std:  {accuracy.std():.2f}%")
    if include_range:
        print(f"  Min:  {accuracy.min():.2f}%")
        print(f"  Max:  {accuracy.max():.2f}%")


# Train-to-test lag computed locally, so this cell does not rely on a
# 'weeks_elapsed' column having been added to df by another cell.
weeks_elapsed = df['test_week_num'] - df['train_week_num']

# Performance on same week (validation)
same_week = df[df['train_week_num'] == df['test_week_num']]
_print_accuracy_stats("Performance on same week (validation):", same_week['accuracy'])

# Performance on next week (T+1)
next_week = df[weeks_elapsed == 1]
_print_accuracy_stats("\nPerformance on next week (T+1):", next_week['accuracy'])

# Average drop from validation to T+1: pair each model's same-week row with its
# T+1 row; models that lack either row drop out of the (inner) merge.
merged = same_week.merge(next_week, on='train_week_num', suffixes=('_val', '_next'))
merged['drop'] = merged['accuracy_val'] - merged['accuracy_next']
print("\nAccuracy drop from validation to T+1:")
print(f"  Mean: {merged['drop'].mean():.2f} percentage points")
print(f"  Std:  {merged['drop'].std():.2f} percentage points")

# Long-term performance (4+ weeks); skipped entirely when no such rows exist
long_term = df[weeks_elapsed >= 4]
if len(long_term) > 0:
    _print_accuracy_stats("\nPerformance 4+ weeks after training:",
                          long_term['accuracy'], include_range=False)

## Best and Worst Generalizing Models

In [None]:
# Per-model summary (mean / std / count of accuracy) over the rows of future_df,
# grouped by training week and ordered from highest to lowest mean accuracy
future_performance = (
    future_df
    .groupby('train_week_num')['accuracy']
    .agg(['mean', 'std', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
)

n_shown = 5  # rows to show from each end of the ranking

print("Models with best average performance on future weeks:")
print(future_performance.head(n_shown))

print("\nModels with worst average performance on future weeks:")
print(future_performance.tail(n_shown))