# Model Evaluation
## Initialization

In [None]:
from src.evaluate import (
    evaluate_all_saved_models,
    evaluate_model,
    evaluate_specific_models,
    evaluate_manual_folders,
    evaluate_from_experiment_configs
)
from src.dataset import FER2013Dataset
from src.transforms import base_transform
from src.wandb_utils import *
import matplotlib.pyplot as plt


### Weights & Biases

In [None]:
from src.wandb_utils import login, check_wandb_mode, sync_offline_runs

# Select the W&B mode: "online", "offline", or "disabled".
# In "offline" mode, remember to sync the recorded runs afterwards
# (see sync_offline_runs).
WANDB_MODE = "offline"

print("Initializing Weights & Biases...")
# login() hands back the mode that is in effect; it is echoed below.
current_mode = login(project="emotion-classifier-vit", mode=WANDB_MODE)
print(f"W&B initialized successfully in {current_mode.upper()} mode!")

In [None]:
# Weights & Biases utility commands

# Check the current W&B mode
check_wandb_mode()

# List the offline runs that are available
# NOTE(review): list_offline_runs is not imported explicitly in this notebook;
# it is presumably provided by `from src.wandb_utils import *` -- confirm.
list_offline_runs()

# Optional commands -- uncomment and run the ones you need:

# Sync offline runs (requires an internet connection)
# sync_offline_runs()

# Change mode to online
# set_wandb_mode("online")

# Clear offline runs (use with caution!)
# clear_offline_runs(confirm=False)  # Dry run first
# clear_offline_runs(confirm=True)   # Actually delete

### Dataset

In [None]:
# Test split of FER2013 with the base transform applied; this is the dataset
# handed to the evaluation calls in the cells below.
test_ds = FER2013Dataset(split="test", transform=base_transform())

## Evaluate All Checkpoints

In [None]:
# Report how many samples the evaluation runs over
print(f"üìä Test dataset size: {len(test_ds)}")

# Run the evaluation over every saved model (checkpoints directory)
print("üöÄ Evaluating all saved models...")
all_models_summary = evaluate_all_saved_models(test_ds)

if not all_models_summary:
    print("‚ùå No models were successfully evaluated")
else:
    # The first summary entry is the one reported as the overall best
    best_model = all_models_summary[0]
    print(f"\nüèÜ Overall Best Model: {best_model['experiment']}")
    print(f"   Test Accuracy: {best_model['test_accuracy']:.4f}")
    print(f"   Run Folder: {best_model['run_folder']}")

## Selective Evaluation

Evaluate specific experiments by name. The system will automatically find the latest run for each experiment.

In [None]:
# Names of the experiments to evaluate in this cell.
# Leave the list empty to skip selective evaluation.
experiments_to_evaluate = [
    # Add your experiment names here
    # Example:
    # "baseline_none",
    # "baseline_light", 
    # "baseline_medium",
    # "baseline_heavy"
]

if experiments_to_evaluate:
    print(f"üéØ Evaluating {len(experiments_to_evaluate)} specific experiments...")
    specific_summary = evaluate_specific_models(experiments_to_evaluate, test_ds)

    if specific_summary:
        # The first summary entry is the one reported as the best of the selection
        best_specific = specific_summary[0]
        print(f"\nüèÜ Best among selected: {best_specific['experiment']}")
        print(f"   Test Accuracy: {best_specific['test_accuracy']:.4f}")
    else:
        # Experiments were requested but nothing came back: say so explicitly
        # instead of finishing silently (mirrors the "evaluate all" cell).
        print("No selected experiments were successfully evaluated")
else:
    print("‚ÑπÔ∏è  No experiments specified for selective evaluation")
    print("   Add experiment names to the 'experiments_to_evaluate' list above")

## Detailed Individual Model Evaluation

Evaluate specific models with detailed reporting and individual WandB runs for each evaluation.

In [None]:
# Detailed Individual Model Evaluation
#
# For each entry of MODELS_TO_EVALUATE this cell:
#   1. looks up the latest run folder for the experiment and its
#      "best_<run folder name>.pth" checkpoint,
#   2. loads the model and, unless W&B is disabled, opens a dedicated W&B run,
#   3. evaluates on test_ds, stores the metrics in all_metrics and prints the
#      classification report,
#   4. finishes the W&B run -- also when the evaluation fails part-way.
# A failure for one model is reported and the loop moves on to the next one.

# Define models to evaluate with display names
MODELS_TO_EVALUATE = {
    # "Display Name": "checkpoint_folder_name",
    # Example:
    # "Baseline None Latest": "baseline_none",
    # "Baseline Light v2": "baseline_light1",
    # "Heavy Augmentation": "baseline_heavy"
}

# Metrics of every successfully evaluated model, keyed by display name.
# Read by the comparison / best-model / visualization cells.
all_metrics = {}

if MODELS_TO_EVALUATE:
    # Imported once here rather than on every loop iteration.
    from src.evaluate import (
        find_latest_run_for_experiment,
        load_model_from_checkpoint,
        print_classification_report,
    )

    print(f"üîç Starting detailed evaluation of {len(MODELS_TO_EVALUATE)} models...")

    for model_display_name, checkpoint_folder in MODELS_TO_EVALUATE.items():
        print(f"\n{'='*70}")
        print(f"Evaluating: {model_display_name}")
        print(f"Checkpoint: {checkpoint_folder}")
        print(f"{'='*70}")

        try:
            # Find the latest run for this experiment
            run_folder = find_latest_run_for_experiment(checkpoint_folder)
            checkpoint_path = run_folder / f"best_{run_folder.name}.pth"

            if not checkpoint_path.exists():
                print(f"‚ùå Checkpoint not found: {checkpoint_path}")
                continue

            # Load model
            model = load_model_from_checkpoint(checkpoint_path)

            # Resolve the W&B mode once per model so that init, logging and
            # finish all act on the same decision.
            wandb_enabled = get_wandb_mode() != "disabled"

            # Initialize W&B run for this evaluation (unless disabled)
            if wandb_enabled:
                # Lazy import: wandb is only needed when logging is enabled.
                import wandb
                wandb.init(
                    project="emotion-classification-eval",
                    name=f"eval_{model_display_name}",
                    job_type="evaluation",
                    reinit=True
                )

            try:
                # Evaluate with detailed logging
                metrics = evaluate_model(
                    model=model,
                    test_dataset=test_ds,
                    log_to_wandb=wandb_enabled,
                    run_name=model_display_name
                )

                # Store metrics
                all_metrics[model_display_name] = metrics

                # Print detailed report
                print_classification_report(metrics)
            finally:
                # Always close the run opened above, even if the evaluation or
                # the report raised; otherwise the run would be left open.
                if wandb_enabled and wandb.run is not None:
                    wandb.finish()

            print(f"‚úÖ Completed evaluation for {model_display_name}")

        except Exception as e:
            # Best-effort per model: report the failure and go on to the next.
            print(f"‚ùå Failed to evaluate {model_display_name}: {e}")
else:
    print("‚ÑπÔ∏è  No models specified for detailed evaluation")
    print("   Add models to the 'MODELS_TO_EVALUATE' dictionary above")

## Model Comparison Summary

Compare all evaluated models side-by-side.

In [None]:
# Model Comparison Summary: one table row per entry of all_metrics
if not all_metrics:
    print("No models available for comparison")
else:
    print(f"\n{'='*70}")
    print("MODEL COMPARISON SUMMARY")
    print(f"{'='*70}")

    # Pre-format every metric to four decimals so the table cells are strings
    comparison_data = [
        {
            'Model': model_name,
            'Accuracy': f"{metrics['accuracy']:.4f}",
            'Precision': f"{metrics['precision']:.4f}",
            'Recall': f"{metrics['recall']:.4f}",
            'F1-Score': f"{metrics['f1']:.4f}"
        }
        for model_name, metrics in all_metrics.items()
    ]

    # Fixed-width layout: header, a dashed rule of the same length, then rows
    header = f"{'Model':<25} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10}"
    print(header)
    print("-" * len(header))

    for row in comparison_data:
        print(f"{row['Model']:<25} {row['Accuracy']:<10} {row['Precision']:<10} {row['Recall']:<10} {row['F1-Score']:<10}")

## Best Model Identification

Identify the best performing model from the evaluated set.

In [None]:
# Best Model Identification: report the top model by F1 and by accuracy
if all_metrics:
    # Select the winning names first (swap the metric key to rank by
    # precision, recall, etc.), then pair each name with its metrics.
    top_f1_name = max(all_metrics, key=lambda name: all_metrics[name]['f1'])
    top_acc_name = max(all_metrics, key=lambda name: all_metrics[name]['accuracy'])

    # Both results are (display name, metrics dict) pairs
    best_model_name = (top_f1_name, all_metrics[top_f1_name])
    best_by_accuracy = (top_acc_name, all_metrics[top_acc_name])

    f1_winner, f1_stats = best_model_name
    acc_winner, acc_stats = best_by_accuracy

    print(f"\n{'='*70}")
    print("BEST MODEL IDENTIFICATION")
    print(f"{'='*70}")

    print(f"\nüèÜ BEST BY F1-SCORE: {f1_winner}")
    print(f"   F1-Score:  {f1_stats['f1']:.4f}")
    print(f"   Accuracy:  {f1_stats['accuracy']:.4f}")
    print(f"   Precision: {f1_stats['precision']:.4f}")
    print(f"   Recall:    {f1_stats['recall']:.4f}")

    print(f"\nüéØ BEST BY ACCURACY: {acc_winner}")
    print(f"   Accuracy:  {acc_stats['accuracy']:.4f}")
    print(f"   F1-Score:  {acc_stats['f1']:.4f}")
    print(f"   Precision: {acc_stats['precision']:.4f}")
    print(f"   Recall:    {acc_stats['recall']:.4f}")

    # Spread between the weakest and strongest model for each metric
    print(f"\nüìä PERFORMANCE RANGE:")
    all_f1 = [all_metrics[name]['f1'] for name in all_metrics]
    all_acc = [all_metrics[name]['accuracy'] for name in all_metrics]
    print(f"   F1-Score:  {min(all_f1):.4f} - {max(all_f1):.4f}")
    print(f"   Accuracy:  {min(all_acc):.4f} - {max(all_acc):.4f}")

    print(f"{'='*70}")

## Performance Visualization

Create visual comparisons of model performance.

In [None]:
# Performance Visualization: side-by-side bar charts of accuracy and F1.
# Only drawn when at least two models are available to compare.
if len(all_metrics) > 1:
    print("\nüìà Creating performance visualizations...")

    model_names = list(all_metrics)
    accuracies = [all_metrics[name]['accuracy'] for name in model_names]
    f1_scores = [all_metrics[name]['f1'] for name in model_names]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # (axis, values, title, y-axis label, bar color) for each panel
    panels = (
        (ax1, accuracies, 'Model Accuracy Comparison', 'Accuracy', 'skyblue'),
        (ax2, f1_scores, 'Model F1-Score Comparison', 'F1-Score', 'lightcoral'),
    )

    for axis, scores, title, y_label, bar_color in panels:
        bars = axis.bar(model_names, scores, color=bar_color, alpha=0.8)
        axis.set_title(title)
        axis.set_ylabel(y_label)
        axis.set_ylim(0, 1)
        axis.tick_params(axis='x', rotation=45)

        # Write each bar's value just above its top edge
        for bar in bars:
            height = bar.get_height()
            axis.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                      f'{height:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úÖ Performance visualization saved as 'model_comparison.png'")