# Model Evaluation and Comparison

This notebook evaluates multiple trained models and compares them against a baseline.


In [62]:
from evaluate import evaluate
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configuration

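Set the data and asset paths and list the models you want to evaluate. The first model in `MODEL_NAMES` is treated as the baseline in the comparison below.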

In [63]:
# Configuration
# Data location handed to evaluate() as its first argument.
DATA_PATH = "data/"
# Directory for model assets.
# NOTE(review): not passed to evaluate() in the cells shown here; presumably
# the evaluate module resolves the assets directory on its own -- confirm.
ASSETS_PATH = "assets/"

# List of model names to evaluate (these should exist in assets/ directory).
# The first one will be the baseline: pretty_print_results() compares every
# other model against the first row of the results table
# (assumes evaluate() returns rows in this order -- TODO confirm).
MODEL_NAMES = [
    "logistic_regression_v2_1",
]

## Run Evaluation


In [64]:
# Run evaluation
# Evaluates every model listed in MODEL_NAMES using the data under DATA_PATH.
# NOTE(review): the result is presumably one row per model with a 'model_name'
# column plus metric columns (top1_accuracy, top3_accuracy, loss,
# precision_<class>, recall_<class>), since that is what
# pretty_print_results() reads -- confirm against evaluate().
results_df = evaluate(DATA_PATH, MODEL_NAMES)

EVALUATION SETUP
✗ GPU not available, using CPU
✓ Device: cpu

Evaluating model: logistic_regression_v2_1
Loaded label mapping from assets/label_mapping.csv
Created training dataloader with 2416 samples
Created validation dataloader with 658 samples
Created test dataloader with 345 samples
Loading test data with preprocessor for logistic_regression_v2_1...




Loaded 345 test samples
  Top-1 Accuracy: 91.30%
  Top-3 Accuracy: 97.68%
  Loss: 0.3393


## Pretty Print Results with Baseline Comparison

In [65]:
def _format_vs_baseline(value, baseline_val, is_baseline, is_percentage, decimal_places):
    """Format one table cell: the value and, for non-baseline models, its relative change."""
    unit = "%" if is_percentage else ""
    formatted = f"{value:.{decimal_places}f}{unit}"
    if is_baseline:
        return formatted

    diff = value - baseline_val
    # The arrow already conveys direction, so the magnitude is shown unsigned
    # (avoids output such as "↓-25.000%"). A zero baseline has no meaningful
    # relative change, so 0 is reported instead of dividing by zero.
    diff_percent = abs(diff / baseline_val * 100) if baseline_val != 0 else 0
    sign = "↓" if diff < 0 else "↑"
    return f"{formatted} {sign}{diff_percent:.{decimal_places}f}%"


def pretty_print_results(results_df, decimal_places=3):
    """
    Pretty print evaluation results with models as columns for easy comparison.

    The first row of ``results_df`` is the baseline. Every other model's cell
    shows its value followed by an arrow (↑ higher / ↓ lower than baseline)
    and the absolute relative change in percent.

    Values of all metrics except ``loss`` are printed with a trailing "%"
    and are not rescaled, so they are expected to already be percentages.

    Args:
        results_df: DataFrame with evaluation results. Must contain a
            ``model_name`` column; the base metric columns
            (``top1_accuracy``, ``top3_accuracy``, ``loss``) and per-class
            columns named ``precision_<class>`` / ``recall_<class>`` are
            shown when present.
        decimal_places: Number of decimal places for rounding

    Returns:
        None. The tables are written to stdout.
    """
    if results_df.empty:
        # Nothing to compare; avoid an IndexError when looking up the baseline.
        print("No evaluation results to display.")
        return

    # The baseline is the first row; look it up once instead of per cell.
    baseline_row = results_df.iloc[0]
    baseline_model_name = baseline_row['model_name']

    def build_row(labels, metric_col, is_percentage):
        """Return one output row: the label cells plus one formatted cell per model."""
        row = dict(labels)
        for _, model_row in results_df.iterrows():
            model_name = model_row['model_name']
            row[model_name] = _format_vs_baseline(
                model_row[metric_col],
                baseline_row[metric_col],
                model_name == baseline_model_name,
                is_percentage,
                decimal_places,
            )
        return row

    # Base metrics: only those actually present, so a partial results table
    # still prints instead of raising KeyError.
    base_metrics = [
        metric for metric in ('top1_accuracy', 'top3_accuracy', 'loss')
        if metric in results_df.columns
    ]
    base_data = [
        build_row({'Metric': metric}, metric, is_percentage='loss' not in metric)
        for metric in base_metrics
    ]

    # Per-class metrics: match on the column prefix rather than a substring so
    # class names that themselves contain "precision"/"recall" are not mangled
    # or silently dropped. Rows follow the column order of results_df.
    class_data = []
    for col in results_df.columns:
        if not isinstance(col, str):
            continue
        for metric_type in ('precision', 'recall'):
            prefix = f"{metric_type}_"
            if col.startswith(prefix):
                labels = {'Class': col[len(prefix):], 'Metric': metric_type}
                class_data.append(build_row(labels, col, is_percentage=True))
                break

    # Display results
    print("\n" + "=" * 120)
    print("EVALUATION RESULTS - SIDE BY SIDE COMPARISON")
    print("=" * 120)
    print(f"Baseline: {baseline_model_name}")
    print("=" * 120 + "\n")

    print("BASE METRICS:")
    print("-" * 120)
    if base_data:
        print(pd.DataFrame(base_data).set_index('Metric').to_string())
    else:
        print("(none of the expected base metric columns are present)")
    print()

    if class_data:
        print("PER-CLASS METRICS:")
        print("-" * 120)
        print(pd.DataFrame(class_data).to_string(index=False))
        print()

    print("=" * 120)

In [66]:
# Display pretty results
# Prints the base-metric and per-class tables to stdout; the first row of
# results_df is used as the baseline for the comparison.
pretty_print_results(results_df)


EVALUATION RESULTS - SIDE BY SIDE COMPARISON
Baseline: logistic_regression_v2_1

BASE METRICS:
------------------------------------------------------------------------------------------------------------------------
              logistic_regression_v2_1
Metric                                
top1_accuracy                  91.304%
top3_accuracy                  97.681%
loss                             0.339

PER-CLASS METRICS:
------------------------------------------------------------------------------------------------------------------------
     Class    Metric logistic_regression_v2_1
  asteroid precision                  86.207%
  asteroid    recall                  86.207%
black_hole precision                  80.556%
black_hole    recall                  93.548%
     earth precision                  89.655%
     earth    recall                  89.655%
    galaxy precision                  87.879%
    galaxy    recall                  96.667%
   jupiter precision             