# Model Evaluation and Comparison

This notebook evaluates multiple trained models and compares them against a baseline.


In [20]:
from evaluate import evaluate
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configuration

Set up the models you want to evaluate.

In [21]:
# Configuration
# Passed as the first argument to evaluate() in the evaluation cell
# (presumably the dataset directory - confirm against evaluate()).
DATA_PATH = "data/"
# NOTE(review): not referenced by any other visible cell; the evaluation log
# mentions assets/label_mapping.csv, so evaluate() may resolve this path on
# its own - confirm whether this constant is still needed.
ASSETS_PATH = "assets/"

# List of model names to evaluate (these should exist in assets/ directory).
# The first entry is the baseline: pretty_print_results() treats the first
# row of the results as the reference the other models are compared against.
MODEL_NAMES = [
    "logistic_regression_baseline",
]

## Run Evaluation


In [22]:
# Run evaluation for every model listed in MODEL_NAMES.
# NOTE(review): later cells use the result as a DataFrame with a 'model_name'
# column plus metric columns, and take its first row as the baseline -
# confirm evaluate() preserves the order of MODEL_NAMES.
results_df = evaluate(DATA_PATH, MODEL_NAMES)

EVALUATION SETUP
✗ GPU not available, using CPU
✓ Device: cpu
Loaded label mapping from assets/label_mapping.csv
Created training dataloader with 2416 samples
Created validation dataloader with 658 samples
Created test dataloader with 345 samples
Loading test data...
Loaded 345 test samples

Evaluating model: logistic_regression_baseline
  Top-1 Accuracy: 84.64%
  Top-3 Accuracy: 96.81%
  Loss: 0.5254


## Pretty Print Results with Baseline Comparison

In [24]:
def pretty_print_results(results_df, decimal_places=3):
    """
    Pretty print evaluation results with baseline comparison.

    The first row of ``results_df`` (by position) is the baseline. Columns
    whose name contains 'accuracy', 'precision' or 'recall' (and does not
    contain 'loss') are printed with a '%' suffix; for non-baseline rows they
    also show the relative change against the baseline value. Every other
    column, including loss columns, is printed as a plain number without a
    comparison.

    Args:
        results_df: DataFrame with evaluation results. Must contain a
            'model_name' column; every other column is formatted as a float.
        decimal_places: Number of decimal places for rounding

    Returns:
        None. The report is written to stdout.
    """
    if results_df.empty:
        # Without this guard, iloc[0] below fails with an opaque IndexError.
        print("No evaluation results to display.")
        return

    # Find baseline row
    baseline_metrics = results_df.iloc[0]
    baseline_name = baseline_metrics['model_name']

    # Every column except the model name is treated as a metric.
    metric_cols = [col for col in results_df.columns if col != 'model_name']
    percent_keywords = ('accuracy', 'precision', 'recall')

    print("=" * 100)
    print("EVALUATION RESULTS WITH BASELINE COMPARISON")
    print("=" * 100)
    print(f"\nBaseline Model: {baseline_name}")
    print("-" * 100)

    # Display each model's results
    for position, (_, row) in enumerate(results_df.iterrows()):
        model_name = row['model_name']
        # Identify the baseline by position rather than by name, so a later
        # row that happens to share the baseline's name is still compared.
        is_baseline = position == 0

        print(f"\n{'> BASELINE <' if is_baseline else 'Model'}: {model_name}")
        print("-" * 100)

        for col in metric_cols:
            value = row[col]
            col_lower = col.lower()

            # Only accuracy/precision/recall columns are percentages that get
            # a baseline comparison; 'loss' takes precedence and is printed
            # as a plain number, as is any unrecognised column.
            is_percent_metric = 'loss' not in col_lower and any(
                keyword in col_lower for keyword in percent_keywords
            )

            if not is_percent_metric:
                print(f"  {col:40s}: {value:.{decimal_places}f}")
            elif is_baseline:
                print(f"  {col:40s}: {value:.{decimal_places}f}%")
            else:
                baseline_value = baseline_metrics[col]
                if baseline_value != 0:
                    diff = value - baseline_value
                    diff_percent = diff / baseline_value * 100
                    sign = "+" if diff >= 0 else ""
                    comparison = f"{sign}{diff_percent:.{decimal_places}f}%"
                else:
                    # Relative change is undefined for a zero baseline;
                    # reporting 0% here would hide a real difference.
                    comparison = "n/a"
                print(f"  {col:40s}: {value:.{decimal_places}f}% ({comparison} vs baseline)")

    print("\n" + "=" * 100)

In [25]:
# Display pretty results: print the per-model report for results_df, using
# its first row as the baseline and the default of 3 decimal places.
pretty_print_results(results_df)

EVALUATION RESULTS WITH BASELINE COMPARISON

Baseline Model: logistic_regression_baseline
----------------------------------------------------------------------------------------------------

> BASELINE <: logistic_regression_baseline
----------------------------------------------------------------------------------------------------
  top1_accuracy                           : 84.638%
  top3_accuracy                           : 96.812%
  loss                                    : 0.525
  precision_asteroid                      : 75.862%
  recall_asteroid                         : 75.862%
  precision_black_hole                    : 70.588%
  recall_black_hole                       : 77.419%
  precision_earth                         : 83.333%
  recall_earth                            : 86.207%
  precision_galaxy                        : 84.848%
  recall_galaxy                           : 93.333%
  precision_jupiter                       : 65.517%
  recall_jupiter                          