# Model Evaluation and Comparison

This notebook evaluates multiple trained models and compares them against a baseline.


In [None]:
from evaluate import evaluate
import pandas as pd

## Configuration

Set up the models you want to evaluate.

In [None]:
# Configuration
DATA_PATH = "data/"
ASSETS_PATH = "assets/"

# List of model names to evaluate (these should exist in the assets/ directory).
# The first entry is the baseline that the other models are compared against.
MODEL_NAMES = [
    "logistic_regression_baseline",
]

# Name of the baseline model used by the comparison report. Derived from
# MODEL_NAMES so the "first one is the baseline" convention lives in one place.
BASELINE_NAME = MODEL_NAMES[0]

## Run Evaluation


In [None]:
# Run evaluation for every model listed in MODEL_NAMES.
# NOTE(review): `evaluate` is imported from the project's `evaluate` module and
# is not visible here. The result is presumably a DataFrame with one row per
# model and a 'model_name' column, since that is what pretty_print_results
# reads further down - confirm against evaluate().
results_df = evaluate(DATA_PATH, MODEL_NAMES)

## Pretty Print Results with Baseline Comparison

In [None]:
def pretty_print_results(results_df, baseline_name, decimal_places=3):
    """
    Pretty print evaluation results with baseline comparison.

    Every row of ``results_df`` is printed as a block of ``column: value``
    lines. Columns whose name contains "accuracy", "precision" or "recall"
    (and does not contain "loss") are printed with a trailing ``%`` and, for
    non-baseline rows, the relative change versus the baseline row. Other
    numeric values are printed rounded; non-numeric values (strings, None,
    ...) are printed verbatim instead of being forced through a float format.

    Args:
        results_df: DataFrame with evaluation results. Must contain a
            'model_name' column; every other column is reported.
        baseline_name: Name of the baseline model. If several rows carry
            this name, the first one supplies the reference values.
        decimal_places: Number of decimal places for rounding

    Returns:
        None. The report is written to stdout. If no row matches
        ``baseline_name`` only an error message is printed.
    """
    # Find baseline row
    baseline_rows = results_df[results_df['model_name'] == baseline_name]

    if baseline_rows.empty:
        print(f"Error: Baseline model '{baseline_name}' not found in results")
        return

    baseline_metrics = baseline_rows.iloc[0]

    # Every column except the model identifier is reported. Values are
    # checked individually below because nothing guarantees they are numeric.
    metric_cols = [col for col in results_df.columns if col != 'model_name']
    compared_keywords = ('accuracy', 'precision', 'recall')

    heavy_rule = "=" * 100
    light_rule = "-" * 100

    print(heavy_rule)
    print("EVALUATION RESULTS WITH BASELINE COMPARISON")
    print(heavy_rule)
    print(f"\nBaseline Model: {baseline_name}")
    print(light_rule)

    # Display each model's results
    for _, row in results_df.iterrows():
        model_name = row['model_name']
        is_baseline = model_name == baseline_name

        print(f"\n{'> BASELINE <' if is_baseline else 'Model'}: {model_name}")
        print(light_rule)

        for col in metric_cols:
            value = row[col]
            label = f"  {col:40s}: "

            # A float format spec raises on str/None, so show such cells as-is.
            if not pd.api.types.is_number(value):
                print(f"{label}{value}")
                continue

            rounded = f"{value:.{decimal_places}f}"
            col_lower = col.lower()

            # Loss columns are never compared, even if the name also contains
            # one of the compared keywords; unrecognised columns are printed
            # plainly as well.
            is_compared = 'loss' not in col_lower and any(
                keyword in col_lower for keyword in compared_keywords
            )
            if not is_compared:
                print(f"{label}{rounded}")
                continue

            # NOTE(review): the value is printed unscaled with a '%' suffix,
            # which assumes evaluate() reports these metrics on a 0-100
            # scale - confirm.
            baseline_value = baseline_metrics[col]
            if is_baseline or not pd.api.types.is_number(baseline_value):
                print(f"{label}{rounded}%")
                continue

            # Relative change versus the baseline; reported as 0 when the
            # baseline is 0 because the ratio is undefined there.
            diff = value - baseline_value
            diff_percent = (diff / baseline_value * 100) if baseline_value != 0 else 0
            # Take the sign from the number actually printed so the two can
            # never disagree (negative values already carry their own '-').
            sign = "+" if diff_percent >= 0 else ""
            print(f"{label}{rounded}% ({sign}{diff_percent:.{decimal_places}f}% vs baseline)")

    print("\n" + heavy_rule)

In [None]:
# Render the comparison report, using BASELINE_NAME as the reference model
pretty_print_results(results_df=results_df, baseline_name=BASELINE_NAME)