# Evaluation and Model Comparison

This notebook provides a comprehensive evaluation and comparison of the baseline and augmented models.

## Objectives:
- Load both baseline and augmented models
- Perform detailed evaluation on test set
- Compare performance metrics
- Generate side-by-side result comparisons
- Provide recommendations and insights

In [1]:
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import yaml
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Add src to path
sys.path.append('../src')
from model_architecture import create_model
from data_preprocessing import DataPreprocessor
from evaluation import ColorizationEvaluator, evaluate_single_image
from utils import compare_images_side_by_side, create_model_summary

print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Libraries imported successfully!
PyTorch version: 2.8.0+cpu
CUDA available: False


## 1. Setup and Load Configurations

In [2]:
# Read the shared project configuration (all paths are relative to this notebook).
config_path = "../config/config.yaml"
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

# Evaluation splits live under the processed-data directory.
processed_dir = "../data/processed"
test_dir = os.path.join(processed_dir, "test")
val_dir = os.path.join(processed_dir, "val")

# Trained-model folders and the output folder for this notebook's artifacts.
baseline_model_dir = "../models/baseline_model"
augmented_model_dir = "../models/augmented_model"
comparison_results_dir = "../results/comparisons"

os.makedirs(comparison_results_dir, exist_ok=True)
print(f"Comparison results will be saved to: {comparison_results_dir}")

# A model counts as available when its complete checkpoint file exists on disk.
baseline_model_path = os.path.join(baseline_model_dir, "baseline_model_complete.pth")
augmented_model_path = os.path.join(augmented_model_dir, "augmented_model_complete.pth")
baseline_available = os.path.exists(baseline_model_path)
augmented_available = os.path.exists(augmented_model_path)

baseline_status = '✅ Available' if baseline_available else '❌ Not found'
augmented_status = '✅ Available' if augmented_available else '❌ Not found'
print(f"\nModel Availability:")
print(f"Baseline model: {baseline_status}")
print(f"Augmented model: {augmented_status}")

# Nothing can be evaluated without at least one checkpoint, so stop here.
if not (baseline_available or augmented_available):
    print("\n❌ No trained models found!")
    print("Please train models using notebooks 04 and 05 first.")
    raise SystemExit("Trained models required")

# Check evaluation data
def count_images(directory):
    """Count the entries directly inside ``directory`` that look like images.

    An entry is counted when its name ends in .png, .jpg or .jpeg
    (case-insensitive). Sub-directories are not searched.

    Returns:
        The number of matching entries, or 0 when ``directory`` does not exist.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    if os.path.exists(directory):
        return sum(1 for entry in os.listdir(directory)
                   if entry.lower().endswith(image_suffixes))
    return 0

# Count what each split offers.
test_count, val_count = count_images(test_dir), count_images(val_dir)

print(f"\nEvaluation Data:")
print(f"Test images: {test_count}")
print(f"Validation images: {val_count}")

# Prefer the test split; fall back to the validation split when it is empty.
if test_count > 0:
    eval_dir, eval_count, eval_set_name = test_dir, test_count, "test"
else:
    eval_dir, eval_count, eval_set_name = val_dir, val_count, "validation"

print(f"\nUsing {eval_set_name} set for evaluation ({eval_count} images)")

Comparison results will be saved to: ../results/comparisons

Model Availability:
Baseline model: ✅ Available
Augmented model: ✅ Available

Evaluation Data:
Test images: 9
Validation images: 9

Using test set for evaluation (9 images)


## 2. Load Training Histories and Previous Results

In [4]:
# Load training histories and quick-evaluation summaries written by earlier notebooks.
def _read_yaml_if_present(path, loader=yaml.safe_load):
    """Parse the YAML file at ``path`` with ``loader``.

    Returns:
        The parsed document, or None when the file is missing or parses to
        nothing (e.g. an empty file), so callers can use one "is it there?" check.
    """
    if not os.path.exists(path):
        return None
    with open(path, 'r') as f:
        return loader(f) or None


def _print_history_summary(history):
    """Print the headline numbers stored in a training-history mapping."""
    print(f"   Best validation loss: {history['best_val_loss']:.4f}")
    print(f"   Training epochs: {history['total_epochs']}")
    print(f"   Duration: {history['training_duration_seconds']/60:.1f} min")


baseline_history_path = "../results/baseline/training_history.yaml"
augmented_history_path = "../results/augmented/training_history.yaml"

baseline_history = _read_yaml_if_present(baseline_history_path)
if baseline_history is not None:
    print(f"✅ Loaded baseline training history")
    _print_history_summary(baseline_history)

augmented_history = _read_yaml_if_present(augmented_history_path)
if augmented_history is not None:
    print(f"\n✅ Loaded augmented training history")
    _print_history_summary(augmented_history)
    print(f"   Augmentation strategy: {augmented_history.get('augmentation_strategy', 'N/A')}")

# Load quick evaluation results if available
baseline_eval_path = "../results/baseline/quick_evaluation.yaml"
augmented_eval_path = "../results/augmented/quick_evaluation.yaml"

# NOTE(security): yaml.unsafe_load can construct arbitrary Python objects while
# parsing, so these files must only ever come from this project's own notebooks.
# Presumably it is needed because the summaries contain Python/NumPy-tagged
# values - TODO confirm, then write plain floats and switch to yaml.safe_load.
baseline_eval = _read_yaml_if_present(baseline_eval_path, loader=yaml.unsafe_load)
if baseline_eval is not None:
    print(f"\n📊 Baseline quick evaluation available")

augmented_eval = _read_yaml_if_present(augmented_eval_path, loader=yaml.unsafe_load)
if augmented_eval is not None:
    print(f"📊 Augmented quick evaluation available")

print(f"\n🎯 Ready for comprehensive model comparison!")

✅ Loaded baseline training history
   Best validation loss: 0.4998
   Training epochs: 11
   Duration: 17.9 min

✅ Loaded augmented training history
   Best validation loss: 0.2086
   Training epochs: 11
   Duration: 38.6 min
   Augmentation strategy: medium

📊 Baseline quick evaluation available
📊 Augmented quick evaluation available

🎯 Ready for comprehensive model comparison!


## 3. Load and Initialize Models

In [5]:
# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


def _load_trained_model(label, checkpoint_path, extra_checkpoint_fields=()):
    """Build the configured architecture and restore weights from a checkpoint.

    Args:
        label: Lower-case model name used in the progress messages (e.g. "baseline").
        checkpoint_path: File handed to ``torch.load``; it must contain a
            'model_state_dict' entry.
        extra_checkpoint_fields: Iterable of (display name, checkpoint key) pairs
            printed after the standard summary lines.

    Returns:
        ``(model, checkpoint)`` with the model moved to ``device`` and switched to
        eval mode, or ``(None, None)`` when any step fails (the error is printed).
    """
    print(f"\nLoading {label} model...")
    try:
        model = create_model(config_path)
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.to(device)
        model.eval()
        print(f"✅ {label.capitalize()} model loaded successfully")
        print(f"   Parameters: {model.count_parameters():,}")
        print(f"   Architecture: {checkpoint.get('model_architecture', 'Unknown')}")
        for display_name, key in extra_checkpoint_fields:
            print(f"   {display_name}: {checkpoint.get(key, 'Unknown')}")
        return model, checkpoint
    except Exception as e:
        # Best effort: a broken checkpoint downgrades the notebook to a
        # single-model evaluation instead of aborting it.
        print(f"❌ Error loading {label} model: {e}")
        return None, None


# Load baseline model
baseline_model, baseline_checkpoint = None, None
if baseline_available:
    baseline_model, baseline_checkpoint = _load_trained_model('baseline', baseline_model_path)
    baseline_available = baseline_model is not None

# Load augmented model
augmented_model, augmented_checkpoint = None, None
if augmented_available:
    augmented_model, augmented_checkpoint = _load_trained_model(
        'augmented',
        augmented_model_path,
        extra_checkpoint_fields=[('Augmentation', 'augmentation_strategy')],
    )
    augmented_available = augmented_model is not None

# Check if we can proceed with comparison
can_compare = baseline_available and augmented_available
print(f"\n{'✅ Both models loaded - comparison possible!' if can_compare else '⚠️ Only one model available - limited evaluation'}")

Using device: cpu

Loading baseline model...


INFO:model_architecture:Initialized unet model
INFO:model_architecture:Model created with 31,036,546 trainable parameters
INFO:model_architecture:Model size: 118.44 MB


✅ Baseline model loaded successfully
   Parameters: 31,036,546
   Architecture: unet

Loading augmented model...


INFO:model_architecture:Initialized unet model
INFO:model_architecture:Model created with 31,036,546 trainable parameters
INFO:model_architecture:Model size: 118.44 MB


✅ Augmented model loaded successfully
   Parameters: 31,036,546
   Architecture: unet
   Augmentation: medium

✅ Both models loaded - comparison possible!


## 4. Create Evaluation Data Loader

In [13]:
# Create data loader for evaluation
print("Creating evaluation data loader...")

try:
    # Kept from the original workflow; the instance itself is not used below.
    preprocessor = DataPreprocessor(config_path)

    if eval_count > 0:
        # Create single data loader for the evaluation set
        from torch.utils.data import DataLoader
        from data_preprocessing import ColorDataset

        eval_dataset = ColorDataset(
            eval_dir,
            image_size=tuple(config['data']['input_size'])
        )

        eval_loader = DataLoader(
            eval_dataset,
            batch_size=config['data']['batch_size'],
            shuffle=False,  # Don't shuffle for consistent evaluation
            num_workers=config['data']['num_workers'],
            # Pinned host memory only speeds up host-to-GPU copies, so request
            # it only when CUDA is actually present.
            pin_memory=torch.cuda.is_available()
        )

        print(f"✅ Evaluation data loader created")
        print(f"   Dataset: {eval_set_name} set")
        print(f"   Images: {len(eval_dataset)}")
        print(f"   Batches: {len(eval_loader)}")

        # Pull one batch to confirm loading works and to show the tensor shapes.
        for L, AB, filenames in eval_loader:
            print(f"   Batch shape: L{L.shape}, AB{AB.shape}")
            break

    else:
        print(f"❌ No evaluation data available")
        eval_loader = None

except Exception as e:
    # Later cells check `eval_loader is None` and skip themselves.
    print(f"❌ Error creating evaluation data loader: {e}")
    eval_loader = None

INFO:data_preprocessing:Found 9 images in ../data/processed\test


Creating evaluation data loader...
✅ Evaluation data loader created
   Dataset: test set
   Images: 9
   Batches: 1
   Batch shape: Ltorch.Size([9, 1, 256, 256]), ABtorch.Size([9, 2, 256, 256])


## 5. Comprehensive Model Evaluation

In [None]:
# Run the full metric suite for every loaded model on the evaluation loader.
# Results stay None for a model that is missing or whose evaluation fails.
baseline_results = None
augmented_results = None

if eval_loader is None:
    print("⚠️ Skipping comprehensive evaluation - no evaluation data available")
else:
    print("Performing comprehensive model evaluation...")
    print("This may take several minutes...")

    evaluator = ColorizationEvaluator(config_path)

    # Baseline model
    if baseline_model is not None:
        print(f"\n📊 Evaluating baseline model...")
        try:
            baseline_results = evaluator.evaluate_model(
                baseline_model, eval_loader, save_results=False
            )
        except Exception as err:
            print(f"❌ Error evaluating baseline model: {err}")
        else:
            print(f"✅ Baseline evaluation completed")

    # Augmented model
    if augmented_model is not None:
        print(f"\n📊 Evaluating augmented model...")
        try:
            augmented_results = evaluator.evaluate_model(
                augmented_model, eval_loader, save_results=False
            )
        except Exception as err:
            print(f"❌ Error evaluating augmented model: {err}")
        else:
            print(f"✅ Augmented evaluation completed")

    print(f"\n🎉 Comprehensive evaluation completed!")

INFO:evaluation:Starting model evaluation...


Performing comprehensive model evaluation...
This may take several minutes...

📊 Evaluating baseline model...


## 6. Generate Detailed Comparison Report

In [12]:
# Generate detailed comparison report
# Builds `comparison_data` and `improvements` (both read by later cells) and
# writes the report to detailed_comparison.yaml in the comparison results folder.
if baseline_results and augmented_results:
    print("Generating detailed comparison report...")

    # Create comprehensive comparison
    comparison_data = {
        'evaluation_date': datetime.now().isoformat(),
        'evaluation_dataset': eval_set_name,
        'samples_evaluated': eval_count,
        'models_compared': ['baseline', 'augmented'],

        'baseline_model': {
            'training_augmentation': False,
            'best_training_val_loss': baseline_history['best_val_loss'] if baseline_history else None,
            'training_epochs': baseline_history['total_epochs'] if baseline_history else None,
            'evaluation_metrics': baseline_results
        },

        'augmented_model': {
            'training_augmentation': True,
            'augmentation_strategy': augmented_history.get('augmentation_strategy') if augmented_history else None,
            'best_training_val_loss': augmented_history['best_val_loss'] if augmented_history else None,
            'training_epochs': augmented_history['total_epochs'] if augmented_history else None,
            'evaluation_metrics': augmented_results
        }
    }

    # Metrics where a larger value means a better model; the others are errors.
    higher_is_better = {'psnr_mean', 'ssim_mean'}

    # Calculate improvements (positive always means "augmented is better").
    improvements = {}
    for metric in ['psnr_mean', 'ssim_mean', 'mse_mean', 'mae_mean']:
        if metric not in baseline_results or metric not in augmented_results:
            continue

        baseline_val = baseline_results[metric]
        augmented_val = augmented_results[metric]

        if metric in higher_is_better:
            improvement = augmented_val - baseline_val
        else:
            improvement = baseline_val - augmented_val

        if baseline_val == 0:
            # A relative change is undefined against a zero baseline.
            improvement_pct = float('nan')
        else:
            # Divide by the magnitude so a negative baseline (SSIM and PSNR in dB
            # can be negative) does not flip the sign of the percentage.
            improvement_pct = (improvement / abs(baseline_val)) * 100

        improvements[metric] = {
            'absolute_improvement': float(improvement),
            'percentage_improvement': float(improvement_pct),
            'baseline_value': float(baseline_val),
            'augmented_value': float(augmented_val)
        }

    comparison_data['improvements'] = improvements

    # Overall assessment
    # `significant_improvements` is not used by the report below; it is kept so
    # the name stays available for interactive inspection.
    significant_improvements = sum(1 for imp in improvements.values() if abs(imp['percentage_improvement']) > 1)
    positive_improvements = sum(1 for imp in improvements.values() if imp['percentage_improvement'] > 0)

    if positive_improvements >= 3:  # Most metrics improved
        overall_result = "significant_improvement"
    elif positive_improvements >= 2:
        overall_result = "moderate_improvement"
    elif positive_improvements >= 1:
        overall_result = "slight_improvement"
    else:
        overall_result = "no_improvement"

    comparison_data['overall_result'] = overall_result

    # Save comparison report
    comparison_path = os.path.join(comparison_results_dir, 'detailed_comparison.yaml')
    with open(comparison_path, 'w') as f:
        yaml.dump(comparison_data, f, default_flow_style=False)

    print(f"✅ Detailed comparison report saved to: {comparison_path}")

    # Display key results
    print(f"\n📈 DETAILED COMPARISON RESULTS")
    print(f"=" * 50)
    print(f"Overall Result: {overall_result.replace('_', ' ').title()}")
    # Report against the metrics actually compared; some may have been skipped.
    print(f"Metrics with positive improvement: {positive_improvements}/{len(improvements)}")

    print(f"\n📊 Metric-by-Metric Comparison:")
    for metric, data in improvements.items():
        metric_name = metric.replace('_mean', '').upper()
        pct = data['percentage_improvement']
        if np.isnan(pct) or -1 < pct <= 1:
            status = "➖"  # undefined or within ±1%: treated as no real change
        elif pct > 1:
            status = "✅"
        else:
            status = "❌"
        print(f"  {metric_name:<4}: {pct:+.2f}% improvement {status}")

else:
    print("⚠️ Skipping detailed comparison - evaluation results not available")

⚠️ Skipping detailed comparison - evaluation results not available


## 7. Visualize Comparison Results

In [None]:
# Create comparison visualizations
# Reads `improvements` from the comparison-report cell and saves two figures
# (metric bar charts and an improvement radar) to the comparison results folder.
if baseline_results and augmented_results:
    print("Creating comparison visualizations...")

    # Metrics comparison bar chart: one panel per metric, filled row by row.
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    metrics_to_plot = ['psnr_mean', 'ssim_mean', 'mse_mean', 'mae_mean']
    metric_names = ['PSNR (dB)', 'SSIM', 'MSE', 'MAE']

    for ax, metric, name in zip(axes.flat, metrics_to_plot, metric_names):
        if metric not in baseline_results or metric not in augmented_results:
            continue  # leave the panel empty when a metric was not computed

        values = [baseline_results[metric], augmented_results[metric]]
        labels = ['Baseline\n(No Augmentation)', 'Augmented\n(With Augmentation)']
        colors = ['lightblue', 'lightcoral']

        bars = ax.bar(labels, values, color=colors)
        ax.set_title(f'{name} Comparison')
        ax.set_ylabel(name)

        # Add value labels on bars
        for bar, value in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width() / 2., bar.get_height(),
                    f'{value:.4f}', ha='center', va='bottom')

        # Add improvement annotation, centred between the two bars.
        if metric in improvements:
            imp_pct = improvements[metric]['percentage_improvement']
            color = 'green' if imp_pct > 0 else 'red'
            tallest = max(values)
            if tallest > 0:
                # Make room above the bars so the annotation sits inside the
                # axes instead of colliding with the panel title.
                ax.set_ylim(top=tallest * 1.2)
            ax.text(0.5, tallest * 1.1, f'{imp_pct:+.2f}% change',
                    ha='center', va='center', color=color, weight='bold')

    plt.tight_layout()

    # Save metrics comparison
    metrics_comparison_path = os.path.join(comparison_results_dir, 'metrics_comparison.png')
    plt.savefig(metrics_comparison_path, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"✅ Metrics comparison saved to: {metrics_comparison_path}")

    # Create improvement radar chart
    if len(improvements) >= 3:
        print("\nCreating improvement radar chart...")

        fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(projection='polar'))

        # Prepare data for radar chart
        metrics = list(improvements.keys())
        pct_values = [improvements[m]['percentage_improvement'] for m in metrics]

        # Radial range: at least ±10%, widened when a change is larger so the
        # polygon is never clipped. Non-finite entries are ignored for scaling.
        finite_magnitudes = [abs(v) for v in pct_values if np.isfinite(v)]
        radar_limit = max(10.0, float(np.ceil(max(finite_magnitudes, default=0.0) * 1.1)))
        half_limit = radar_limit / 2

        # Repeat the first point at the end to close the polygon.
        closed_values = pct_values + pct_values[:1]
        angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
        angles += angles[:1]

        # Plot
        ax.plot(angles, closed_values, 'o-', linewidth=2, label='Improvement %')
        ax.fill(angles, closed_values, alpha=0.25)

        # Add labels
        metric_labels = [m.replace('_mean', '').upper() for m in metrics]
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(metric_labels)

        # Grid rings at half the radial range on either side of zero.
        ax.set_ylim(-radar_limit, radar_limit)
        ax.set_yticks([-half_limit, 0, half_limit])
        ax.set_yticklabels([f'-{half_limit:g}%', '0%', f'+{half_limit:g}%'])
        ax.grid(True)

        # Add zero line
        ax.axhline(y=0, color='black', linestyle='-', alpha=0.5)

        plt.title('Improvement Percentage by Metric\n(Augmented vs Baseline)',
                  size=16, weight='bold', pad=20)

        # Save radar chart
        radar_path = os.path.join(comparison_results_dir, 'improvement_radar.png')
        plt.savefig(radar_path, dpi=150, bbox_inches='tight')
        plt.show()

        print(f"✅ Improvement radar chart saved to: {radar_path}")

else:
    print("⚠️ Skipping visualizations - comparison results not available")

## 8. Side-by-Side Result Comparison

In [None]:
# Generate side-by-side colorization results
# Uses `evaluator` from the comprehensive-evaluation cell for LAB->RGB
# conversion and for the per-sample PSNR/SSIM numbers.
# Explicit None checks: the truthiness of a model or DataLoader depends on
# whether the object defines __len__, which is not what is being asked here.
if baseline_model is not None and augmented_model is not None and eval_loader is not None:
    print("Generating side-by-side colorization comparisons...")

    try:
        # Get sample images for comparison
        sample_results = []
        max_samples = 6
        samples_per_batch = 2  # take only the first images of each batch

        with torch.no_grad():
            for batch_idx, (L, AB, filenames) in enumerate(eval_loader):
                if len(sample_results) >= max_samples:
                    break

                L_gpu = L.to(device)

                # Get predictions from both models (moved back to CPU for plotting)
                baseline_pred = baseline_model(L_gpu).cpu()
                augmented_pred = augmented_model(L_gpu).cpu()

                # Convert to RGB
                baseline_rgb = evaluator.lab_to_rgb(L, baseline_pred)
                augmented_rgb = evaluator.lab_to_rgb(L, augmented_pred)
                target_rgb = evaluator.lab_to_rgb(L, AB)

                # Store results
                for i in range(min(samples_per_batch, len(filenames))):
                    if len(sample_results) >= max_samples:
                        break

                    sample_results.append({
                        'filename': filenames[i],
                        'input_L': (L[i, 0].numpy() + 1) / 2,  # Denormalize for display
                        'baseline_pred': baseline_rgb[i],
                        'augmented_pred': augmented_rgb[i],
                        'target': target_rgb[i]
                    })

        # Create comparison visualization
        if sample_results:
            num_samples = len(sample_results)
            # squeeze=False keeps `axes` two-dimensional even for one sample.
            fig, axes = plt.subplots(4, num_samples, figsize=(4*num_samples, 16), squeeze=False)

            for i, result in enumerate(sample_results):
                # Input (grayscale)
                axes[0, i].imshow(result['input_L'], cmap='gray')
                axes[0, i].set_title(f'Input (Grayscale)\n{result["filename"][:20]}...', fontsize=10)
                axes[0, i].axis('off')

                # Baseline prediction
                axes[1, i].imshow(result['baseline_pred'])
                axes[1, i].set_title('Baseline Prediction\n(No Augmentation)', fontsize=10)
                axes[1, i].axis('off')

                # Augmented prediction
                axes[2, i].imshow(result['augmented_pred'])
                axes[2, i].set_title('Augmented Prediction\n(With Augmentation)', fontsize=10)
                axes[2, i].axis('off')

                # Ground truth
                axes[3, i].imshow(result['target'])
                axes[3, i].set_title('Ground Truth', fontsize=10)
                axes[3, i].axis('off')

            plt.tight_layout()

            # Save side-by-side comparison
            sidebyside_path = os.path.join(comparison_results_dir, 'side_by_side_comparison.png')
            plt.savefig(sidebyside_path, dpi=150, bbox_inches='tight')
            plt.show()

            print(f"✅ Side-by-side comparison saved to: {sidebyside_path}")

            # Calculate per-image metrics for these samples
            print(f"\n📊 Per-Sample Comparison:")
            for i, result in enumerate(sample_results):
                baseline_psnr = evaluator.calculate_psnr(result['baseline_pred'], result['target'])
                augmented_psnr = evaluator.calculate_psnr(result['augmented_pred'], result['target'])

                baseline_ssim = evaluator.calculate_ssim(result['baseline_pred'], result['target'])
                augmented_ssim = evaluator.calculate_ssim(result['augmented_pred'], result['target'])

                # Positive differences mean the augmented model scored higher.
                psnr_diff = augmented_psnr - baseline_psnr
                ssim_diff = augmented_ssim - baseline_ssim

                print(f"  Sample {i+1}: PSNR {psnr_diff:+.2f} dB, SSIM {ssim_diff:+.4f}")

        else:
            print("No sample results generated")

    except Exception as e:
        print(f"❌ Error generating side-by-side comparison: {e}")

else:
    print("⚠️ Skipping side-by-side comparison - models or data not available")

## 9. Training Progress Comparison

In [1]:
# Plot both models' loss curves side by side and tabulate their training stats.
# Needs the training histories loaded in section 2.
if not (baseline_history and augmented_history):
    print("⚠️ Training histories not available for comparison")
else:
    print("Creating training progress comparison...")

    try:
        fig, (train_ax, val_ax) = plt.subplots(1, 2, figsize=(16, 6))

        # Epoch numbers are 1-based and derived from the training-loss series.
        baseline_epochs = range(1, len(baseline_history['train_losses']) + 1)
        augmented_epochs = range(1, len(augmented_history['train_losses']) + 1)

        # Left panel: training loss. Right panel: validation loss.
        panels = [
            (train_ax, 'train_losses', 'Training Loss', 'Training Loss Comparison'),
            (val_ax, 'val_losses', 'Validation Loss', 'Validation Loss Comparison'),
        ]
        for panel, series_key, y_label, panel_title in panels:
            panel.plot(baseline_epochs, baseline_history[series_key],
                       'b-', label='Baseline (No Aug)', linewidth=2, alpha=0.8)
            panel.plot(augmented_epochs, augmented_history[series_key],
                       'r-', label='Augmented (With Aug)', linewidth=2, alpha=0.8)
            panel.set_xlabel('Epoch')
            panel.set_ylabel(y_label)
            panel.set_title(panel_title)
            panel.legend()
            panel.grid(True, alpha=0.3)

        # Star the epoch with the lowest validation loss for each model.
        baseline_best_epoch = np.argmin(baseline_history['val_losses']) + 1
        augmented_best_epoch = np.argmin(augmented_history['val_losses']) + 1
        baseline_best_loss = baseline_history['best_val_loss']
        augmented_best_loss = augmented_history['best_val_loss']

        val_ax.scatter(baseline_best_epoch, baseline_best_loss,
                       color='blue', s=100, zorder=5, marker='*')
        val_ax.scatter(augmented_best_epoch, augmented_best_loss,
                       color='red', s=100, zorder=5, marker='*')

        # Label the stars; the offsets keep the two boxes apart.
        val_ax.annotate(f'Baseline Best\n{baseline_best_loss:.4f}',
                        xy=(baseline_best_epoch, baseline_best_loss),
                        xytext=(10, 10), textcoords='offset points', fontsize=9,
                        bbox=dict(boxstyle='round,pad=0.3', facecolor='lightblue', alpha=0.8))
        val_ax.annotate(f'Augmented Best\n{augmented_best_loss:.4f}',
                        xy=(augmented_best_epoch, augmented_best_loss),
                        xytext=(10, -30), textcoords='offset points', fontsize=9,
                        bbox=dict(boxstyle='round,pad=0.3', facecolor='lightcoral', alpha=0.8))

        plt.tight_layout()

        # Write the figure next to the other comparison artifacts.
        training_comparison_path = os.path.join(comparison_results_dir, 'training_progress_comparison.png')
        plt.savefig(training_comparison_path, dpi=150, bbox_inches='tight')
        plt.show()

        print(f"✅ Training progress comparison saved to: {training_comparison_path}")

        # Text table with one row per statistic and one column per model.
        print(f"\n📊 Training Statistics Comparison:")
        print("=" * 50)

        def _stats_column(history):
            """Format one model's statistics in the row order of the 'Metric' column."""
            return [
                f"{history['training_duration_seconds']/60:.1f}",
                history['total_epochs'],
                f"{history['best_val_loss']:.4f}",
                f"{history['final_train_loss']:.4f}",
                f"{history['final_val_loss']:.4f}"
            ]

        stat_labels = [
            'Training Duration (min)',
            'Total Epochs',
            'Best Validation Loss',
            'Final Training Loss',
            'Final Validation Loss'
        ]
        training_stats = pd.DataFrame({
            'Metric': stat_labels,
            'Baseline': _stats_column(baseline_history),
            'Augmented': _stats_column(augmented_history)
        })

        print(training_stats.to_string(index=False))

    except Exception as e:
        print(f"❌ Error creating training progress comparison: {e}")

NameError: name 'baseline_history' is not defined

## 10. Final Report and Recommendations

In [2]:
# Generate final comprehensive report
# Prints a summary of the comparison and writes final_evaluation_report.yaml.
# Reads `comparison_data` and `improvements` from the comparison-report cell.
print("FINAL EVALUATION REPORT")
print("=" * 60)

# Model availability summary
print(f"Models Evaluated:")
print(f"  Baseline Model: {'✅ Available' if baseline_available else '❌ Not Available'}")
print(f"  Augmented Model: {'✅ Available' if augmented_available else '❌ Not Available'}")

# Decide once whether this run produced a full comparison. Everything below,
# including the saved report, keys off this flag instead of probing locals(),
# so a stale `comparison_data` left in the kernel cannot leak into the file.
has_comparison = bool(baseline_results and augmented_results)
final_result = comparison_data['overall_result'] if has_comparison else 'Unknown'
better_model = 'Unknown'

if has_comparison:
    print(f"\n📊 COMPREHENSIVE EVALUATION RESULTS:")
    print(f"  Evaluation Dataset: {eval_set_name} set ({eval_count} images)")
    print(f"  Overall Result: {final_result.replace('_', ' ').title()}")

    # Key improvements (changes within ±1% are reported as minimal)
    print(f"\n🎯 KEY IMPROVEMENTS:")
    for metric, data in improvements.items():
        metric_name = metric.replace('_mean', '').upper()
        if data['percentage_improvement'] > 1:
            print(f"  ✅ {metric_name}: +{data['percentage_improvement']:.2f}% improvement")
        elif data['percentage_improvement'] < -1:
            print(f"  ❌ {metric_name}: {data['percentage_improvement']:.2f}% regression")
        else:
            print(f"  ➖ {metric_name}: {data['percentage_improvement']:.2f}% (minimal change)")

    # Best performer: the augmented model is recommended only for a moderate
    # or significant improvement.
    better_model = "Augmented" if final_result in ['significant_improvement', 'moderate_improvement'] else "Baseline"
    print(f"\n🏆 RECOMMENDED MODEL: {better_model} Model")

else:
    print(f"\n⚠️ Comprehensive comparison not available")
    print(f"  Missing evaluation results for detailed comparison")

# Files generated (checked on disk; files from earlier runs also count)
print(f"\n📁 FILES GENERATED:")
generated_files = [
    'detailed_comparison.yaml',
    'metrics_comparison.png',
    'improvement_radar.png',
    'side_by_side_comparison.png',
    'training_progress_comparison.png'
]

for file in generated_files:
    file_path = os.path.join(comparison_results_dir, file)
    if os.path.exists(file_path):
        print(f"  ✅ {file}")
    else:
        print(f"  ❌ {file} (not created)")

# Recommendations
print(f"\n💡 RECOMMENDATIONS:")

# Advice lines per overall result; any other value falls back to 'no_improvement'.
recommendations_by_result = {
    'significant_improvement': [
        "  🎉 Data augmentation was highly successful!",
        "  📈 Use the augmented model for production",
        "  🔧 Current augmentation strategy is working well",
        "  📊 Consider fine-tuning augmentation parameters for even better results",
    ],
    'moderate_improvement': [
        "  ✅ Data augmentation provided modest improvements",
        "  📈 Use the augmented model, but gains are moderate",
        "  🔧 Consider experimenting with different augmentation strategies",
        "  📊 Additional techniques (transfer learning, architecture changes) may help",
    ],
    'slight_improvement': [
        "  ➖ Data augmentation provided minimal improvement",
        "  🤔 Either model can be used - difference is small",
        "  🔧 Try different augmentation approaches or strategies",
        "  📊 Focus on other improvements (model architecture, training time)",
    ],
    'no_improvement': [
        "  ⚠️ Data augmentation did not improve performance",
        "  📈 Use the baseline model (simpler and faster)",
        "  🔧 Current augmentation strategy may not suit your dataset",
        "  📊 Consider: less aggressive augmentation, different techniques, or focus on other improvements",
    ],
}

if has_comparison:
    advice_lines = recommendations_by_result.get(
        final_result, recommendations_by_result['no_improvement']
    )
else:
    advice_lines = [
        "  📝 Complete both baseline and augmented training for full comparison",
        "  🔄 Run notebooks 04 and 05 if not already completed",
        "  📊 Ensure adequate evaluation data is available",
    ]

for line in advice_lines:
    print(line)

# Next steps
print(f"\n🎯 NEXT STEPS:")
print(f"  1. 🖥️ Test models interactively with notebook 07_gui_demo.ipynb")
print(f"  2. 📊 Use the best model for your specific colorization needs")
print(f"  3. 🔧 Consider additional improvements based on results")
print(f"  4. 📝 Document your findings for future reference")

# Save final report
final_report = {
    'report_date': datetime.now().isoformat(),
    'models_evaluated': {
        'baseline_available': baseline_available,
        'augmented_available': augmented_available
    },
    'evaluation_summary': comparison_data if has_comparison else None,
    'recommendations': {
        'recommended_model': better_model,
        'overall_result': final_result
    },
    'files_generated': [f for f in generated_files if os.path.exists(os.path.join(comparison_results_dir, f))]
}

final_report_path = os.path.join(comparison_results_dir, 'final_evaluation_report.yaml')
with open(final_report_path, 'w') as f:
    yaml.dump(final_report, f, default_flow_style=False)

print(f"\n📋 Final report saved to: {final_report_path}")
print(f"\n🎉 COMPREHENSIVE EVALUATION COMPLETED!")
print(f"Ready to use your trained models for image colorization! 🎨")

FINAL EVALUATION REPORT
Models Evaluated:


NameError: name 'baseline_available' is not defined