# Results Analysis - Model Performance Evaluation

This notebook analyzes the trained model's performance and creates publication-quality visualizations for thesis presentation.

**Target Performance Metrics:**
- DSC (Dice Similarity Coefficient): ≥97.81%
- mIoU (Mean Intersection over Union): ≥97.90%
- mPA (Mean Pixel Accuracy): ≥99.18%

**Contents:**
1. Load trained model weights
2. Comprehensive metrics evaluation
3. Visualization of predictions
4. Error analysis
5. Comparison with baseline/targets

## 1. Import Libraries

In [None]:
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader
from pathlib import Path
import pandas as pd

sys.path.append('..')

from src.data import HC18Dataset
from src.models import ImprovedUNet
from src.losses import DiceBCELoss
from src.utils import get_transforms
from train import evaluate_model

# Plot appearance: seaborn "whitegrid" theme plus default figure size and DPI.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (15, 8),
    'figure.dpi': 100,
})

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Using device: {DEVICE}")

## 2. Load Trained Model

In [None]:
# Restore the best checkpoint written during training into a fresh ImprovedUNet.
MODEL_PATH = '../weights/best_model.pth'


def _format_metric(value, spec='.4f'):
    """Format a numeric metric, tolerating non-numeric placeholders.

    Checkpoints may lack optional entries, in which case the caller passes a
    placeholder such as 'N/A'. Applying a float format spec to a string raises
    ValueError, so anything that cannot be converted to float is returned via
    str() instead.
    """
    try:
        return format(float(value), spec)
    except (TypeError, ValueError):
        return str(value)


# Initialize model
model = ImprovedUNet(in_channels=1, out_channels=1).to(DEVICE)

# Load checkpoint
if Path(MODEL_PATH).exists():
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])

    print("="*60)
    print("MODEL LOADED SUCCESSFULLY")
    print("="*60)
    print(f"Checkpoint Epoch: {checkpoint.get('epoch', 'N/A')}")
    print(f"Best Dice Score: {_format_metric(checkpoint.get('best_dice', 'N/A'))}")
    if 'val_metrics' in checkpoint:
        val_metrics = checkpoint['val_metrics']
        print(f"Val mIoU: {_format_metric(val_metrics.get('miou', 'N/A'))}")
        print(f"Val mPA: {_format_metric(val_metrics.get('pixel_accuracy', 'N/A'))}")
    print("="*60)
else:
    # The notebook keeps running in this case, so every metric and figure
    # below is produced by a randomly initialised model.
    print(f"⚠️ Model weights not found at {MODEL_PATH}")
    print("Please train the model first using main.py or the training notebook")

# Switch to evaluation mode before any inference below.
model.eval()

## 3. Load Test Data

In [None]:
# Directories holding the held-out images and their segmentation masks.
TEST_IMG_DIR = '../dataset/test_set/images'
TEST_MASK_DIR = '../dataset/test_set/masks'

# Evaluation-time transforms (is_train=False). The two 256 arguments are
# presumably the target height and width; confirm against get_transforms.
test_transforms = get_transforms(256, 256, is_train=False)
test_dataset = HC18Dataset(
    TEST_IMG_DIR,
    TEST_MASK_DIR,
    transform=test_transforms,
)

# Fixed iteration order (shuffle=False) with loading in the main process.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    num_workers=0,
    shuffle=False,
)

print(
    f"Test samples: {len(test_dataset)}",
    f"Test batches: {len(test_loader)}",
    sep="\n",
)

## 4. Evaluate Model Performance

In [None]:
# Score the model on the test loader; the result is indexed below by the keys
# 'loss', 'dice', 'miou' and 'pixel_accuracy'.
loss_fn = DiceBCELoss()
test_metrics = evaluate_model(test_loader, model, loss_fn, DEVICE)

# Target metrics from paper
TARGET_DSC, TARGET_MIOU, TARGET_MPA = 0.9781, 0.9790, 0.9918


def _target_status(difference):
    """Label a (score - target) difference as achieved or below target."""
    if difference >= 0:
        return '✓ Achieved'
    return '✗ Below target'


# Console report: every label is left-aligned in a 17-character column.
_rule = "=" * 60
print(_rule)
print("TEST SET PERFORMANCE")
print(_rule)
print(f"{'Loss:':<17}{test_metrics['loss']:.4f}")
for _label, _key, _target in (
    ("DSC (Dice):", 'dice', TARGET_DSC),
    ("mIoU:", 'miou', TARGET_MIOU),
    ("Pixel Accuracy:", 'pixel_accuracy', TARGET_MPA),
):
    print(f"{_label:<17}{test_metrics[_key]:.4f}  (Target: {_target:.4f})")
print(_rule)

# Side-by-side table of achieved scores and paper targets.
metrics_comparison = {
    'Metric': ['DSC', 'mIoU', 'Pixel Accuracy'],
    'Our Model': [test_metrics[_key] for _key in ('dice', 'miou', 'pixel_accuracy')],
    'Target (Paper)': [TARGET_DSC, TARGET_MIOU, TARGET_MPA],
}
df_comparison = pd.DataFrame(metrics_comparison).assign(
    Difference=lambda frame: frame['Our Model'] - frame['Target (Paper)'],
    Status=lambda frame: frame['Difference'].map(_target_status),
)

print("\nMETRICS COMPARISON:")
print(df_comparison.to_string(index=False))

## 5. Visualize Results Comparison

In [None]:
# Target metrics
TARGET_DSC = 0.9781
TARGET_MIOU = 0.9790
TARGET_MPA = 0.9918

# Achieved scores come from the evaluation results in `test_metrics`.
# These names were previously used without ever being assigned, so the cell
# raised NameError on a fresh kernel.
# NOTE(review): `test_metrics['pixel_accuracy']` is plotted under the label
# 'mPA'; confirm that evaluate_model reports the mean pixel accuracy that the
# target refers to.
DSC_SCORE = float(test_metrics['dice'])
MIOU_SCORE = float(test_metrics['miou'])
MPA_SCORE = float(test_metrics['pixel_accuracy'])

# Create comparison dataframe
metrics_comparison = {
    'Metric': ['DSC', 'mIoU', 'mPA'],
    'Our Model': [DSC_SCORE, MIOU_SCORE, MPA_SCORE],
    'Target': [TARGET_DSC, TARGET_MIOU, TARGET_MPA]
}

df_comparison = pd.DataFrame(metrics_comparison)
df_comparison['Difference'] = df_comparison['Our Model'] - df_comparison['Target']

print("\n" + "="*70)
print("Performance Comparison")
print("="*70)
print(df_comparison.to_string(index=False))
print("="*70)

# Bar chart comparison: one pair of bars (model vs target) per metric.
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(metrics_comparison['Metric']))
width = 0.35

bars1 = ax.bar(x - width/2, metrics_comparison['Our Model'], width,
               label='Our Model', color='steelblue', alpha=0.8)
bars2 = ax.bar(x + width/2, metrics_comparison['Target'], width,
               label='Target', color='coral', alpha=0.8)

ax.set_xlabel('Metrics', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_title('Model Performance vs Target Metrics', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(metrics_comparison['Metric'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Zoom in on the top of the scale, but lower the floor when a score falls
# below 0.95 so that its bar and value label stay visible.
lowest_score = min(metrics_comparison['Our Model'] + metrics_comparison['Target'])
y_floor = max(0.0, min(0.95, lowest_score - 0.01))
ax.set_ylim([y_floor, 1.0])

# Add value labels just above each bar.
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.4f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

## 6. Prediction Visualizations

In [None]:
# Visualize predictions on test samples: one row per sample showing the input,
# the ground-truth mask, the thresholded prediction and their absolute
# difference.
n_samples = 10
# squeeze=False keeps `axes` two-dimensional even when n_samples == 1, so the
# [row, column] indexing below always works.
fig, axes = plt.subplots(n_samples, 4, figsize=(16, 4*n_samples), squeeze=False)

sample_count = 0
with torch.no_grad():
    for images, masks in test_loader:
        if sample_count >= n_samples:
            break

        images_gpu = images.to(DEVICE)
        # NOTE(review): the 0.5 threshold below assumes the model returns
        # probabilities in [0, 1]. If it returns raw logits, apply
        # torch.sigmoid first. Confirm against ImprovedUNet.
        predictions = model(images_gpu)

        for i in range(images.size(0)):
            if sample_count >= n_samples:
                break

            img = images[i].squeeze().cpu().numpy()
            mask = masks[i].squeeze().cpu().numpy()
            pred = predictions[i].squeeze().cpu().numpy()
            pred_binary = (pred > 0.5).astype(np.float32)

            # Error map: non-zero wherever the mask and binary prediction differ.
            error = np.abs(mask - pred_binary)

            # (data, title, imshow keyword arguments) for each column.
            panels = [
                (img, 'Input Image', {'cmap': 'gray'}),
                (mask, 'Ground Truth', {'cmap': 'gray'}),
                (pred_binary, 'Prediction', {'cmap': 'gray'}),
                (error, 'Error Map', {'cmap': 'hot', 'vmin': 0, 'vmax': 1}),
            ]
            for panel_ax, (panel_data, panel_title, imshow_kwargs) in zip(axes[sample_count], panels):
                panel_ax.imshow(panel_data, **imshow_kwargs)
                panel_ax.set_title(panel_title, fontsize=12)
                panel_ax.axis('off')

            sample_count += 1

# Hide rows that were never filled (test set smaller than n_samples) so they
# do not render as empty framed axes.
for unused_ax in axes[sample_count:].ravel():
    unused_ax.axis('off')

plt.tight_layout()
plt.suptitle('Segmentation Results on Test Set', y=1.001, fontsize=16, fontweight='bold')
plt.show()

## 7. Summary & Conclusion

### Key Findings:

1. **Model Performance:**
   - The Improved U-Net successfully segments fetal heads from ultrasound images
   - Performance metrics are compared against target benchmarks
   
2. **Architecture Highlights:**
   - Residual blocks for better gradient flow
   - ASPP module for multi-scale context
   - Feature Pyramid + Scale Attention for enhanced feature fusion
   
3. **Next Steps:**
   - If the targets are not met: tune hyperparameters, train for more epochs, and strengthen data augmentation
   - If the targets are met: optimize the model for inference speed
   - Prepare publication-quality figures for thesis defense

---

**For full training (100 epochs), use:** `python main.py`