In [None]:
# ============================================================================
# CELLA 1: Setup & Mount Google Drive
# ============================================================================

from google.colab import drive
import sys
import os
from pathlib import Path

# Banner identifying what this notebook evaluates.
print(" Task 4: PF-WILLOW Zero-Shot Evaluation")
print(" Using trained model from Task 2\n")

# 1. Mount Google Drive
# The mount is skipped whenever the /content/drive directory already exists.
if not Path('/content/drive').exists():
    drive.mount('/content/drive')
    print("‚úÖ Google Drive mounted\n")
else:
    print("‚úÖ Google Drive already mounted\n")

# 2. Setup directories
# Every project path below lives under PROJECT_ROOT on the mounted Drive.
PROJECT_ROOT = '/content/drive/MyDrive/AML'
DATA_DIR = f'{PROJECT_ROOT}/dataset'
CHECKPOINT_DIR = f'{PROJECT_ROOT}/checkpoints'
RESULTS_DIR = f'{PROJECT_ROOT}/results'
FIGURES_DIR = f'{PROJECT_ROOT}/results/figures'

# Create directories
# exist_ok=True makes this safe to re-run when the folders are already there.
for dir_path in [DATA_DIR, CHECKPOINT_DIR, RESULTS_DIR, FIGURES_DIR]:
    os.makedirs(dir_path, exist_ok=True)

print(f" Project root: {PROJECT_ROOT}")
print(f" Checkpoints: {CHECKPOINT_DIR}")
print(f" Results: {RESULTS_DIR}\n")

# 3. Clone repository
# LOCAL_REPO_NAME is a relative path, so the clone lands in the current working directory.
GITHUB_REPO_URL = 'https://github.com/SamueleCarrea/AML_SemanticCorrespondence'
LOCAL_REPO_NAME = 'AML_SemanticCorrespondence'

if not Path(LOCAL_REPO_NAME).exists():
    print(f" Cloning repository...")
    !git clone {GITHUB_REPO_URL} {LOCAL_REPO_NAME}
    print("‚úÖ Repository cloned")
else:
    print(f"‚úÖ Repository already exists")
    # Only attempt an update when the folder is an actual git checkout.
    if Path(LOCAL_REPO_NAME, '.git').exists():
        print("üîÑ Pulling latest changes...")
        # Step into the clone for the pull, then back out to the parent directory.
        %cd {LOCAL_REPO_NAME}
        !git pull
        %cd ..

# Put the clone first on the import path (relative entry, resolved against the
# working directory at import time). Presumably this is what makes
# `dataset.willow` importable in a later cell - confirm against the repo layout.
sys.path.insert(0, LOCAL_REPO_NAME)

# 4. Check GPU
import torch
print(f"\n  GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")
if torch.cuda.is_available():
    # total_memory is divided by 1e9 to display gigabytes.
    print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

print("\n‚úÖ Setup complete!\n")

In [None]:
# ============================================================================
# CELLA 2: Install Dependencies
# ============================================================================

print(" Installing dependencies...\n")

# Install requirements
!pip install -q -r {LOCAL_REPO_NAME}/requirements.txt
!pip install -q scipy pandas pillow tqdm matplotlib seaborn

# Additional packages if needed for Task 2 model
!pip install -q pytorch-lightning

# Verify
import torch
import pytorch_lightning as pl
print(f"\n PyTorch {torch.__version__}")
print(f" Lightning {pl.__version__}")
print(f" CUDA available: {torch.cuda.is_available()}")

print("\n Dependencies installed!\n")

In [None]:
# ============================================================================
# CELLA 3: Verify PF-WILLOW Dataset (must already be present on Drive)
# ============================================================================

import urllib.request
import zipfile
from pathlib import Path

WILLOW_ROOT = f'{DATA_DIR}/PF-WILLOW'
willow_path = Path(WILLOW_ROOT)

# Fail fast when the dataset folder is absent: this cell does not download
# anything, and the following cells cannot work without the data.
if not willow_path.is_dir():
    raise FileNotFoundError(
        f"PF-WILLOW dataset not found at {WILLOW_ROOT}. "
        "Download and extract it there before running the remaining cells."
    )

print(f"‚úÖ Dataset exists at {WILLOW_ROOT}")

# Verify structure
print("\n Dataset verification:")

# Sub-folders expected inside the dataset root
subsets = [
    'car(S)', 'car(M)', 'car(G)',
    'duck(S)',
    'motorbike(S)', 'motorbike(M)', 'motorbike(G)',
    'winebottle(wC)', 'winebottle(woC)', 'winebottle(M)'
]

print(f"\n{'Subset':<25} {'Images':<10} {'Annotations':<10}")
print("-" * 50)
for subset in subsets:
    subset_dir = willow_path / subset
    if subset_dir.exists():
        # Count image and annotation files without materialising the lists.
        n_png = sum(1 for _ in subset_dir.glob('*.png'))
        n_mat = sum(1 for _ in subset_dir.glob('*.mat'))
        print(f"{subset:<25} {n_png:<10} {n_mat:<10}")
    else:
        # Report missing subsets explicitly instead of silently skipping them.
        print(f"{subset:<25} {'MISSING':<10} {'MISSING':<10}")

# Check test_pairs.csv
if (willow_path / 'test_pairs.csv').exists():
    import pandas as pd
    pairs = pd.read_csv(willow_path / 'test_pairs.csv')
    print(f"\n‚úÖ Found {len(pairs)} test pairs in test_pairs.csv")
    # Guard the column lookup so an unexpected CSV layout yields a warning, not a KeyError.
    if 'category' in pairs.columns:
        print(f"‚úÖ Categories: {pairs['category'].unique().tolist()}")
    else:
        print("  WARNING: test_pairs.csv has no 'category' column!")
else:
    print("\n  WARNING: test_pairs.csv not found!")

print("\n‚úÖ Dataset ready!\n")

In [None]:
# ============================================================================
# CELLA 4: Load PF-WILLOW Dataset
# ============================================================================

from dataset.willow import PFWillowDataset
from torch.utils.data import DataLoader

# Root folder of the PF-WILLOW data.
WILLOW_ROOT = f'{DATA_DIR}/PF-WILLOW'

print(" Loading PF-WILLOW dataset...\n")

# Build the evaluation dataset.
test_dataset = PFWillowDataset(root=WILLOW_ROOT, long_side=518, normalize=True)

# One pair per batch, in fixed order. batch_size=1 matters: the evaluation loop
# in a later cell reads element [0] of every batch field.
loader_options = {
    'batch_size': 1,
    'shuffle': False,
    'num_workers': 2,
    'collate_fn': PFWillowDataset.collate_fn,
}
test_loader = DataLoader(test_dataset, **loader_options)

# Sanity check on the first pair of the dataset.
sample = test_dataset[0]
print(f"\n Dataset Info:")
print(f"  Total pairs: {len(test_dataset)}")
print(f"  Categories: {test_dataset.categories}")
print(f"  Keypoints per pair: {test_dataset.N_KEYPOINTS}")

print(f"\n Sample pair:")
print(f"  Category: {sample['category']}")
for label, key in (
    ("Source image shape", 'src_img'),
    ("Target image shape", 'tgt_img'),
    ("Source keypoints", 'src_kps'),
):
    print(f"  {label}: {sample[key].shape}")
valid_mask = sample['valid_mask']
print(f"  Valid keypoints: {valid_mask.sum().item()}/{len(valid_mask)}")

print("\n‚úÖ Dataset loaded successfully!")

In [None]:
# ============================================================================
# CELLA 5: Load Your Trained Model from Task 2
# ============================================================================

import torch
import pytorch_lightning as pl

# IMPORTANTE: Aggiorna questi path con i tuoi file
CHECKPOINT_PATH = f"{CHECKPOINT_DIR}/best_model_task2.ckpt"

# Verifica che il checkpoint esista
if not Path(CHECKPOINT_PATH).exists():
    print(f" Checkpoint not found: {CHECKPOINT_PATH}")
    print(f"\n Available checkpoints in {CHECKPOINT_DIR}:")
    for ckpt in Path(CHECKPOINT_DIR).glob("*.ckpt"):
        print(f"  - {ckpt.name}")

    # Se hai un checkpoint con nome diverso, aggiornalo qui:
    print(f"\n Update CHECKPOINT_PATH variable with your actual checkpoint name")
else:
    print(f"‚úÖ Checkpoint found: {CHECKPOINT_PATH}")

# IMPORTANTE: Importa il tuo modello Task 2
# Sostituisci questa linea con il tuo import reale:

# Opzione 1: Se il tuo modello √® in models/correspondence_model.py
# from models.correspondence_model import YourModelClass

# Opzione 2: Se usi un file diverso
# sys.path.insert(0, f'{LOCAL_REPO_NAME}/models')
# from your_model_file import YourModelClass

# Per ora uso un placeholder - DEVI CAMBIARE QUESTO!
print("\n‚ö†Ô∏è  ATTENZIONE: Devi aggiornare l'import del modello!")
print("   Vedi commenti nella cella per istruzioni\n")

# ESEMPIO DI CARICAMENTO (aggiorna con il tuo modello reale):
# model = YourModelClass.load_from_checkpoint(CHECKPOINT_PATH)

# Stand-in that keeps the notebook runnable without a real model (REMOVE THIS).
class PlaceholderModel(pl.LightningModule):
    """Dummy model exposing the `predict` method the evaluation loop calls."""

    def predict(self, src_img, tgt_img, src_kps):
        """Return the source keypoints perturbed by Gaussian noise scaled by 10.

        Placeholder only: the images are ignored and the output is random.
        Do NOT use it to produce real results.
        """
        noise = torch.randn_like(src_kps) * 10
        return src_kps + noise

# REPLACE with:
# model = YourModelClass.load_from_checkpoint(CHECKPOINT_PATH)
model = PlaceholderModel()

print("‚úÖ Model loaded (update with your real model!)\n")

In [None]:
# ============================================================================
# CELLA 6: PCK Metric Implementation
# ============================================================================

import numpy as np
from typing import Dict, List


def compute_pck(
    pred_kps: torch.Tensor,
    gt_kps: torch.Tensor,
    image_size: tuple,
    thresholds: List[float] = [0.05, 0.10, 0.15, 0.20]
) -> Dict[str, float]:
    """
    Compute Percentage of Correct Keypoints (PCK).

    A keypoint counts as correct when its Euclidean distance to the ground
    truth, divided by max(H, W), is <= alpha.

    Args:
        pred_kps: (N, 2) predicted keypoints (x, y)
        gt_kps: (N, 2) ground truth keypoints (x, y); must have the same
            shape as pred_kps
        image_size: (H, W) image dimensions
        thresholds: List of alpha values (the default list is only read,
            never mutated)

    Returns:
        Dictionary mapping 'PCK@<alpha, 2 decimals>' to the percentage
        (0-100) of correct keypoints. With zero keypoints every value is NaN.

    Raises:
        ValueError: if the two keypoint tensors differ in shape, or if
            max(H, W) is not positive.
    """
    # Reject mismatched shapes explicitly: subtraction would otherwise
    # broadcast (e.g. (1, 2) against (N, 2)) and return a misleading score.
    if pred_kps.shape != gt_kps.shape:
        raise ValueError(
            f"pred_kps and gt_kps must have the same shape, got "
            f"{tuple(pred_kps.shape)} and {tuple(gt_kps.shape)}"
        )

    H, W = image_size
    max_dim = max(H, W)
    if max_dim <= 0:
        raise ValueError(f"image_size must have a positive dimension, got {image_size}")

    # No keypoints: the mean is undefined, so report NaN explicitly.
    if pred_kps.shape[0] == 0:
        return {f'PCK@{alpha:.2f}': float('nan') for alpha in thresholds}

    # torch.norm only accepts floating-point input; promote integer pixel
    # coordinates instead of failing on them.
    diff = pred_kps - gt_kps
    if not diff.is_floating_point():
        diff = diff.float()

    # Euclidean distance per keypoint, normalized by the larger image side.
    # NOTE(review): some benchmarks normalize by the object bounding box rather
    # than the image size - confirm this matches the intended protocol.
    normalized_distances = torch.norm(diff, dim=1) / max_dim  # (N,)

    # Percentage of keypoints within each threshold
    results = {}
    for alpha in thresholds:
        correct = (normalized_distances <= alpha).float()
        results[f'PCK@{alpha:.2f}'] = correct.mean().item() * 100

    return results


# Smoke-test the metric on two hand-made cases and assert the expected scores,
# so a regression in compute_pck stops the notebook instead of going unnoticed.
print(" Testing PCK computation...\n")

# Example: perfect prediction
pred_perfect = torch.tensor([[100.0, 150.0], [200.0, 250.0]])
gt_perfect = torch.tensor([[100.0, 150.0], [200.0, 250.0]])
img_size = (480, 640)

pck_perfect = compute_pck(pred_perfect, gt_perfect, img_size)
print("Perfect prediction:")
for metric, value in pck_perfect.items():
    print(f"  {metric}: {value:.2f}%")
assert all(score == 100.0 for score in pck_perfect.values()), pck_perfect

# Example: imperfect prediction. Each keypoint is shifted by 5px on both axes,
# i.e. a Euclidean error of sqrt(50) ~= 7.07px.
pred_imperfect = torch.tensor([[105.0, 155.0], [195.0, 245.0]])
pck_imperfect = compute_pck(pred_imperfect, gt_perfect, img_size)
print("\nImperfect prediction (5px offset per axis, ~7.07px error):")
for metric, value in pck_imperfect.items():
    print(f"  {metric}: {value:.2f}%")
# 7.07 / 640 ~= 0.011, below the smallest threshold (0.05), so every keypoint
# still counts as correct.
assert all(score == 100.0 for score in pck_imperfect.values()), pck_imperfect

print("\n‚úÖ PCK metric ready!")

In [None]:
# ============================================================================
# CELLA 7: Evaluation Function
# ============================================================================

from tqdm import tqdm
from collections import defaultdict


@torch.no_grad()
def evaluate_on_pf_willow(
    model,
    dataloader,
    device='cuda',
    thresholds=[0.05, 0.10, 0.15, 0.20]
):
    """
    Evaluate trained model on PF-WILLOW dataset.

    PCK is computed per image pair (on that pair's valid keypoints only) and
    then averaged over pairs, overall and per category. Pairs without any
    valid keypoint are skipped.

    Args:
        model: Trained model from Task 2; must expose
            predict(src_img, tgt_img, src_kps)
        dataloader: PF-WILLOW DataLoader; the loop reads element [0] of each
            batch field, so it expects batch_size=1
        device: 'cuda' or 'cpu'
        thresholds: PCK thresholds (the default list is only read, never
            mutated)

    Returns:
        dict with:
            'num_pairs': number of pairs actually evaluated
            'overall': {metric: {'mean', 'std', 'values'}}
            'per_category': {category: {metric: mean}}
        Means and stds are NaN when no pair was evaluated.
    """
    metric_names = [f'PCK@{t:.2f}' for t in thresholds]

    model.eval()
    model.to(device)

    # Per-pair scores, overall and grouped by category
    all_results = {name: [] for name in metric_names}
    category_results = {}
    # Counted explicitly rather than read back from a hard-coded 'PCK@0.10'
    # entry, which does not exist when other thresholds are requested.
    num_pairs = 0

    print(f"\n{'='*70}")
    print("EVALUATING ON PF-WILLOW")
    print(f"{'='*70}")
    print(f"Thresholds: {thresholds}")
    print(f"Device: {device}")
    print(f"Total pairs: {len(dataloader)}\n")

    # Evaluation loop
    for batch in tqdm(dataloader, desc="Evaluating"):
        # Move to device
        src_img = batch['src_img'].to(device)
        tgt_img = batch['tgt_img'].to(device)
        src_kps = batch['src_kps'].to(device)
        tgt_kps = batch['tgt_kps'].to(device)
        valid_mask = batch['valid_mask'].to(device)
        category = batch['category'][0]  # batch_size=1
        tgt_size = batch['tgt_size'][0]  # (H, W)

        # Coerce to bool so a 0/1 integer or float mask is used as a mask
        # rather than as integer indices (or rejected outright).
        valid_idx = valid_mask[0].bool()

        # Nothing to score for this pair
        if not valid_idx.any():
            continue

        # Predict correspondences (the model must provide a predict() method)
        pred_kps = model.predict(src_img, tgt_img, src_kps)

        # Keep only the valid keypoints
        pred_valid = pred_kps[0][valid_idx]  # (N_valid, 2)
        gt_valid = tgt_kps[0][valid_idx]     # (N_valid, 2)

        # Compute PCK for this pair
        H, W = tgt_size.tolist()
        pck_scores = compute_pck(
            pred_valid.cpu(),
            gt_valid.cpu(),
            (H, W),
            thresholds
        )
        num_pairs += 1

        # Store results overall and per category
        per_cat = category_results.setdefault(
            category, {k: [] for k in pck_scores.keys()}
        )
        for key, value in pck_scores.items():
            all_results[key].append(value)
            per_cat[key].append(value)

    # Aggregate results
    final_results = {
        'num_pairs': num_pairs,
        'overall': {},
        'per_category': {}
    }

    # Overall metrics (plain floats; NaN instead of a numpy warning when empty)
    for metric in metric_names:
        values = all_results[metric]
        final_results['overall'][metric] = {
            'mean': float(np.mean(values)) if values else float('nan'),
            'std': float(np.std(values)) if values else float('nan'),
            'values': values  # Keep all values for analysis
        }

    # Per-category metrics (a category only exists once it has a scored pair)
    for cat, metrics in category_results.items():
        final_results['per_category'][cat] = {
            metric: float(np.mean(metrics[metric])) for metric in metric_names
        }

    return final_results


print("‚úÖ Evaluation function ready!")

In [None]:
# ============================================================================
# CELLA 8: Run Evaluation on PF-WILLOW
# ============================================================================

import json

device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f" Starting PF-WILLOW evaluation")
print(f"   Device: {device}")
print(f"   Model: Task 2 trained model")
print(f"   Test pairs: {len(test_dataset)}\n")

# Run evaluation. Errors are reported rather than raised so the notebook keeps
# running; later cells check for `results` before using it.
# NOTE: on failure, a `results` left over from an earlier run of this cell
# stays in the namespace.
try:
    results = evaluate_on_pf_willow(
        model=model,
        dataloader=test_loader,
        device=device,
        thresholds=[0.05, 0.10, 0.15, 0.20]
    )

    # Print results
    print(f"\n{'='*70}")
    print("PF-WILLOW EVALUATION RESULTS")
    print(f"{'='*70}")

    # Metric names come from the results themselves, so this report stays in
    # sync with whatever thresholds were evaluated.
    print(f"\n Overall Performance:")
    print("-" * 70)
    for metric, stats in results['overall'].items():
        print(f"  {metric}: {stats['mean']:.2f}% ¬± {stats['std']:.2f}%")

    print(f"\n Per-Category Performance:")
    print("-" * 70)
    for cat in sorted(results['per_category'].keys()):
        print(f"\n  {cat.upper()}:")
        for metric, value in results['per_category'][cat].items():
            print(f"    {metric}: {value:.2f}%")

    print(f"\n{'='*70}")

    # Build a JSON-serializable copy: plain floats, without the per-pair value
    # lists kept in results['overall'][metric]['values'].
    save_results = {
        'num_pairs': results['num_pairs'],
        'overall': {k: {'mean': float(v['mean']), 'std': float(v['std'])}
                    for k, v in results['overall'].items()},
        'per_category': {cat: {k: float(v) for k, v in cat_metrics.items()}
                         for cat, cat_metrics in results['per_category'].items()}
    }

    # Save results
    output_file = f'{RESULTS_DIR}/pf_willow_task2_results.json'
    with open(output_file, 'w') as f:
        json.dump(save_results, f, indent=2)

    print(f"\n Results saved to: {output_file}")

except Exception as e:
    print(f"\n‚ùå Error during evaluation: {e}")
    import traceback
    traceback.print_exc()
    print("\nEvaluation did not finish successfully; see the traceback above.")
else:
    # Only claim success when nothing above raised.
    print("\n‚úÖ Evaluation complete!")

In [None]:
# ============================================================================
# CELLA 9: Visualizations - PCK Curves
# ============================================================================

import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

def plot_pck_curves(results, save_dir):
    """Draw the overall PCK curve and the per-category PCK@0.10 bars.

    Reads results['overall'] (mean/std per metric) and
    results['per_category'], saves the two-panel figure to
    <save_dir>/pf_willow_pck_analysis.png and shows it.
    """
    thresholds = [0.05, 0.10, 0.15, 0.20]
    metric_keys = [f'PCK@{t:.2f}' for t in thresholds]
    line_color = '#e74c3c'

    fig, (curve_ax, bar_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # ------------------------------------------------------------------
    # Left panel: overall PCK curve with a +/- one std band
    # ------------------------------------------------------------------
    overall = results['overall']
    pck_means = [overall[key]['mean'] for key in metric_keys]
    pck_stds = [overall[key]['std'] for key in metric_keys]
    band_low = [mean - std for mean, std in zip(pck_means, pck_stds)]
    band_high = [mean + std for mean, std in zip(pck_means, pck_stds)]

    curve_ax.plot(thresholds, pck_means, marker='o', linewidth=3,
                  markersize=10, color=line_color, label='Overall PCK')
    curve_ax.fill_between(thresholds, band_low, band_high,
                          alpha=0.2, color=line_color)

    curve_ax.set_xlabel('Threshold (Œ±)', fontsize=14, fontweight='bold')
    curve_ax.set_ylabel('PCK (%)', fontsize=14, fontweight='bold')
    curve_ax.set_title('PF-WILLOW: Overall PCK Curve', fontsize=16, fontweight='bold')
    curve_ax.grid(True, alpha=0.3)
    curve_ax.set_ylim([0, 100])
    curve_ax.legend(fontsize=12)

    # Value label slightly above each point
    for x_pos, y_val in zip(thresholds, pck_means):
        curve_ax.text(x_pos, y_val + 2, f'{y_val:.1f}%',
                      ha='center', fontsize=10, fontweight='bold')

    # ------------------------------------------------------------------
    # Right panel: per-category PCK@0.10
    # ------------------------------------------------------------------
    per_category = results['per_category']
    categories = sorted(per_category)
    pck_010_per_cat = [per_category[name]['PCK@0.10'] for name in categories]

    palette = sns.color_palette('husl', len(categories))
    bars = bar_ax.bar(categories, pck_010_per_cat, color=palette, alpha=0.8,
                      edgecolor='black', linewidth=1.5)

    bar_ax.set_ylabel('PCK@0.10 (%)', fontsize=14, fontweight='bold')
    bar_ax.set_title('PF-WILLOW: Per-Category Performance', fontsize=16, fontweight='bold')
    bar_ax.set_ylim([0, 100])
    bar_ax.grid(axis='y', alpha=0.3)

    # Value label centered above each bar
    for rect, score in zip(bars, pck_010_per_cat):
        label_x = rect.get_x() + rect.get_width()/2
        label_y = rect.get_height() + 2
        bar_ax.text(label_x, label_y, f'{score:.1f}%',
                    ha='center', va='bottom', fontsize=11, fontweight='bold')

    plt.tight_layout()

    # Write the figure to disk, then display it
    save_path = f"{save_dir}/pf_willow_pck_analysis.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f" Saved: {save_path}")

    plt.show()


# Generate the plot only when the evaluation cell produced `results`
if 'results' not in locals():
    print("‚ö†Ô∏è  No results available. Run evaluation first!")
else:
    plot_pck_curves(results, FIGURES_DIR)

In [None]:
# ============================================================================
# CELLA 10: Visualizations - Detailed Heatmap
# ============================================================================

def plot_detailed_heatmap(results, save_dir):
    """Draw a categories x thresholds heatmap of the per-category PCK.

    Reads results['per_category'], saves the figure to
    <save_dir>/pf_willow_heatmap.png and shows it.
    """
    per_category = results['per_category']
    categories = sorted(per_category)
    thresholds = [0.05, 0.10, 0.15, 0.20]
    threshold_labels = [f'PCK@{t:.2f}' for t in thresholds]

    # One row per category, one column per threshold
    heatmap_data = [
        [per_category[name][label] for label in threshold_labels]
        for name in categories
    ]

    fig, ax = plt.subplots(figsize=(12, 6))
    im = ax.imshow(heatmap_data, cmap='RdYlGn', aspect='auto', vmin=0, vmax=100)

    # Tick positions and labels
    ax.set_xticks(range(len(threshold_labels)))
    ax.set_yticks(range(len(categories)))
    ax.set_xticklabels(threshold_labels, fontsize=12)
    ax.set_yticklabels(categories, fontsize=12)

    # Print the numeric value inside every cell
    for row_idx, row in enumerate(heatmap_data):
        for col_idx, cell in enumerate(row):
            ax.text(col_idx, row_idx, f'{cell:.1f}',
                    ha="center", va="center",
                    color="black", fontsize=11, fontweight='bold')

    ax.set_title('PF-WILLOW: PCK Heatmap (Categories √ó Thresholds)',
                 fontsize=16, fontweight='bold')

    # Colorbar
    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label('PCK (%)', fontsize=12, fontweight='bold')

    plt.tight_layout()

    # Write the figure to disk, then display it
    save_path = f"{save_dir}/pf_willow_heatmap.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f" Saved: {save_path}")

    plt.show()


# Generate the heatmap only when the evaluation cell produced `results`
if 'results' not in locals():
    print("‚ö†Ô∏è  No results available. Run evaluation first!")
else:
    plot_detailed_heatmap(results, FIGURES_DIR)

In [None]:
# ============================================================================
# CELLA 11: Compare SPair-71k vs PF-WILLOW (Zero-Shot Transfer)
# ============================================================================

print(" Zero-Shot Transfer Analysis: SPair-71k ‚Üí PF-WILLOW\n")

# Try to load SPair-71k results if available
spair_results_file = f'{RESULTS_DIR}/spair_task2_results.json'

# Like the other reporting cells, do nothing when the evaluation has not
# produced `results`, instead of failing with a NameError.
if 'results' not in locals():
    print("WARNING: No PF-WILLOW results available. Run evaluation first!")

elif Path(spair_results_file).exists():
    with open(spair_results_file, 'r') as f:
        spair_results = json.load(f)

    print("‚úÖ Found SPair-71k results for comparison\n")

    # Compare only the metrics present in BOTH result sets. The same list
    # drives the table and the chart, so the chart cannot hit a KeyError on a
    # metric the table would have skipped.
    spair_overall = spair_results.get('overall', {})
    shared_metrics = [
        metric
        for metric in ['PCK@0.05', 'PCK@0.10', 'PCK@0.15', 'PCK@0.20']
        if metric in spair_overall and metric in results['overall']
    ]

    # Create comparison table
    print(f"{'='*80}")
    print("ZERO-SHOT GENERALIZATION ANALYSIS")
    print(f"{'='*80}")
    print(f"\n{'Metric':<15} {'SPair-71k (train)':<20} {'PF-WILLOW (zero-shot)':<25} {'Drop':<10}")
    print("-" * 80)

    for metric in shared_metrics:
        spair_val = spair_overall[metric]['mean']
        willow_val = results['overall'][metric]['mean']
        # Positive drop = lower score on PF-WILLOW than on SPair-71k
        drop = spair_val - willow_val

        print(f"{metric:<15} {spair_val:>18.2f}% {willow_val:>23.2f}% {drop:>9.2f}%")

    print(f"{'='*80}")

    if shared_metrics:
        # Visualize comparison as grouped bars, one group per threshold
        fig, ax = plt.subplots(figsize=(10, 6))

        # The threshold is the part of the metric name after '@'
        thresholds = [float(metric.split('@', 1)[1]) for metric in shared_metrics]
        spair_pck = [spair_overall[metric]['mean'] for metric in shared_metrics]
        willow_pck = [results['overall'][metric]['mean'] for metric in shared_metrics]

        x = range(len(thresholds))
        width = 0.35

        ax.bar([i - width/2 for i in x], spair_pck, width,
               label='SPair-71k (train)', color='#3498db', alpha=0.8)
        ax.bar([i + width/2 for i in x], willow_pck, width,
               label='PF-WILLOW (zero-shot)', color='#e74c3c', alpha=0.8)

        ax.set_xlabel('PCK Threshold', fontsize=12, fontweight='bold')
        ax.set_ylabel('PCK (%)', fontsize=12, fontweight='bold')
        ax.set_title('Zero-Shot Transfer: SPair-71k ‚Üí PF-WILLOW',
                    fontsize=14, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels([f'{t:.2f}' for t in thresholds])
        ax.legend(fontsize=11)
        ax.grid(axis='y', alpha=0.3)
        ax.set_ylim([0, 100])

        plt.tight_layout()
        save_path = f"{FIGURES_DIR}/zero_shot_comparison.png"
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"\n Saved: {save_path}")
        plt.show()
    else:
        print("\nWARNING: No PCK metric is present in both result sets; nothing to plot.")

else:
    print(f"‚ö†Ô∏è  SPair-71k results not found at: {spair_results_file}")
    print("   Run Task 2 evaluation on SPair-71k first for comparison")

In [None]:
# ============================================================================
# CELLA 12: Summary & Export
# ============================================================================

separator = "="*80

print("\n" + separator)
print("TASK 4 SUMMARY - PF-WILLOW ZERO-SHOT EVALUATION")
print(separator)

# The summary is produced only when the evaluation cell left `results` behind.
if 'results' in locals():
    overall_stats = results['overall']
    category_scores = results['per_category']

    print(f"\n Evaluation Statistics:")
    print(f"  Total test pairs: {results['num_pairs']}")
    print(f"  Categories evaluated: {len(category_scores)}")

    print(f"\n Key Results:")
    for metric in ('PCK@0.05', 'PCK@0.10', 'PCK@0.15', 'PCK@0.20'):
        stats = overall_stats[metric]
        print(f"  {metric}: {stats['mean']:.2f}% (¬±{stats['std']:.2f}%)")

    # Best and worst categories, ranked by PCK@0.10
    pck_010_per_cat = {cat: scores['PCK@0.10'] for cat, scores in category_scores.items()}

    best_name = max(pck_010_per_cat, key=pck_010_per_cat.get)
    worst_name = min(pck_010_per_cat, key=pck_010_per_cat.get)
    best_cat = (best_name, pck_010_per_cat[best_name])
    worst_cat = (worst_name, pck_010_per_cat[worst_name])

    print(f"\n Best category (PCK@0.10):")
    print(f"  {best_cat[0]}: {best_cat[1]:.2f}%")

    print(f"\n Most challenging category (PCK@0.10):")
    print(f"  {worst_cat[0]}: {worst_cat[1]:.2f}%")

    # Compact summary written next to the other result files
    summary = {
        'dataset': 'PF-WILLOW',
        'num_pairs': results['num_pairs'],
        'model': 'Task 2 Trained Model',
        'overall_pck': {name: stats['mean'] for name, stats in overall_stats.items()},
        'best_category': {'name': best_cat[0], 'pck@0.10': best_cat[1]},
        'worst_category': {'name': worst_cat[0], 'pck@0.10': worst_cat[1]}
    }

    summary_file = f'{RESULTS_DIR}/task4_summary.json'
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nüíæ Summary saved to: {summary_file}")

print(f"\n All results saved in: {RESULTS_DIR}")
print(f"All figures saved in: {FIGURES_DIR}")

print("\n" + separator)
print("‚úÖ TASK 4 COMPLETE!")
print(separator)

In [None]:
# ============================================================================
# CELLA 13: Export Results for Report
# ============================================================================

import pandas as pd

# Tables are exported only when the evaluation cell left `results` behind.
if 'results' not in locals():
    print("‚ö†Ô∏è  No results available. Run evaluation first!")

else:
    print("üìÑ Creating report-ready tables...\n")

    metric_names = ['PCK@0.05', 'PCK@0.10', 'PCK@0.15', 'PCK@0.20']

    # Table 1: overall mean/std per metric, formatted to two decimals
    overall_records = []
    for metric in metric_names:
        stats = results['overall'][metric]
        overall_records.append({
            'Metric': metric,
            'Mean (%)': f"{stats['mean']:.2f}",
            'Std (%)': f"{stats['std']:.2f}"
        })
    overall_df = pd.DataFrame(overall_records)

    print("Table 1: Overall Performance")
    print(overall_df.to_string(index=False))

    # Save as CSV
    overall_csv = f'{RESULTS_DIR}/pf_willow_overall.csv'
    overall_df.to_csv(overall_csv, index=False)
    print(f"\n Saved: {overall_csv}")

    # Table 2: one row per category (alphabetical), one column per metric
    per_cat_data = [
        {
            'Category': cat,
            **{metric: f"{results['per_category'][cat][metric]:.2f}"
               for metric in metric_names}
        }
        for cat in sorted(results['per_category'])
    ]
    per_cat_df = pd.DataFrame(per_cat_data)

    print(f"\nTable 2: Per-Category Performance")
    print(per_cat_df.to_string(index=False))

    # Save as CSV
    per_cat_csv = f'{RESULTS_DIR}/pf_willow_per_category.csv'
    per_cat_df.to_csv(per_cat_csv, index=False)
    print(f"\n Saved: {per_cat_csv}")

    print("\n‚úÖ Report tables exported!")