In [6]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.166-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [7]:
import os
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import cv2
from tqdm import tqdm
import warnings

import torch
from ultralytics import YOLO
import json
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# raised by the libraries imported above are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

Configuration loaded for experiment: pascal_voc_yolo_evaluation
Using device: cuda


In [8]:
# Configuration Dictionary
# Central settings mapping read by the evaluation, plotting and reporting cells.
# All paths are absolute Kaggle locations; adjust them when running elsewhere.
CONFIG = {
    # Experiment Settings
    'experiment_name': 'pascal_voc_yolo_evaluation',
    # Captured once, when this cell is executed.
    'timestamp': datetime.now().strftime('%Y%m%d_%H%M%S'),

    # Paths
    'dataset_path': '/kaggle/input/pascal-voc-test/pascal_voc',
    'annotations_path': '/kaggle/input/pascal-voc-test/pascal_voc/annotations/instances_test.json',
    'images_path': '/kaggle/input/pascal-voc-test/pascal_voc/test',
    'models_path': '/kaggle/input/m/yardnh/temp/pytorch/temp/1',
    'output_base_path': '/kaggle/working/evaluation_results',

    # Model Testing Settings
    # NOTE(review): detections below this score are dropped before COCOeval runs;
    # mAP is usually computed with a much lower cut-off — confirm 0.5 is intended.
    'confidence_threshold': 0.5,
    'iou_threshold': 0.5,
    'max_detections': 100,
    'image_size': 512,
    # Use the GPU whenever torch reports one, otherwise fall back to the CPU.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',

    # Visualization Settings
    'num_sample_images': 20,
    'save_visualizations': True,
    'show_console_output': False,
    # NOTE(review): the plotting cell calls plt.style.use('default'); verify
    # whether this value is meant to be applied there instead.
    'plot_style': 'seaborn-v0_8',
    'figure_size': (12, 8),
    'dpi': 300,

    # Output Settings
    'save_coco_results': True,
    'save_metrics': True,
    'save_timing_info': True,
    'generate_markdown_report': True,

    # Metrics Settings
    'calculate_map': True,
    'map_iou_thresholds': [0.5, 0.75],
    'calculate_per_class_metrics': True,
}

# Echo the two values a reader most often needs to double-check.
print(f"Configuration loaded for experiment: {CONFIG['experiment_name']}")
print(f"Output directory: {CONFIG['output_base_path']}")

Directory structure created:
├── evaluation_results
    ├── visualizations
    ├── sample_images
    ├── predictions/ (will be created per model)
    ├── metrics.json
    └── experiment_report.md


In [9]:
def setup_directories():
    """Create the output folder tree and return its root.

    The root is taken from ``CONFIG['output_base_path']``. Existing folders are
    left untouched, so the function can be re-run safely.

    Returns:
        pathlib.Path: the root output directory.
    """
    output_root = Path(CONFIG['output_base_path'])

    # Root first, then one sub-folder per artefact type.
    output_root.mkdir(parents=True, exist_ok=True)
    for subfolder in ('visualizations', 'samples', 'coco_results', 'metrics', 'timing'):
        (output_root / subfolder).mkdir(parents=True, exist_ok=True)

    print("Directory structure created successfully")
    return output_root

# Build the output tree once; later cells save files relative to this root.
base_output_path = setup_directories()

Loaded 6418 images
Found 10959 annotations
Categories: ['person', 'cat', 'dog', 'bus', 'car']


In [10]:
def fix_coco_format(gt_data):
    """Make sure the annotation dict carries ``info`` and ``licenses`` entries.

    Missing entries are filled with placeholders; entries that already exist are
    left as they are. The dictionary is modified in place.

    Args:
        gt_data: annotation dictionary loaded from the COCO-style JSON file.

    Returns:
        The same ``gt_data`` object, for call chaining.
    """
    has_info = 'info' in gt_data
    if not has_info:
        placeholder_info = dict(
            description='Pascal VOC Test Dataset',
            url='',
            version='1.0',
            year=2024,
            contributor='',
            date_created=datetime.now().isoformat(),
        )
        gt_data['info'] = placeholder_info

    # An empty licence list is enough when the key is absent.
    gt_data.setdefault('licenses', [])

    return gt_data

Found 6 model files:
  1. yolo8n_pt_512_128_orig_full.pt
  2. yolo8n_pt_512_64_sama_full.pt
  3. yolo8n_pt_512_16_sama_full.pt
  4. yolo8n_yaml_512_sama_full.pt
  5. yolo8n_pt_512_16_orig_full.pt
  6. yolo8n_yaml_512_orig_full.pt


In [11]:
def load_ground_truth():
    """Read the ground-truth annotations and build the class-index lookups.

    The JSON at ``CONFIG['annotations_path']`` is loaded, patched by
    ``fix_coco_format``, written to ``/tmp/fixed_gt.json`` and opened with the
    COCO API.

    NOTE(review): YOLO class ``i`` is paired with the i-th entry of
    ``categories`` — this assumes the models were trained with that exact class
    order; confirm against the training configuration.

    Returns:
        tuple: ``(coco_gt, gt_data, category_mapping, yolo_to_coco, coco_to_yolo)``
        where ``category_mapping`` and ``yolo_to_coco`` both map a YOLO class
        index to a COCO category id and ``coco_to_yolo`` is the reverse lookup.
    """
    with open(CONFIG['annotations_path'], 'r') as handle:
        # Patch missing top-level fields if needed.
        gt_data = fix_coco_format(json.load(handle))

    category_mapping, yolo_to_coco, coco_to_yolo = {}, {}, {}

    print("Category mapping analysis:")
    for yolo_idx, category in enumerate(gt_data['categories']):
        coco_id = category['id']
        category_mapping[yolo_idx] = coco_id  # YOLO class index -> COCO category ID
        yolo_to_coco[yolo_idx] = coco_id
        coco_to_yolo[coco_id] = yolo_idx
        print(f"  YOLO class {yolo_idx} -> COCO category {coco_id} ({category['name']})")

    # The COCO API is given a file path, so the patched data is persisted first.
    temp_gt_file = '/tmp/fixed_gt.json'
    with open(temp_gt_file, 'w') as handle:
        json.dump(gt_data, handle)
    coco_gt = COCO(temp_gt_file)

    print(f"\nGround truth loaded:")
    for label, key in (('Images', 'images'), ('Annotations', 'annotations'), ('Categories', 'categories')):
        print(f"  {label}: {len(gt_data[key])}")

    return coco_gt, gt_data, category_mapping, yolo_to_coco, coco_to_yolo

def discover_models():
    """List the ``*.pt`` checkpoints found in ``CONFIG['models_path']``.

    The result is sorted by path so that the evaluation order, the plots and the
    report are the same on every run; ``Path.glob`` alone yields files in an
    arbitrary, filesystem-dependent order.

    Returns:
        list[pathlib.Path]: checkpoint paths, sorted; empty if none are found.
    """
    models_path = Path(CONFIG['models_path'])
    model_files = sorted(models_path.glob('*.pt'))

    print(f"\nFound {len(model_files)} YOLO models:")
    for model_file in model_files:
        print(f"  - {model_file.name}")

    return model_files

# Load data
# These module-level names are read by the evaluation, plotting and report cells.
coco_gt, gt_data, category_mapping, yolo_to_coco, coco_to_yolo = load_ground_truth()
model_files = discover_models()

YOLO Model wrapper ready


In [12]:
# Extend the shared configuration with validation options and the class-index
# lookups produced by load_ground_truth(), so they travel with CONFIG.
CONFIG.update({
    'validation_sample_size': 100,
    'show_validation_details': True,
    'category_mapping': category_mapping,
    'yolo_to_coco': yolo_to_coco,
    'coco_to_yolo': coco_to_yolo
})

In [13]:
def create_image_id_mapping(coco_gt, images_path):
    """Match image files on disk to the image ids of the ground truth.

    A file is matched by its exact name first; if that fails, by its stem, so
    ``foo.png`` on disk matches ``foo.jpg`` in the annotations. Files that match
    nothing are skipped.

    Args:
        coco_gt: object exposing ``dataset['images']``, a list of dicts with
            ``file_name`` and ``id`` keys (the COCO API object in this notebook).
        images_path: directory that is searched for ``*.jpg`` and ``*.png``.

    Returns:
        tuple[dict, dict]: ``(filename_to_id, id_to_filename)``.
    """
    images_dir = Path(images_path)
    # Sorted so the mapping (and anything sliced from it) is reproducible;
    # JPEGs still come before PNGs, as before.
    image_files = sorted(images_dir.glob('*.jpg')) + sorted(images_dir.glob('*.png'))

    # Get image info from COCO ground truth
    coco_images = {img['file_name']: img['id'] for img in coco_gt.dataset['images']}

    # Stem index built once, instead of rescanning every COCO filename for each
    # unmatched image. setdefault keeps the first entry per stem, which is the
    # one the former linear scan would have returned.
    coco_stems = {}
    for coco_filename, coco_id in coco_images.items():
        coco_stems.setdefault(Path(coco_filename).stem, coco_id)

    filename_to_id = {}
    id_to_filename = {}

    for img_file in image_files:
        img_filename = img_file.name

        img_id = coco_images.get(img_filename)
        if img_id is None:
            # Fall back to a match that ignores the extension.
            img_id = coco_stems.get(img_file.stem)
        if img_id is None:
            continue

        filename_to_id[img_filename] = img_id
        id_to_filename[img_id] = img_filename

    print(f"Image mapping: {len(filename_to_id)} images matched to COCO IDs")
    return filename_to_id, id_to_filename

def test_single_model(model_path, images_path, coco_gt, category_mapping):
    """Run one YOLO checkpoint over the matched test images.

    Inference uses the thresholds, input size and device stored in ``CONFIG`` so
    that the run matches the settings printed in the report.

    Args:
        model_path: path to the ``.pt`` checkpoint.
        images_path: directory holding the test images.
        coco_gt: COCO ground-truth object, used to match files to image ids.
        category_mapping: dict mapping a YOLO class index to a COCO category id.

    Returns:
        tuple: ``(coco_results, timing_info, model_name)`` where ``coco_results``
        is a list of COCO detection dicts (``image_id``, ``category_id``,
        ``bbox`` as ``[x, y, width, height]``, ``score``), ``timing_info`` holds
        one ``{'image_id', 'inference_time', 'image_file'}`` dict per image and
        ``model_name`` is the checkpoint file stem.
    """
    model_name = Path(model_path).stem
    print(f"\nTesting model: {model_name}")

    # Load model
    model = YOLO(model_path)

    # Only images that exist in the ground truth are evaluated.
    filename_to_id, _ = create_image_id_mapping(coco_gt, images_path)
    images_dir = Path(images_path)

    # Results storage
    coco_results = []
    timing_info = []

    for img_filename, img_id in tqdm(list(filename_to_id.items()),
                                     desc=f"Processing {model_name}"):
        img_file = images_dir / img_filename

        # perf_counter is monotonic and high-resolution, unlike wall-clock time().
        start_time = time.perf_counter()
        results = model(
            str(img_file),
            conf=CONFIG['confidence_threshold'],
            iou=CONFIG['iou_threshold'],
            max_det=CONFIG['max_detections'],
            imgsz=CONFIG['image_size'],   # previously ignored: library default size was used
            device=CONFIG['device'],      # previously ignored: device was chosen implicitly
            verbose=False,
            save=False,
        )
        inference_time = time.perf_counter() - start_time

        timing_info.append({
            'image_id': img_id,
            'inference_time': inference_time,
            'image_file': img_filename
        })

        detections = results[0].boxes
        if detections is None:
            continue

        boxes = detections.xyxy.cpu().numpy()  # x1,y1,x2,y2
        scores = detections.conf.cpu().numpy()
        classes = detections.cls.cpu().numpy()

        for (x1, y1, x2, y2), score, cls in zip(boxes, scores, classes):
            yolo_class = int(cls)

            # Map YOLO class to COCO category
            if yolo_class not in category_mapping:
                print(f"Warning: Unknown YOLO class {yolo_class}")
                continue
            coco_category = category_mapping[yolo_class]

            # COCO bbox format: [x, y, width, height]
            bbox = [float(x1), float(y1), float(x2 - x1), float(y2 - y1)]

            # Degenerate boxes cannot be scored.
            if bbox[2] <= 0 or bbox[3] <= 0:
                continue

            coco_results.append({
                'image_id': int(img_id),
                'category_id': int(coco_category),
                'bbox': bbox,
                'score': float(score)
            })

    print(f"  Generated {len(coco_results)} detections")
    return coco_results, timing_info, model_name

def test_all_models():
    """Evaluate every checkpoint in ``model_files`` and gather the outputs.

    A failure while testing or saving one model is reported and the loop moves
    on to the next model.

    Returns:
        dict: model name -> ``{'coco_results', 'timing_info', 'model_path'}``.
    """
    all_results = {}

    for model_path in model_files:
        try:
            coco_results, timing_info, model_name = test_single_model(
                model_path, CONFIG['images_path'], coco_gt, category_mapping
            )

            all_results[model_name] = dict(
                coco_results=coco_results,
                timing_info=timing_info,
                model_path=str(model_path),
            )

            # Persisting the raw detections is optional.
            if not CONFIG['save_coco_results']:
                continue

            results_file = base_output_path / 'coco_results' / f'{model_name}_results.json'
            with open(results_file, 'w') as out_handle:
                json.dump(coco_results, out_handle, indent=2)

        except Exception as err:
            print(f"Error testing model {model_path}: {err}")
            continue

    return all_results

# Test all models
# Runs inference for every discovered checkpoint; the result feeds the metrics cell.
print("Starting model evaluation...")
model_results = test_all_models()

Metrics calculation functions ready


In [14]:
def calculate_metrics(coco_results, coco_gt, model_name):
    """Compute COCO bounding-box metrics for one model.

    Detections whose image id or category id is unknown to the ground truth are
    dropped before evaluation. Any failure is reported and yields the all-zero
    metrics from ``create_empty_metrics``.

    Args:
        coco_results: list of COCO detection dicts (``image_id``,
            ``category_id``, ``bbox``, ``score``).
        coco_gt: COCO ground-truth object.
        model_name: label used in messages and in the temporary file name.

    Returns:
        dict: the twelve COCOeval summary statistics (entries COCOeval reports as
        -1 are stored as 0.0) plus ``num_detections`` and ``valid_detections``.
    """
    if not coco_results:
        return create_empty_metrics()

    from contextlib import redirect_stdout
    from io import StringIO

    # Order follows COCOeval.stats.
    stat_names = (
        'mAP_50_95', 'mAP_50', 'mAP_75',
        'mAP_small', 'mAP_medium', 'mAP_large',
        'AR_1', 'AR_10', 'AR_100',
        'AR_small', 'AR_medium', 'AR_large',
    )
    temp_results_file = f'/tmp/{model_name}_temp_results.json'

    try:
        # Filter results to only include images that exist in ground truth
        valid_image_ids = set(coco_gt.getImgIds())
        filtered_results = [r for r in coco_results if r['image_id'] in valid_image_ids]

        if not filtered_results:
            print(f"No valid results found for {model_name} - no matching image IDs")
            return create_empty_metrics()

        # Validate category IDs
        valid_categories = set(coco_gt.getCatIds())
        category_valid_results = [r for r in filtered_results if r['category_id'] in valid_categories]

        if len(category_valid_results) != len(filtered_results):
            print(f"Warning: {len(filtered_results) - len(category_valid_results)} detections with invalid category IDs")

        filtered_results = category_valid_results

        if not filtered_results:
            print(f"No valid results found for {model_name} after category validation")
            return create_empty_metrics()

        print(f"Evaluating {len(filtered_results)} valid detections for {model_name}")

        # Hand the detections to the COCO API through a temporary file.
        with open(temp_results_file, 'w') as f:
            json.dump(filtered_results, f)

        coco_dt = coco_gt.loadRes(temp_results_file)

        # COCO evaluation
        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
        coco_eval.evaluate()
        coco_eval.accumulate()

        # summarize() prints its table; the context manager restores stdout even
        # if it raises, so later prints are never swallowed.
        with redirect_stdout(StringIO()):
            coco_eval.summarize()

        metrics = {
            name: float(value) if value != -1 else 0.0
            for name, value in zip(stat_names, coco_eval.stats)
        }
        metrics['num_detections'] = len(coco_results)
        metrics['valid_detections'] = len(filtered_results)

        print(f"COCO metrics for {model_name}: mAP@0.5={metrics['mAP_50']:.3f}, mAP@0.5:0.95={metrics['mAP_50_95']:.3f}")
        return metrics

    except Exception as e:
        print(f"Error calculating COCO metrics for {model_name}: {e}")
        return create_empty_metrics()

    finally:
        # Remove the temporary file on every exit path, including failures.
        try:
            if os.path.exists(temp_results_file):
                os.remove(temp_results_file)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask the result.
            pass

def create_empty_metrics():
    """Return the all-zero metrics record used when evaluation is not possible.

    Returns:
        dict: every score key set to ``0.0`` and both detection counters set to ``0``.
    """
    score_keys = (
        'mAP_50', 'mAP_75', 'mAP_50_95',
        'mAP_small', 'mAP_medium', 'mAP_large',
        'AR_1', 'AR_10', 'AR_100',
        'AR_small', 'AR_medium', 'AR_large',
    )
    empty = dict.fromkeys(score_keys, 0.0)
    empty.update(num_detections=0, valid_detections=0)
    return empty

def calculate_timing_metrics(timing_info):
    """Summarise the per-image inference times of one model.

    Args:
        timing_info: list of dicts, each with an ``inference_time`` entry in
            seconds.

    Returns:
        dict: ``avg_inference_time``, ``fps``, ``total_time``, ``min_time``,
        ``max_time`` and ``std_time`` as plain Python floats. The same keys are
        present (all zero) when ``timing_info`` is empty, so consumers never hit
        a missing key.
    """
    if not timing_info:
        return {
            'avg_inference_time': 0.0, 'fps': 0.0, 'total_time': 0.0,
            'min_time': 0.0, 'max_time': 0.0, 'std_time': 0.0,
        }

    times = np.asarray([t['inference_time'] for t in timing_info], dtype=float)
    mean_time = float(times.mean())

    return {
        'avg_inference_time': mean_time,
        # A zero mean would give inf, which json.dump writes as invalid JSON.
        'fps': 1.0 / mean_time if mean_time > 0 else 0.0,
        'total_time': float(times.sum()),
        'min_time': float(times.min()),
        'max_time': float(times.max()),
        'std_time': float(times.std()),
    }

# Calculate metrics for all models
# all_metrics maps each model name to one flat dict holding both the detection
# metrics and the timing metrics; the plotting and report cells read it.
all_metrics = {}
for model_name, results in model_results.items():
    print(f"\nCalculating metrics for {model_name}...")

    detection_metrics = calculate_metrics(
        results['coco_results'], coco_gt, model_name
    )
    timing_metrics = calculate_timing_metrics(results['timing_info'])

    # Merge both groups; the key names do not overlap.
    all_metrics[model_name] = {
        **detection_metrics,
        **timing_metrics
    }

    # Save individual metrics
    if CONFIG['save_metrics']:
        metrics_file = base_output_path / 'metrics' / f'{model_name}_metrics.json'
        with open(metrics_file, 'w') as f:
            json.dump(all_metrics[model_name], f, indent=2)

print("\nMetrics calculation completed")

Starting evaluation on 6418 images...

Evaluating model 1/6: yolo8n_pt_512_128_orig_full.pt
Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


                                                                                           

  Completed: 4928 detections, 0.217 precision, 0.097 recall

Evaluating model 2/6: yolo8n_pt_512_64_sama_full.pt


                                                                                          

  Completed: 4913 detections, 0.219 precision, 0.098 recall

Evaluating model 3/6: yolo8n_pt_512_16_sama_full.pt


                                                                                          

  Completed: 4848 detections, 0.216 precision, 0.095 recall

Evaluating model 4/6: yolo8n_yaml_512_sama_full.pt


                                                                                         

  Completed: 4191 detections, 0.215 precision, 0.082 recall

Evaluating model 5/6: yolo8n_pt_512_16_orig_full.pt


                                                                                          

  Completed: 4822 detections, 0.217 precision, 0.096 recall

Evaluating model 6/6: yolo8n_yaml_512_orig_full.pt


                                                                                         

  Completed: 4150 detections, 0.217 precision, 0.082 recall


In [15]:
# Global plotting defaults for the figures created below.
# NOTE(review): CONFIG['plot_style'] is defined but the style is hard-coded here.
plt.style.use('default')
sns.set_palette("husl")

def create_comprehensive_performance_plots():
    """Render the six-panel model comparison figure and save it as a PNG.

    Reads the module-level ``all_metrics`` mapping and writes
    ``comprehensive_performance.png`` to the ``visualizations`` folder below
    ``base_output_path``. Panels: grouped mAP bars, FPS bars, FPS versus mAP@0.5
    scatter, detection counts, mean inference time and, for at most six models,
    a radar chart.

    Returns:
        None. Nothing is drawn when ``all_metrics`` is empty.
    """
    models = list(all_metrics.keys())
    if not models:
        # max(fps) and the charts below need at least one model.
        print("No metrics available - skipping comprehensive performance plots")
        return

    # Prepare data for plotting
    map_50 = [all_metrics[m]['mAP_50'] for m in models]
    map_75 = [all_metrics[m]['mAP_75'] for m in models]
    map_50_95 = [all_metrics[m]['mAP_50_95'] for m in models]
    fps = [all_metrics[m]['fps'] for m in models]
    avg_time = [all_metrics[m]['avg_inference_time'] for m in models]
    num_detections = [all_metrics[m]['num_detections'] for m in models]

    fig, axes = plt.subplots(3, 2, figsize=(18, 20))

    # 1. mAP Comparison
    ax = axes[0, 0]
    x = np.arange(len(models))
    width = 0.25
    ax.bar(x - width, map_50, width, label='mAP@0.5', alpha=0.8)
    ax.bar(x, map_75, width, label='mAP@0.75', alpha=0.8)
    ax.bar(x + width, map_50_95, width, label='mAP@0.5:0.95', alpha=0.8)
    ax.set_xlabel('Models')
    ax.set_ylabel('mAP Score')
    ax.set_title('mAP Comparison Across IoU Thresholds')
    ax.set_xticks(x)
    ax.set_xticklabels([m.replace('_', '\n') for m in models], rotation=0, fontsize=8)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Add value labels on bars
    for i, (m50, m75, m50_95) in enumerate(zip(map_50, map_75, map_50_95)):
        ax.text(i - width, m50 + 0.01, f'{m50:.3f}', ha='center', fontsize=8)
        ax.text(i, m75 + 0.01, f'{m75:.3f}', ha='center', fontsize=8)
        ax.text(i + width, m50_95 + 0.01, f'{m50_95:.3f}', ha='center', fontsize=8)

    # 2. FPS Comparison
    ax = axes[0, 1]
    bars = ax.bar(models, fps, color='green', alpha=0.7)
    ax.set_xlabel('Models')
    ax.set_ylabel('FPS')
    ax.set_title('Inference Speed (FPS)')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    for bar, f in zip(bars, fps):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
                f'{f:.1f}', ha='center', va='bottom')

    # 3. Speed vs Accuracy Trade-off
    ax = axes[1, 0]
    scatter = ax.scatter(fps, map_50, s=200, alpha=0.7, c=map_50_95, cmap='viridis')
    for i, model in enumerate(models):
        ax.annotate(model.replace('_', '\n'), (fps[i], map_50[i]),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax.set_xlabel('FPS')
    ax.set_ylabel('mAP@0.5')
    ax.set_title('Speed vs Accuracy Trade-off\n(Color represents mAP@0.5:0.95)')
    ax.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=ax, label='mAP@0.5:0.95')

    # 4. Detection Count Comparison
    ax = axes[1, 1]
    bars = ax.bar(models, num_detections, color='orange', alpha=0.7)
    ax.set_xlabel('Models')
    ax.set_ylabel('Number of Detections')
    ax.set_title('Total Detections per Model')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    for bar, count in zip(bars, num_detections):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + 50,
                f'{count}', ha='center', va='bottom')

    # 5. Inference Time Distribution
    ax = axes[2, 0]
    ax.bar(models, avg_time, color='purple', alpha=0.7)
    ax.set_xlabel('Models')
    ax.set_ylabel('Average Inference Time (s)')
    ax.set_title('Average Inference Time per Image')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    # 6. Radar Chart for Multi-metric Comparison
    if len(models) <= 6:  # Only if not too many models
        # NOTE(review): the last axis is labelled "Precision" but repeats mAP@0.5.
        categories = ['mAP@0.5', 'mAP@0.75', 'mAP@0.5:0.95', 'FPS\n(normalized)', 'Precision\n(mAP@0.5)']

        # Normalize FPS for radar chart; guard against an all-zero FPS list.
        max_fps = max(fps)
        normalized_fps = [f / max_fps if max_fps > 0 else 0.0 for f in fps]

        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        angles += angles[:1]  # Complete the circle

        # Swap the cartesian placeholder for a polar axes in the same grid slot;
        # otherwise an empty cartesian frame stays behind the radar chart.
        axes[2, 1].remove()
        ax_radar = fig.add_subplot(3, 2, 6, projection='polar')

        colors = plt.cm.tab10(np.linspace(0, 1, len(models)))

        for i, model in enumerate(models):
            values = [map_50[i], map_75[i], map_50_95[i], normalized_fps[i], map_50[i]]
            values += values[:1]  # Complete the circle

            ax_radar.plot(angles, values, 'o-', linewidth=2, label=model, color=colors[i])
            ax_radar.fill(angles, values, alpha=0.1, color=colors[i])

        ax_radar.set_xticks(angles[:-1])
        ax_radar.set_xticklabels(categories)
        ax_radar.set_ylim(0, 1)
        ax_radar.set_title('Multi-metric Model Comparison\n(Radar Chart)')
        ax_radar.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    else:
        axes[2, 1].text(0.5, 0.5, 'Too many models\nfor radar chart',
                        transform=axes[2, 1].transAxes, ha='center', va='center', fontsize=12)
        axes[2, 1].set_title('Radar Chart (Skipped)')

    fig.tight_layout()
    fig.savefig(base_output_path / 'visualizations' / 'comprehensive_performance.png',
                dpi=CONFIG['dpi'], bbox_inches='tight')
    # Close this figure explicitly rather than whichever one is "current".
    plt.close(fig)

def create_detailed_analysis_plots():
    """Render the four-panel detailed analysis figure and save it as a PNG.

    Reads the module-level ``all_metrics`` mapping and writes
    ``detailed_analysis.png`` to the ``visualizations`` folder below
    ``base_output_path``. Panels: efficiency score (mAP@0.5 x FPS), AR@100
    versus mAP@0.5 scatter, per-metric ranking heatmap and the distributions of
    mAP@0.5 and FPS.

    Returns:
        None. Nothing is drawn when ``all_metrics`` is empty.
    """
    models = list(all_metrics.keys())
    if not models:
        print("No metrics available - skipping detailed analysis plots")
        return

    map_50 = [all_metrics[m]['mAP_50'] for m in models]
    fps = [all_metrics[m]['fps'] for m in models]

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Efficiency Score (mAP * FPS)
    ax = axes[0, 0]
    efficiency = [m * f for m, f in zip(map_50, fps)]
    bars = ax.bar(models, efficiency, color='teal', alpha=0.7)
    ax.set_xlabel('Models')
    ax.set_ylabel('Efficiency Score (mAP@0.5 × FPS)')
    ax.set_title('Model Efficiency Analysis')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    for bar, eff in zip(bars, efficiency):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
                f'{eff:.2f}', ha='center', va='bottom')

    # 2. Precision vs Recall (using mAP as approximation)
    ax = axes[0, 1]
    ar_100 = [all_metrics[m]['AR_100'] for m in models]
    scatter = ax.scatter(ar_100, map_50, s=150, alpha=0.7, c=fps, cmap='plasma')
    for i, model in enumerate(models):
        ax.annotate(model.replace('_', '\n'), (ar_100[i], map_50[i]),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax.set_xlabel('Average Recall (AR@100)')
    ax.set_ylabel('mAP@0.5')
    ax.set_title('Precision vs Recall Analysis')
    ax.grid(True, alpha=0.3)
    # The marker colour encodes FPS; without a colorbar it cannot be read.
    plt.colorbar(scatter, ax=ax, label='FPS')

    # 3. Model Ranking by different metrics
    ranking_metrics = ['mAP_50', 'mAP_75', 'mAP_50_95', 'fps']
    ranking_data = []

    for metric in ranking_metrics:
        values = [all_metrics[m][metric] for m in models]
        # argsort of the descending order is its inverse permutation, i.e. the
        # 0-based rank of every model; +1 makes the best model rank 1.
        ranks = np.argsort(np.argsort(values)[::-1]) + 1
        ranking_data.append(ranks)

    ranking_df = pd.DataFrame(ranking_data, index=ranking_metrics, columns=models)

    ax = axes[1, 0]
    im = ax.imshow(ranking_df.values, cmap='RdYlGn_r', aspect='auto')
    ax.set_xticks(range(len(models)))
    ax.set_xticklabels([m.replace('_', '\n') for m in models], rotation=45)
    ax.set_yticks(range(len(ranking_metrics)))
    ax.set_yticklabels(ranking_metrics)
    ax.set_title('Model Ranking Heatmap\n(1=Best, Higher=Worse)')

    # Add ranking numbers
    for i in range(len(ranking_metrics)):
        for j in range(len(models)):
            ax.text(j, i, f'{ranking_df.iloc[i, j]:.0f}',
                    ha='center', va='center', fontweight='bold')

    plt.colorbar(im, ax=ax)

    # 4. Performance Distribution
    ax = axes[1, 1]
    ax.hist(map_50, bins=5, alpha=0.7, label='mAP@0.5', color='blue')
    ax.set_xlabel('mAP@0.5', color='blue')
    ax.set_ylabel('Number of Models')
    ax.set_title('mAP@0.5 and FPS Distribution', pad=35)
    ax.grid(True, alpha=0.3)

    # mAP and FPS live on different scales, so FPS gets its own x-axis (twiny).
    # twinx would share the x-axis and squash the mAP bars next to zero.
    ax_fps = ax.twiny()
    ax_fps.hist(fps, bins=5, alpha=0.5, label='FPS', color='red')
    ax_fps.set_xlabel('FPS', color='red')

    fig.tight_layout()
    fig.savefig(base_output_path / 'visualizations' / 'detailed_analysis.png',
                dpi=CONFIG['dpi'], bbox_inches='tight')
    # Close this figure explicitly rather than whichever one is "current".
    plt.close(fig)

# Generate enhanced visualizations
# Both functions read all_metrics and save PNG files under base_output_path.
print("Creating comprehensive visualizations...")
create_comprehensive_performance_plots()
create_detailed_analysis_plots()

Results saved to evaluation_results/metrics.json


In [None]:
def get_ground_truth_for_image(coco_gt, img_id):
    """Collect the ground-truth boxes and class names of one image.

    Args:
        coco_gt: COCO ground-truth object (``getAnnIds``, ``loadAnns``, ``loadCats``).
        img_id: COCO image id.

    Returns:
        tuple[list, list]: boxes as ``[x1, y1, x2, y2]`` and the matching
        category names, in annotation order.
    """
    annotations = coco_gt.loadAnns(coco_gt.getAnnIds(imgIds=[img_id]))

    # COCO stores [x, y, width, height]; convert to corner coordinates.
    gt_boxes = [
        [left, top, left + box_w, top + box_h]
        for left, top, box_w, box_h in (ann['bbox'] for ann in annotations)
    ]

    # Resolve every category id to its readable name.
    gt_labels = [
        coco_gt.loadCats([ann['category_id']])[0]['name']
        for ann in annotations
    ]

    return gt_boxes, gt_labels

def create_sample_detections():
    """Save side-by-side figures of ground truth and each model's predictions.

    For the first ``CONFIG['num_sample_images']`` matched images, one figure is
    written to the ``samples`` folder below ``base_output_path``: the first panel
    shows the ground-truth boxes (green), the following panels show each model's
    detections (red).

    Every checkpoint is loaded once up front rather than once per image, and
    inference uses the same ``CONFIG`` thresholds, input size and device as the
    evaluation run so the pictures reflect the evaluated behaviour.

    Returns:
        None.
    """
    # Create image ID mapping for samples
    filename_to_id, _ = create_image_id_mapping(coco_gt, CONFIG['images_path'])

    # Get sample images
    available_images = list(filename_to_id.keys())[:CONFIG['num_sample_images']]

    # Load each checkpoint a single time; a failed load is remembered so the
    # corresponding panel can show the error instead of aborting the run.
    loaded_models = []
    for model_path in model_files:
        model_name = Path(model_path).stem
        try:
            loaded_models.append((model_name, YOLO(model_path), None))
        except Exception as e:
            print(f"Error loading {model_name}: {e}")
            loaded_models.append((model_name, None, e))

    for img_idx, img_filename in enumerate(available_images):
        print(f"Processing sample image {img_idx + 1}/{len(available_images)}: {img_filename}")

        img_file = Path(CONFIG['images_path']) / img_filename
        img_id = filename_to_id[img_filename]

        # Load image
        img = cv2.imread(str(img_file))
        if img is None:
            print(f"Could not load image: {img_file}")
            continue

        # OpenCV reads BGR; matplotlib expects RGB.
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Get ground truth
        gt_boxes, gt_labels = get_ground_truth_for_image(coco_gt, img_id)

        # One panel for the ground truth plus one per model, three per row.
        total_plots = len(loaded_models) + 1
        cols = min(3, total_plots)
        rows = (total_plots + cols - 1) // cols

        # squeeze=False always yields a 2-D array, so one flatten covers all layouts.
        fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
        axes = axes.flatten()

        # Plot ground truth first
        ax = axes[0]
        ax.imshow(img_rgb)

        for gt_box, gt_label in zip(gt_boxes, gt_labels):
            x1, y1, x2, y2 = gt_box
            rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                 fill=False, color='green', linewidth=2)
            ax.add_patch(rect)
            ax.text(x1, y1 - 5, gt_label,
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="green", alpha=0.7),
                    fontsize=8, color='white')

        ax.set_title(f'Ground Truth\n({len(gt_boxes)} annotations)')
        ax.axis('off')

        # Plot model predictions
        for model_idx, (model_name, model, load_error) in enumerate(loaded_models):
            ax = axes[model_idx + 1]

            try:
                if model is None:
                    raise RuntimeError(f"model could not be loaded: {load_error}")

                # Same inference settings as the evaluation run.
                results = model(
                    str(img_file),
                    conf=CONFIG['confidence_threshold'],
                    iou=CONFIG['iou_threshold'],
                    max_det=CONFIG['max_detections'],
                    imgsz=CONFIG['image_size'],
                    device=CONFIG['device'],
                    verbose=False,
                    save=False,
                )

                ax.imshow(img_rgb)

                num_detections = 0
                if results[0].boxes is not None:
                    boxes = results[0].boxes.xyxy.cpu().numpy()
                    scores = results[0].boxes.conf.cpu().numpy()
                    classes = results[0].boxes.cls.cpu().numpy()
                    num_detections = len(boxes)

                    for (x1, y1, x2, y2), score, cls in zip(boxes, scores, classes):
                        yolo_class = int(cls)

                        # Get class name
                        if yolo_class in category_mapping:
                            coco_cat_id = category_mapping[yolo_class]
                            class_name = coco_gt.loadCats([coco_cat_id])[0]['name']
                        else:
                            class_name = f"class_{yolo_class}"

                        # Draw bounding box
                        rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                             fill=False, color='red', linewidth=2)
                        ax.add_patch(rect)

                        # Add label
                        ax.text(x1, y1 - 5, f'{class_name} {score:.2f}',
                                bbox=dict(boxstyle="round,pad=0.3", facecolor="red", alpha=0.7),
                                fontsize=8, color='white')

                ax.set_title(f'{model_name}\n({num_detections} detections)')
                ax.axis('off')

            except Exception as e:
                print(f"Error processing {model_name}: {e}")
                ax.text(0.5, 0.5, f'Error: {model_name}',
                        transform=ax.transAxes, ha='center', va='center')
                ax.axis('off')

        # Hide unused subplots
        for i in range(total_plots, len(axes)):
            axes[i].axis('off')

        fig.tight_layout()
        fig.savefig(base_output_path / 'samples' / f'sample_{img_idx + 1}_{img_filename}',
                    dpi=CONFIG['dpi'], bbox_inches='tight')
        # Close this figure explicitly; many figures are created in this loop.
        plt.close(fig)

        print(f"  Saved sample with {len(gt_boxes)} GT annotations")

    print(f"Generated {len(available_images)} sample detection images")

# Generate sample detections
# Writes one comparison figure per sample image into the 'samples' output folder.
print("\nCreating sample detections with ground truth...")
create_sample_detections()

In [None]:
def generate_markdown_report():
    """Build the Markdown evaluation report and write it to disk.

    Takes no arguments. Reads the notebook-level globals ``gt_data`` (its
    ``images``, ``annotations`` and ``categories`` lists), ``all_metrics``
    (per-model metric dicts), ``model_results``, ``CONFIG`` and
    ``base_output_path``.

    The report is written to ``base_output_path / 'evaluation_report.md'`` as
    UTF-8, because it contains emoji that a non-UTF-8 platform default
    encoding cannot represent.

    NOTE(review): the "Validation Status" section is static text; this function
    does not run any validation itself.

    Returns:
        None. The location of the written report is printed.

    Raises:
        ValueError: if ``all_metrics`` is empty, since no model can be ranked.
    """
    if not all_metrics:
        raise ValueError("generate_markdown_report: all_metrics is empty - nothing to report")

    def _efficiency(metrics):
        """Efficiency score used throughout the report: mAP@0.5 multiplied by FPS."""
        return metrics['mAP_50'] * metrics['fps']

    def _rank_prefix(position):
        """Medal emoji for the first three places, '<n>.' afterwards (0-based position)."""
        medals = ("🥇", "🥈", "🥉")
        return medals[position] if position < len(medals) else f"{position + 1}."

    def _count_matches(subdir, pattern):
        """Number of entries matching ``pattern`` directly inside an output sub-folder."""
        return sum(1 for _ in (base_output_path / subdir).glob(pattern))

    # One timestamp for the whole report so header and footer always agree
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Calculate additional statistics
    total_gt_annotations = len(gt_data['annotations'])
    total_images = len(gt_data['images'])
    total_categories = len(gt_data['categories'])

    # id -> name lookup built once (first definition wins) instead of a linear
    # search per new category
    category_names = {}
    for cat in gt_data['categories']:
        category_names.setdefault(cat['id'], cat['name'])

    # Get category statistics; annotations that reference an undeclared category
    # are reported under a placeholder name instead of aborting the report
    category_stats = {}
    for ann in gt_data['annotations']:
        cat_id = ann['category_id']
        if cat_id not in category_stats:
            cat_name = category_names.get(cat_id, f"unknown_{cat_id}")
            category_stats[cat_id] = {'name': cat_name, 'count': 0}
        category_stats[cat_id]['count'] += 1

    # Model performance rankings
    models_by_map50 = sorted(all_metrics.items(), key=lambda x: x[1]['mAP_50'], reverse=True)
    models_by_speed = sorted(all_metrics.items(), key=lambda x: x[1]['fps'], reverse=True)
    models_by_efficiency = sorted(all_metrics.items(), key=lambda x: _efficiency(x[1]), reverse=True)

    report_content = f"""# 🎯 YOLO Model Evaluation Report

## 📋 Experiment Overview

| **Parameter** | **Value** |
|---------------|-----------|
| **Experiment Name** | `{CONFIG['experiment_name']}` |
| **Date & Time** | {generated_at} |
| **Dataset** | Pascal VOC Test Dataset |
| **Models Tested** | {len(model_results)} |
| **Test Images** | {total_images} |
| **Ground Truth Annotations** | {total_gt_annotations} |
| **Object Categories** | {total_categories} |

## ⚙️ Configuration Settings

| **Setting** | **Value** |
|-------------|-----------|
| **Confidence Threshold** | {CONFIG['confidence_threshold']} |
| **IoU Threshold** | {CONFIG['iou_threshold']} |
| **Max Detections** | {CONFIG['max_detections']} |
| **Image Size** | {CONFIG['image_size']} |
| **Device** | {CONFIG['device']} |

## 📊 Dataset Statistics

### Category Distribution
| **Category** | **Annotations** | **Percentage** |
|--------------|-----------------|----------------|
"""

    # Add category statistics
    for cat_id, info in sorted(category_stats.items(), key=lambda x: x[1]['count'], reverse=True):
        percentage = (info['count'] / total_gt_annotations) * 100
        report_content += f"| {info['name']} | {info['count']} | {percentage:.1f}% |\n"

    report_content += """
## 🏆 Model Performance Comparison

### Overall Performance Table
| **Model** | **mAP@0.5** | **mAP@0.75** | **mAP@0.5:0.95** | **FPS** | **Avg Time (s)** | **Detections** | **Efficiency** |
|-----------|-------------|--------------|------------------|---------|------------------|----------------|-----------------|
"""

    # Add model performance rows
    for model_name in sorted(all_metrics.keys()):
        metrics = all_metrics[model_name]
        efficiency = _efficiency(metrics)
        report_content += f"| {model_name} | {metrics['mAP_50']:.3f} | {metrics['mAP_75']:.3f} | {metrics['mAP_50_95']:.3f} | {metrics['fps']:.1f} | {metrics['avg_inference_time']:.3f} | {metrics['num_detections']} | {efficiency:.2f} |\n"

    # Performance rankings (top five per criterion)
    report_content += """
### 🥇 Performance Rankings

#### By Accuracy (mAP@0.5)
"""
    for i, (model_name, metrics) in enumerate(models_by_map50[:5]):
        report_content += f"{_rank_prefix(i)} **{model_name}** - {metrics['mAP_50']:.3f}\n"

    report_content += """
#### By Speed (FPS)
"""
    for i, (model_name, metrics) in enumerate(models_by_speed[:5]):
        report_content += f"{_rank_prefix(i)} **{model_name}** - {metrics['fps']:.1f} FPS\n"

    report_content += """
#### By Efficiency (mAP@0.5 × FPS)
"""
    for i, (model_name, metrics) in enumerate(models_by_efficiency[:5]):
        report_content += f"{_rank_prefix(i)} **{model_name}** - {_efficiency(metrics):.2f}\n"

    # Detailed analysis
    best_accuracy = max(all_metrics.items(), key=lambda x: x[1]['mAP_50'])
    fastest_model = max(all_metrics.items(), key=lambda x: x[1]['fps'])
    most_detections = max(all_metrics.items(), key=lambda x: x[1]['num_detections'])
    best_efficiency = max(all_metrics.items(), key=lambda x: _efficiency(x[1]))
    best_efficiency_score = _efficiency(best_efficiency[1])

    report_content += f"""
## 📈 Detailed Performance Analysis

### Key Findings
- **🎯 Accuracy Champion**: {best_accuracy[0]} achieved the highest mAP@0.5 of **{best_accuracy[1]['mAP_50']:.3f}**
- **⚡ Speed Champion**: {fastest_model[0]} processes images at **{fastest_model[1]['fps']:.1f} FPS**
- **🔍 Detection Champion**: {most_detections[0]} generated **{most_detections[1]['num_detections']} detections**
- **⚖️ Efficiency Champion**: {best_efficiency[0]} with efficiency score of **{best_efficiency_score:.2f}**

### Model-Specific Analysis
"""

    # Add detailed analysis for each model
    for model_name, metrics in all_metrics.items():
        efficiency = _efficiency(metrics)
        report_content += f"""
#### {model_name}
- **Accuracy**: mAP@0.5 = {metrics['mAP_50']:.3f} | mAP@0.75 = {metrics['mAP_75']:.3f} | mAP@0.5:0.95 = {metrics['mAP_50_95']:.3f}
- **Speed**: {metrics['fps']:.1f} FPS ({metrics['avg_inference_time']:.3f}s per image)
- **Detections**: {metrics['num_detections']} total ({metrics['valid_detections']} valid)
- **Efficiency Score**: {efficiency:.2f}
"""

    # Samples are saved as 'sample_<n>_<original image filename>', so they keep the
    # source image's extension; count by prefix rather than by '*.png'.
    sample_file_count = _count_matches('samples', 'sample_*')
    coco_file_count = _count_matches('coco_results', '*.json')
    metrics_file_count = _count_matches('metrics', '*.json')

    # Spread figures for the trade-off section
    map50_values = [m['mAP_50'] for m in all_metrics.values()]
    fps_values = [m['fps'] for m in all_metrics.values()]
    detection_counts = [m['num_detections'] for m in all_metrics.values()]
    map50_spread = max(map50_values) - min(map50_values)

    report_content += f"""
## 📁 Generated Files

### 📊 Visualizations
- `visualizations/comprehensive_performance.png` - Complete performance comparison
- `visualizations/detailed_analysis.png` - In-depth analysis charts
- `samples/` - Sample detection visualizations ({sample_file_count} files)

### 📄 Data Files
- `coco_results/` - COCO format results ({coco_file_count} files)
- `metrics/` - Detailed metrics JSON files ({metrics_file_count} files)

## 🎯 Recommendations

### For Different Use Cases:

#### 🎯 High Accuracy Applications
- **Recommended**: {best_accuracy[0]}
- **Reason**: Highest mAP@0.5 of {best_accuracy[1]['mAP_50']:.3f}
- **Use Cases**: Medical imaging, security systems, quality control

#### ⚡ Real-Time Applications
- **Recommended**: {fastest_model[0]}
- **Reason**: Fastest processing at {fastest_model[1]['fps']:.1f} FPS
- **Use Cases**: Video surveillance, autonomous vehicles, live streaming

#### ⚖️ Balanced Performance
- **Recommended**: {best_efficiency[0]}
- **Reason**: Best efficiency score of {best_efficiency_score:.2f}
- **Use Cases**: General object detection, mobile applications

### Performance Trade-offs
- **Accuracy vs Speed**: There's a {map50_spread:.3f} difference in mAP@0.5 between best and worst models
- **Speed Range**: FPS ranges from {min(fps_values):.1f} to {max(fps_values):.1f}
- **Detection Volume**: Models generate between {min(detection_counts)} and {max(detection_counts)} detections

## 📝 Technical Notes

### COCO Evaluation Metrics
- **mAP@0.5**: Mean Average Precision at IoU threshold 0.5
- **mAP@0.75**: Mean Average Precision at IoU threshold 0.75
- **mAP@0.5:0.95**: Mean Average Precision averaged over IoU thresholds 0.5 to 0.95
- **AR**: Average Recall at different detection limits

### Validation Status
- Image ID mapping: ✅ Validated
- Category mapping: ✅ Validated
- COCO format compliance: ✅ Validated
- Detection format: ✅ Validated

---
*📊 Report generated automatically by YOLO Model Evaluation System*
*🕐 Generated on: {generated_at}*
*📧 For questions or issues, check the validation logs in the evaluation output*
"""

    # Save report; explicit UTF-8 because the content contains emoji
    report_path = base_output_path / 'evaluation_report.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report_content)

    print("📄 Enhanced markdown report generated successfully")
    print(f"📍 Location: {report_path}")

# Generate report: writes base_output_path / 'evaluation_report.md' from the
# notebook-level gt_data, all_metrics, model_results and CONFIG.
# NOTE(review): the report's "Validation Status" section is static text, while the
# validation functions are defined and executed in later cells - confirm the
# intended cell order.
generate_markdown_report()

In [18]:
# ============================================================================
# 🔍 DATA VALIDATION AND INTEGRITY CHECKING BLOCK
# ============================================================================

def validate_image_id_mapping():
    """Compare the image IDs used by the predictions with those of the ground truth.

    Reads the notebook-level ``coco_gt``, ``model_results`` and ``CONFIG``. Prints
    a summary (including up to five example IDs for each kind of mismatch) and
    counts the ``*.jpg`` / ``*.png`` files found in ``CONFIG['images_path']``.

    Returns:
        dict with the integer counts ``gt_image_ids``, ``pred_image_ids``,
        ``common_ids``, ``missing_in_pred``, ``extra_in_pred`` and
        ``files_on_disk``.
    """
    print("🔍 Validating image ID mapping...")

    # IDs known to the ground truth
    gt_image_ids = set(coco_gt.getImgIds())

    # IDs referenced by at least one detection of any model
    pred_image_ids = set()
    for model_output in model_results.values():
        for det in model_output['coco_results']:
            pred_image_ids.add(det['image_id'])

    # Overlap and the two one-sided differences
    shared_ids = gt_image_ids & pred_image_ids
    only_in_gt = gt_image_ids - pred_image_ids
    only_in_pred = pred_image_ids - gt_image_ids

    print(f"  ✅ Ground truth images: {len(gt_image_ids)}")
    print(f"  ✅ Prediction images: {len(pred_image_ids)}")
    print(f"  ✅ Common images: {len(shared_ids)}")

    if only_in_gt:
        print(f"  ⚠️  Missing in predictions: {len(only_in_gt)} images")
        print(f"      Example missing IDs: {list(only_in_gt)[:5]}")

    if only_in_pred:
        print(f"  ⚠️  Extra in predictions: {len(only_in_pred)} images")
        print(f"      Example extra IDs: {list(only_in_pred)[:5]}")

    # Count the image files actually present on disk
    image_dir = Path(CONFIG['images_path'])
    files_on_disk = 0
    for pattern in ('*.jpg', '*.png'):
        files_on_disk += len(list(image_dir.glob(pattern)))
    print(f"  ✅ Image files on disk: {files_on_disk}")

    summary = {}
    summary['gt_image_ids'] = len(gt_image_ids)
    summary['pred_image_ids'] = len(pred_image_ids)
    summary['common_ids'] = len(shared_ids)
    summary['missing_in_pred'] = len(only_in_gt)
    summary['extra_in_pred'] = len(only_in_pred)
    summary['files_on_disk'] = files_on_disk
    return summary

def validate_category_mapping():
    """Check that every predicted category ID exists in the ground truth.

    Reads the notebook-level ``gt_data``, ``model_results`` and
    ``category_mapping``. Prints the counts, any category IDs that appear in
    predictions but not in ``gt_data['categories']``, and the full
    YOLO-class -> COCO-category mapping table.

    Returns:
        dict with the integer counts ``gt_categories``, ``pred_categories``,
        ``valid_pred_categories`` and ``invalid_categories``.
    """
    print("🔍 Validating category mapping...")

    # id -> name for each ground-truth category
    gt_categories = {}
    for entry in gt_data['categories']:
        gt_categories[entry['id']] = entry['name']

    # Every category ID emitted by any model
    predicted_ids = set()
    for model_output in model_results.values():
        for det in model_output['coco_results']:
            predicted_ids.add(det['category_id'])

    # Split predicted IDs into known and unknown ones
    known_ids = set(gt_categories.keys())
    recognised_ids = predicted_ids.intersection(known_ids)
    invalid_categories = predicted_ids - known_ids

    print(f"  ✅ Ground truth categories: {len(gt_categories)}")
    print(f"  ✅ Categories in predictions: {len(predicted_ids)}")
    print(f"  ✅ Valid prediction categories: {len(recognised_ids)}")

    if invalid_categories:
        print(f"  ❌ Invalid categories in predictions: {invalid_categories}")

    # Dump the configured class mapping for manual inspection
    print("  📋 Category mapping:")
    for yolo_class in category_mapping:
        coco_cat = category_mapping[yolo_class]
        label = gt_categories.get(coco_cat, 'Unknown')
        print(f"      YOLO {yolo_class} -> COCO {coco_cat} ({label})")

    return dict(
        gt_categories=len(gt_categories),
        pred_categories=len(predicted_ids),
        valid_pred_categories=len(recognised_ids),
        invalid_categories=len(invalid_categories),
    )

def validate_detection_format():
    """Validate the structure of every detection in ``model_results``.

    A detection is valid when it carries ``image_id``, ``category_id``, ``bbox``
    and ``score`` and its bbox is ``[x, y, width, height]`` with positive
    width/height and non-negative x/y.

    A bbox that is not a four-element sequence is counted as invalid instead of
    raising. Detections that lack a required field are counted separately, so
    ``total == valid + invalid_bbox + missing_fields``.

    Returns:
        dict with ``total_detections``, ``valid_detections``,
        ``invalid_bbox_count``, ``negative_coords_count`` and
        ``missing_fields_count``.
    """
    print("🔍 Validating detection format...")

    required_fields = ('image_id', 'category_id', 'bbox', 'score')

    total_detections = 0
    valid_detections = 0
    invalid_bbox_count = 0
    negative_coords_count = 0
    missing_fields_count = 0

    for model_name, results in model_results.items():
        model_detections = len(results['coco_results'])
        model_valid = 0

        for detection in results['coco_results']:
            total_detections += 1

            # Check required fields
            if not all(field in detection for field in required_fields):
                missing_fields_count += 1
                continue

            # Check bbox format [x, y, width, height]; anything that does not
            # unpack into exactly four values (empty, short, long, None) is malformed
            try:
                x, y, width, height = detection['bbox']
            except (TypeError, ValueError):
                invalid_bbox_count += 1
                continue

            if width > 0 and height > 0 and x >= 0 and y >= 0:
                valid_detections += 1
                model_valid += 1
            else:
                invalid_bbox_count += 1
                if x < 0 or y < 0:
                    negative_coords_count += 1

        print(f"  📊 {model_name}: {model_valid}/{model_detections} valid detections")

    print(f"  ✅ Total detections: {total_detections}")
    print(f"  ✅ Valid detections: {valid_detections}")
    print(f"  ❌ Invalid bbox format: {invalid_bbox_count}")
    print(f"  ❌ Negative coordinates: {negative_coords_count}")
    print(f"  ❌ Missing required fields: {missing_fields_count}")

    return {
        'total_detections': total_detections,
        'valid_detections': valid_detections,
        'invalid_bbox_count': invalid_bbox_count,
        'negative_coords_count': negative_coords_count,
        'missing_fields_count': missing_fields_count
    }

def validate_coco_format():
    """Check that the ground-truth dict ``gt_data`` has the expected COCO layout.

    Verifies that the top-level keys ``images``, ``annotations`` and
    ``categories`` exist, then inspects the first annotation and the first
    category for their required fields. A missing top-level key is reported and
    makes the corresponding structure flag ``False`` rather than raising
    ``KeyError``; a section that is present but empty is treated as valid.

    Returns:
        dict with the booleans ``gt_structure_valid``,
        ``annotation_structure_valid`` and ``category_structure_valid``.
    """
    print("🔍 Validating COCO format compliance...")

    # Check ground truth structure
    required_gt_fields = ['images', 'annotations', 'categories']
    missing_gt_fields = [field for field in required_gt_fields if field not in gt_data]

    if missing_gt_fields:
        print(f"  ❌ Missing GT fields: {missing_gt_fields}")
    else:
        print("  ✅ Ground truth structure: Valid")

    def _first_entry_is_valid(section, label, required_fields):
        """Validate the first entry of ``gt_data[section]`` against ``required_fields``."""
        if section not in gt_data:
            # Already reported as a missing GT field above
            return False
        entries = gt_data[section]
        if not entries:
            # Nothing to sample; keep treating an empty section as valid
            return True
        missing = [field for field in required_fields if field not in entries[0]]
        if missing:
            print(f"  ❌ Missing {label.lower()} fields: {missing}")
        else:
            print(f"  ✅ {label} structure: Valid")
        return not missing

    # Check annotations structure
    annotation_structure_valid = _first_entry_is_valid(
        'annotations', 'Annotation', ['id', 'image_id', 'category_id', 'bbox', 'area'])

    # Check categories structure
    category_structure_valid = _first_entry_is_valid(
        'categories', 'Category', ['id', 'name', 'supercategory'])

    return {
        'gt_structure_valid': len(missing_gt_fields) == 0,
        'annotation_structure_valid': annotation_structure_valid,
        'category_structure_valid': category_structure_valid
    }

def validate_model_outputs():
    """Print per-model sanity statistics for detections and inference timing.

    For each entry of the notebook-level ``model_results`` this reports:
      * detections per image (mean / max / min over images that have at least
        one detection) and how many images exceed 80% of
        ``CONFIG['max_detections']``;
      * mean and standard deviation of ``inference_time`` and the number of
        timings above mean + 2 standard deviations.

    Returns nothing; all findings are printed.
    """
    print("🔍 Validating model output consistency...")

    def _report_detection_counts(timing_info, coco_results):
        # Tally detections by image id
        per_image = {}
        for det in coco_results:
            key = det['image_id']
            if key in per_image:
                per_image[key] += 1
            else:
                per_image[key] = 1

        if not per_image:
            return

        counts = list(per_image.values())
        print(f"      - Images processed: {len(timing_info)}")
        print(f"      - Images with detections: {len(per_image)}")
        print(f"      - Avg detections/image: {np.mean(counts):.1f}")
        print(f"      - Max detections/image: {max(counts)}")
        print(f"      - Min detections/image: {min(counts)}")

        # Images whose detection count comes close to the configured cap
        crowded = [img_id for img_id in per_image
                   if per_image[img_id] > CONFIG['max_detections'] * 0.8]
        if crowded:
            print(f"      - ⚠️  High detection count images: {len(crowded)}")

    def _report_timing(timing_info):
        if not timing_info:
            return

        times = [record['inference_time'] for record in timing_info]
        mean_time = np.mean(times)
        spread = np.std(times)

        print(f"      - Avg inference time: {mean_time:.3f}s (±{spread:.3f}s)")

        # Anything slower than mean + 2 sigma counts as an outlier
        cutoff = mean_time + 2 * spread
        slow = [value for value in times if value > cutoff]
        if slow:
            print(f"      - ⚠️  Timing outliers: {len(slow)} images")

    for model_name, results in model_results.items():
        print(f"  📊 {model_name}:")

        timing_info = results['timing_info']
        coco_results = results['coco_results']

        _report_detection_counts(timing_info, coco_results)
        _report_timing(timing_info)

def check_metrics_consistency():
    """Print plausibility checks for each model's entry in ``all_metrics``.

    Three checks are reported per model with a ✅ / ❌ marker:
      * mAP ordering: mAP@0.5 >= mAP@0.75 >= mAP@0.5:0.95;
      * detection counts: ``valid_detections`` must not exceed ``num_detections``;
      * FPS: the reported ``fps`` must be within 0.1 of ``1 / avg_inference_time``
        (0 is expected when the average time is not positive).

    Returns nothing; all findings are printed.
    """
    print("🔍 Checking metrics consistency...")

    for model_name in all_metrics:
        metrics = all_metrics[model_name]
        print(f"  📊 {model_name}:")

        # mAP ordering check
        ap50, ap75, ap_avg = metrics['mAP_50'], metrics['mAP_75'], metrics['mAP_50_95']
        ordering_mark = '✅' if (ap50 >= ap75 and ap75 >= ap_avg) else '❌'
        print(f"      - mAP hierarchy: {ordering_mark} ({ap50:.3f} >= {ap75:.3f} >= {ap_avg:.3f})")

        # Valid detections can never outnumber the total
        n_total = metrics['num_detections']
        n_valid = metrics['valid_detections']
        if n_valid > n_total:
            print(f"      - Detection counts: ❌ ({n_valid}/{n_total} - more valid than total!)")
        else:
            print(f"      - Detection counts: ✅ ({n_valid}/{n_total} valid)")

        # Reported FPS should be the reciprocal of the average inference time
        mean_time = metrics['avg_inference_time']
        reported_fps = metrics['fps']
        if mean_time > 0:
            derived_fps = 1.0 / mean_time
        else:
            derived_fps = 0

        # Allow small numerical differences
        fps_mark = '✅' if abs(reported_fps - derived_fps) < 0.1 else '❌'
        print(f"      - FPS consistency: {fps_mark} (calculated: {derived_fps:.1f}, reported: {reported_fps:.1f})")

def generate_validation_report():
    """Run the structural validators and write their results to a Markdown file.

    Calls ``validate_image_id_mapping``, ``validate_category_mapping``,
    ``validate_detection_format`` and ``validate_coco_format``, renders their
    returned counts/flags and writes the result to
    ``base_output_path / 'validation_report.md'`` as UTF-8 (the report contains
    emoji, which a non-UTF-8 platform default encoding cannot represent).

    The overall status is "passed" only when no ground-truth image lacks
    predictions, no prediction uses an unknown category, no bbox is invalid and
    all three COCO structure flags are true.

    Returns nothing; the location of the written file is printed.
    """
    print("📄 Generating validation report...")

    # Run all validations
    image_validation = validate_image_id_mapping()
    category_validation = validate_category_mapping()
    detection_validation = validate_detection_format()
    coco_validation = validate_coco_format()

    def _mark(flag):
        """Check mark for a passed check, cross otherwise."""
        return '✅' if flag else '❌'

    # Overall verdict, computed up front to keep the template readable
    all_passed = all([
        image_validation['missing_in_pred'] == 0,
        category_validation['invalid_categories'] == 0,
        detection_validation['invalid_bbox_count'] == 0,
        coco_validation['gt_structure_valid'],
        coco_validation['annotation_structure_valid'],
        coco_validation['category_structure_valid']
    ])
    if all_passed:
        overall_status = '✅ All validations passed'
    else:
        overall_status = '⚠️ Some validations failed - check details above'

    # Create validation report
    validation_report = f"""# 🔍 Validation Report

## Image ID Mapping
- Ground truth images: {image_validation['gt_image_ids']}
- Prediction images: {image_validation['pred_image_ids']}
- Common images: {image_validation['common_ids']}
- Missing in predictions: {image_validation['missing_in_pred']}
- Extra in predictions: {image_validation['extra_in_pred']}
- Files on disk: {image_validation['files_on_disk']}

## Category Mapping
- Ground truth categories: {category_validation['gt_categories']}
- Prediction categories: {category_validation['pred_categories']}
- Valid prediction categories: {category_validation['valid_pred_categories']}
- Invalid categories: {category_validation['invalid_categories']}

## Detection Format
- Total detections: {detection_validation['total_detections']}
- Valid detections: {detection_validation['valid_detections']}
- Invalid bbox format: {detection_validation['invalid_bbox_count']}
- Negative coordinates: {detection_validation['negative_coords_count']}

## COCO Format Compliance
- Ground truth structure: {_mark(coco_validation['gt_structure_valid'])}
- Annotation structure: {_mark(coco_validation['annotation_structure_valid'])}
- Category structure: {_mark(coco_validation['category_structure_valid'])}

## Overall Status
{overall_status}
"""

    # Save validation report; explicit UTF-8 because the content contains emoji
    report_path = base_output_path / 'validation_report.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(validation_report)

    print(f"📄 Validation report saved to: {report_path}")



Report generated: evaluation_results/experiment_report.md

✅ Evaluation complete! Results saved in 'evaluation_results' directory
📊 Evaluated 6 models
📁 Check the experiment_report.md for detailed results


In [None]:
# ============================================================================
# 🚀 EXECUTE VALIDATION BLOCK
# ============================================================================

print("\n" + "="*80)
print("🔍 STARTING COMPREHENSIVE VALIDATION")
print("="*80)

# generate_validation_report() itself runs validate_image_id_mapping(),
# validate_category_mapping(), validate_detection_format() and
# validate_coco_format(), so they are not called separately here: doing both
# repeated every check (and its console output) twice.
generate_validation_report()
print()

# Print-only diagnostics that are not part of the written report
validate_model_outputs()
print()
check_metrics_consistency()

print("\n" + "="*80)
print("✅ VALIDATION COMPLETED")
print("="*80)

In [None]:
# Package the results folder into a zip archive and render a download link.
from IPython.display import display, HTML
import os

# Work from /kaggle/working: the shell commands below use names relative to the
# current directory, and zip_path further down expects the archive there.
os.chdir('/kaggle/working')

zip_filename = f"{CONFIG['experiment_name']}.zip"
# NOTE(review): hard-coded folder name - confirm it matches base_output_path
# used by the report cells.
folder_to_zip = "evaluation_results"

print(f"📦 Creating archive: {zip_filename}")

# IPython shell escape: -r recurses into the folder, -q silences zip's own output
!zip -r -q {zip_filename} {folder_to_zip}/

zip_path = f'/kaggle/working/{zip_filename}'

# The archive's existence on disk is the success check
if os.path.exists(zip_path):
    file_size = os.path.getsize(zip_path) / (1024*1024)  # in MB
    print(f"\n✅ Archive created successfully!")
    print(f"📁 File: {zip_filename}")
    print(f"📊 Size: {file_size:.1f} MB")
    print(f"📍 Path: {zip_path}")

    # Show only the first 20 lines of the archive listing
    print(f"\n📋 Archive contents:")
    !zipinfo {zip_filename} | head -20

    # Render an HTML download link; the href is the bare archive file name
    display(HTML(f'''
    <div style="background-color: #e8f5e8; padding: 15px; border-radius: 10px; margin: 10px 0;">
        <h3>📥 Download Ready</h3>
        <a href="{zip_filename}" download style="background-color: #4CAF50; color: white; padding: 10px 20px; text-decoration: none; border-radius: 5px; font-weight: bold;">
            📥 Download {zip_filename} ({file_size:.1f} MB)
        </a>
    </div>
    '''))
else:
    # Archive missing: list the working directory to help diagnose why
    print("❌ Error: Archive not created!")
    print("📁 Checking working directory contents:")
    !ls -la /kaggle/working/