In [21]:
# === Install ultralytics (YOLOv11) if not already installed ===
# Uncomment the following line if you need to install it:
# !pip install ultralytics pycocotools

import os
import time
from pathlib import Path
from typing import List, Dict, Any, Optional

import torch
import numpy as np
from PIL import Image

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from pycocotools import mask as maskUtils

from ultralytics import YOLO

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: NVIDIA GeForce RTX 3060 Ti


In [22]:
# === Configuration ===

# Dataset paths (after running extract_uvo_frames.py)
DATA_ROOT = Path(r"C:\Users\nickg\Workspace\v-CLR-CNN-Based\datasets")

# UVO validation set (converted to COCO format by extract_uvo_frames.py)
# Run `python extract_uvo_frames.py` first to generate these files!
UVO_VAL_JSON = DATA_ROOT / "uvo_val_coco.json"
UVO_FRAMES_DIR = DATA_ROOT / "uvo_frames"

# YOLOv11 model variant (options: yolo11n, yolo11s, yolo11m, yolo11l, yolo11x)
# For segmentation, use: yolo11n-seg, yolo11s-seg, yolo11m-seg, yolo11l-seg, yolo11x-seg
YOLO_MODEL = "yolo11s-seg.pt"  # Small model with segmentation

# Inference settings
IMG_SIZE = 640  # YOLOv11 default input size
CONF_THRESHOLD = 0.001  # Low threshold to get more detections for AR computation
IOU_THRESHOLD = 0.7  # NMS IoU threshold
MAX_DET = 300  # Maximum detections per image

# Batch size for inference
BATCH_SIZE = 1  # Process one image at a time for accurate latency measurement

print(f"Dataset root: {DATA_ROOT}")
print(f"UVO annotations: {UVO_VAL_JSON}")
print(f"UVO frames: {UVO_FRAMES_DIR}")
print(f"YOLO model: {YOLO_MODEL}")
print("\nIMPORTANT: Run 'python extract_uvo_frames.py' first if you haven't already!")

Dataset root: C:\Users\nickg\Workspace\v-CLR-CNN-Based\datasets
UVO annotations: C:\Users\nickg\Workspace\v-CLR-CNN-Based\datasets\uvo_val_coco.json
UVO frames: C:\Users\nickg\Workspace\v-CLR-CNN-Based\datasets\uvo_frames
YOLO model: yolo11s-seg.pt

IMPORTANT: Run 'python extract_uvo_frames.py' first if you haven't already!


In [23]:
# === Verify dataset paths exist ===

if not UVO_VAL_JSON.exists():
    print(f"ERROR: Annotation file not found: {UVO_VAL_JSON}")
    print("\nPlease run the frame extraction script first:")
    print("    python extract_uvo_frames.py")
    raise FileNotFoundError(f"Run 'python extract_uvo_frames.py' to generate {UVO_VAL_JSON}")

if not UVO_FRAMES_DIR.exists():
    print(f"ERROR: Frames directory not found: {UVO_FRAMES_DIR}")
    print("\nPlease run the frame extraction script first:")
    print("    python extract_uvo_frames.py")
    raise FileNotFoundError(f"Run 'python extract_uvo_frames.py' to extract frames to {UVO_FRAMES_DIR}")

# Load COCO-format annotations
coco_uvo = COCO(str(UVO_VAL_JSON))
val_img_ids = sorted(coco_uvo.getImgIds())

print(f"Number of validation images: {len(val_img_ids)}")
print(f"Number of categories: {len(coco_uvo.getCatIds())}")

# Show some sample images
sample_imgs = coco_uvo.loadImgs(val_img_ids[:3])
print("\nSample images:")
for img in sample_imgs:
    print(f"  - {img['file_name']} ({img['width']}x{img['height']})")

loading annotations into memory...
Done (t=0.91s)
creating index...
index created!
Number of validation images: 22950
Number of categories: 1

Sample images:
  - -3zcsBnDVzU/180.png (854x480)
  - -3zcsBnDVzU/181.png (854x480)
  - -3zcsBnDVzU/182.png (854x480)
Done (t=0.91s)
creating index...
index created!
Number of validation images: 22950
Number of categories: 1

Sample images:
  - -3zcsBnDVzU/180.png (854x480)
  - -3zcsBnDVzU/181.png (854x480)
  - -3zcsBnDVzU/182.png (854x480)


In [11]:
# === Load YOLOv11 model ===

print(f"Loading YOLOv11 model: {YOLO_MODEL}")
model = YOLO(YOLO_MODEL)

# Move to GPU if available
if device == "cuda":
    model.to(device)

print(f"Model loaded successfully")
print(f"Model task: {model.task}")
print(f"Number of classes: {len(model.names)}")
print(f"Class names (first 10): {list(model.names.values())[:10]}")

Loading YOLOv11 model: yolo11s-seg.pt
Model loaded successfully
Model task: segment
Number of classes: 80
Class names (first 10): ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light']
Model loaded successfully
Model task: segment
Number of classes: 80
Class names (first 10): ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light']


In [12]:
# === Helper function to get image path ===

def get_image_path(coco: COCO, img_id: int, img_root: Path) -> Path:
    """
    Get the full path to an image given its COCO image ID.
    Handles .png <-> .jpg fallback if needed.
    """
    img_info = coco.loadImgs([img_id])[0]
    rel_path = Path(img_info["file_name"])
    path = img_root / rel_path
    
    if path.is_file():
        return path
    
    # Try alternative extension
    if path.suffix.lower() == ".png":
        alt = path.with_suffix(".jpg")
    elif path.suffix.lower() == ".jpg":
        alt = path.with_suffix(".png")
    else:
        alt = None
    
    if alt is not None and alt.is_file():
        return alt
    
    raise FileNotFoundError(
        f"Image not found for img_id={img_id}. Tried {path}"
        + (f" and {alt}" if alt is not None else "")
    )

In [13]:
# === Run YOLOv11 inference on the validation set ===

def run_yolo_inference(
    model: YOLO,
    coco: COCO,
    img_root: Path,
    img_ids: List[int],
    conf_thresh: float = 0.001,
    iou_thresh: float = 0.7,
    max_det: int = 300,
    img_size: int = 640,
) -> tuple:
    """
    Run YOLO inference on all images and collect detections.
    Returns: (bbox_detections, segm_detections, latency_stats)
    """
    dets_bbox = []
    dets_segm = []
    latencies = []
    
    print(f"Running inference on {len(img_ids)} images...")
    
    for idx, img_id in enumerate(img_ids):
        if (idx + 1) % 100 == 0:
            print(f"  Processed {idx + 1}/{len(img_ids)} images...")
        
        try:
            img_path = get_image_path(coco, img_id, img_root)
        except FileNotFoundError as e:
            print(f"  Warning: {e}")
            continue
        
        # Load image to get original size
        img = Image.open(img_path)
        orig_w, orig_h = img.size
        
        # Run inference with timing
        if device == "cuda":
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        
        results = model.predict(
            source=str(img_path),
            conf=conf_thresh,
            iou=iou_thresh,
            max_det=max_det,
            imgsz=img_size,
            verbose=False,
            device=device,
        )
        
        if device == "cuda":
            torch.cuda.synchronize()
        end_time = time.perf_counter()
        
        latencies.append((end_time - start_time) * 1000)  # Convert to ms
        
        # Process results
        result = results[0]  # Single image
        
        if result.boxes is None or len(result.boxes) == 0:
            continue
        
        boxes = result.boxes.xyxy.cpu().numpy()  # [N, 4] in xyxy format
        scores = result.boxes.conf.cpu().numpy()  # [N]
        
        # Get masks if available (for segmentation models)
        has_masks = hasattr(result, 'masks') and result.masks is not None
        if has_masks:
            masks = result.masks.data.cpu().numpy()  # [N, H, W]
        
        for k in range(len(boxes)):
            x1, y1, x2, y2 = boxes[k]
            w = x2 - x1
            h = y2 - y1
            score = float(scores[k])
            
            if w <= 0 or h <= 0:
                continue
            
            # Bounding box detection (COCO format: [x, y, w, h])
            dets_bbox.append({
                "image_id": int(img_id),
                "category_id": 1,  # All detections mapped to category 1 for class-agnostic AR
                "bbox": [float(x1), float(y1), float(w), float(h)],
                "score": score,
            })
            
            # Segmentation detection
            if has_masks:
                # Convert mask to RLE
                mask_k = masks[k]
                # Resize mask to original image size if needed
                if mask_k.shape != (orig_h, orig_w):
                    from PIL import Image as PILImage
                    mask_pil = PILImage.fromarray((mask_k * 255).astype(np.uint8))
                    mask_pil = mask_pil.resize((orig_w, orig_h), PILImage.NEAREST)
                    mask_k = np.array(mask_pil) / 255.0
                
                # Convert to binary and encode as RLE
                mask_binary = (mask_k > 0.5).astype(np.uint8)
                mask_fortran = np.asfortranarray(mask_binary)
                rle = maskUtils.encode(mask_fortran)
                if isinstance(rle["counts"], bytes):
                    rle["counts"] = rle["counts"].decode("ascii")
                
                dets_segm.append({
                    "image_id": int(img_id),
                    "category_id": 1,
                    "segmentation": rle,
                    "score": score,
                })
            else:
                # Use box as proxy mask (rectangular RLE)
                poly = [float(x1), float(y1), float(x2), float(y1),
                        float(x2), float(y2), float(x1), float(y2)]
                rle = maskUtils.frPyObjects([poly], int(orig_h), int(orig_w))[0]
                if isinstance(rle["counts"], bytes):
                    rle["counts"] = rle["counts"].decode("ascii")
                
                dets_segm.append({
                    "image_id": int(img_id),
                    "category_id": 1,
                    "segmentation": rle,
                    "score": score,
                })
    
    # Compute latency statistics
    latency_stats = {
        "mean_ms": np.mean(latencies),
        "std_ms": np.std(latencies),
        "median_ms": np.median(latencies),
        "min_ms": np.min(latencies),
        "max_ms": np.max(latencies),
        "total_images": len(latencies),
        "fps": 1000.0 / np.mean(latencies) if np.mean(latencies) > 0 else 0,
    }
    
    print(f"\nInference complete!")
    print(f"  Total detections (bbox): {len(dets_bbox)}")
    print(f"  Total detections (segm): {len(dets_segm)}")
    print(f"  Average latency: {latency_stats['mean_ms']:.2f} ms ({latency_stats['fps']:.1f} FPS)")
    
    return dets_bbox, dets_segm, latency_stats

In [14]:
# === Run inference ===

dets_bbox, dets_segm, latency_stats = run_yolo_inference(
    model=model,
    coco=coco_uvo,
    img_root=UVO_FRAMES_DIR,
    img_ids=val_img_ids,
    conf_thresh=CONF_THRESHOLD,
    iou_thresh=IOU_THRESHOLD,
    max_det=MAX_DET,
    img_size=IMG_SIZE,
)

Running inference on 22950 images...
  Processed 100/22950 images...
  Processed 100/22950 images...
  Processed 200/22950 images...
  Processed 200/22950 images...
  Processed 300/22950 images...
  Processed 300/22950 images...
  Processed 400/22950 images...
  Processed 400/22950 images...
  Processed 500/22950 images...
  Processed 500/22950 images...
  Processed 600/22950 images...
  Processed 600/22950 images...
  Processed 700/22950 images...
  Processed 700/22950 images...
  Processed 800/22950 images...
  Processed 800/22950 images...
  Processed 900/22950 images...
  Processed 900/22950 images...
  Processed 1000/22950 images...
  Processed 1000/22950 images...
  Processed 1100/22950 images...
  Processed 1100/22950 images...
  Processed 1200/22950 images...
  Processed 1200/22950 images...
  Processed 1300/22950 images...
  Processed 1300/22950 images...
  Processed 1400/22950 images...
  Processed 1400/22950 images...
  Processed 1500/22950 images...
  Processed 1500/22950 i

In [24]:
# === COCO Evaluation ===

def evaluate_detections(coco_gt: COCO, dets_bbox: List[Dict], dets_segm: List[Dict]):
    """
    Evaluate detections using COCO metrics.
    Returns: dictionary with AR/AP metrics for both bbox and segm.
    """
    results = {}
    
    if len(dets_bbox) == 0:
        print("[WARN] No bounding box detections to evaluate!")
        return None
    
    # Bounding box evaluation
    print("\n" + "="*60)
    print("BOUNDING BOX EVALUATION")
    print("="*60)
    
    coco_dt_box = coco_gt.loadRes(dets_bbox)
    coco_eval_box = COCOeval(coco_gt, coco_dt_box, iouType="bbox")
    coco_eval_box.evaluate()
    coco_eval_box.accumulate()
    coco_eval_box.summarize()
    stats_box = coco_eval_box.stats
    
    results["APb"] = stats_box[0]      # AP @ IoU=0.50:0.95
    results["APb_50"] = stats_box[1]   # AP @ IoU=0.50
    results["APb_75"] = stats_box[2]   # AP @ IoU=0.75
    results["ARb_1"] = stats_box[6]    # AR @ 1 det
    results["ARb_10"] = stats_box[7]   # AR @ 10 dets
    results["ARb_100"] = stats_box[8]  # AR @ 100 dets
    
    # Segmentation evaluation
    if len(dets_segm) > 0:
        print("\n" + "="*60)
        print("SEGMENTATION EVALUATION")
        print("="*60)
        
        coco_dt_segm = coco_gt.loadRes(dets_segm)
        coco_eval_segm = COCOeval(coco_gt, coco_dt_segm, iouType="segm")
        coco_eval_segm.evaluate()
        coco_eval_segm.accumulate()
        coco_eval_segm.summarize()
        stats_segm = coco_eval_segm.stats
        
        results["APm"] = stats_segm[0]      # AP @ IoU=0.50:0.95
        results["APm_50"] = stats_segm[1]   # AP @ IoU=0.50
        results["APm_75"] = stats_segm[2]   # AP @ IoU=0.75
        results["ARm_1"] = stats_segm[6]    # AR @ 1 det
        results["ARm_10"] = stats_segm[7]   # AR @ 10 dets
        results["ARm_100"] = stats_segm[8]  # AR @ 100 dets
    
    return results


# Run evaluation
eval_results = evaluate_detections(coco_uvo, dets_bbox, dets_segm)


BOUNDING BOX EVALUATION
Loading and preparing results...
DONE (t=5.75s)
creating index...
DONE (t=5.75s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=333.68s).
Accumulating evaluation results...
DONE (t=333.68s).
Accumulating evaluation results...
DONE (t=21.57s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.265
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.382
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.276
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.053
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.189
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.432
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.083
 Average Recall     (AR) @[ IoU=0.5

In [25]:
# === Print Table-1-style results (matching v-CLR paper format) ===

print("\n" + "="*80)
print("FINAL RESULTS - Table-1-style (Non-VOC UVO Evaluation)")
print("="*80)

if eval_results is not None:
    # Convert to percentages
    ARb10 = eval_results.get("ARb_10", 0) * 100.0
    ARb100 = eval_results.get("ARb_100", 0) * 100.0
    ARm10 = eval_results.get("ARm_10", 0) * 100.0
    ARm100 = eval_results.get("ARm_100", 0) * 100.0
    
    APb = eval_results.get("APb", 0) * 100.0
    APm = eval_results.get("APm", 0) * 100.0
    
    print(f"\nModel: {YOLO_MODEL}")
    print(f"Image size: {IMG_SIZE}")
    print(f"Confidence threshold: {CONF_THRESHOLD}")
    print(f"\n{'Metric':<15} {'Value':>10}")
    print("-" * 25)
    print(f"{'AR^b_10':<15} {ARb10:>10.1f}")
    print(f"{'AR^b_100':<15} {ARb100:>10.1f}")
    print(f"{'AR^m_10':<15} {ARm10:>10.1f}")
    print(f"{'AR^m_100':<15} {ARm100:>10.1f}")
    print("-" * 25)
    print(f"{'AP^b':<15} {APb:>10.1f}")
    print(f"{'AP^m':<15} {APm:>10.1f}")
    
    print(f"\n{'='*40}")
    print("LATENCY STATISTICS")
    print(f"{'='*40}")
    print(f"{'Mean latency:':<20} {latency_stats['mean_ms']:>10.2f} ms")
    print(f"{'Std latency:':<20} {latency_stats['std_ms']:>10.2f} ms")
    print(f"{'Median latency:':<20} {latency_stats['median_ms']:>10.2f} ms")
    print(f"{'Min latency:':<20} {latency_stats['min_ms']:>10.2f} ms")
    print(f"{'Max latency:':<20} {latency_stats['max_ms']:>10.2f} ms")
    print(f"{'Throughput:':<20} {latency_stats['fps']:>10.1f} FPS")
    print(f"{'Total images:':<20} {latency_stats['total_images']:>10d}")
    
    # One-liner summary (matching v-CLR paper format)
    print(f"\n{'='*80}")
    print("One-liner summary:")
    print(
        f"YOLOv11 ({YOLO_MODEL}) | "
        f"AR^b_10 = {ARb10:.1f}  "
        f"AR^b_100 = {ARb100:.1f}  "
        f"AR^m_10 = {ARm10:.1f}  "
        f"AR^m_100 = {ARm100:.1f}  "
        f"Latency = {latency_stats['mean_ms']:.1f}ms"
    )
else:
    print("No evaluation results available.")


FINAL RESULTS - Table-1-style (Non-VOC UVO Evaluation)

Model: yolo11s-seg.pt
Image size: 640
Confidence threshold: 0.001

Metric               Value
-------------------------
AR^b_10               30.6
AR^b_100              51.0
AR^m_10               21.1
AR^m_100              31.6
-------------------------
AP^b                  26.5
AP^m                  17.1

LATENCY STATISTICS
Mean latency:            146.83 ms
Std latency:             117.14 ms
Median latency:          115.81 ms
Min latency:              23.05 ms
Max latency:            2142.77 ms
Throughput:                 6.8 FPS
Total images:             22950

One-liner summary:
YOLOv11 (yolo11s-seg.pt) | AR^b_10 = 30.6  AR^b_100 = 51.0  AR^m_10 = 21.1  AR^m_100 = 31.6  Latency = 146.8ms


In [None]:
# === Optional: Visualize some detections ===

import matplotlib.pyplot as plt
import matplotlib.patches as patches

def visualize_detections(model: YOLO, coco: COCO, img_root: Path, img_ids: List[int], 
                         num_images: int = 5, conf_thresh: float = 0.25):
    """
    Visualize YOLO detections on a few sample images.
    """
    fig, axes = plt.subplots(1, num_images, figsize=(4 * num_images, 4))
    if num_images == 1:
        axes = [axes]
    
    sample_ids = img_ids[:num_images]
    
    for ax, img_id in zip(axes, sample_ids):
        try:
            img_path = get_image_path(coco, img_id, img_root)
            
            # Run inference
            results = model.predict(
                source=str(img_path),
                conf=conf_thresh,
                verbose=False,
                device=device,
            )
            
            # Plot using ultralytics built-in visualization
            result = results[0]
            img_with_boxes = result.plot()
            
            # Convert BGR to RGB for matplotlib
            img_rgb = img_with_boxes[:, :, ::-1]
            
            ax.imshow(img_rgb)
            ax.set_title(f"Image ID: {img_id}")
            ax.axis("off")
            
        except Exception as e:
            ax.text(0.5, 0.5, f"Error: {e}", ha="center", va="center")
            ax.axis("off")
    
    plt.tight_layout()
    plt.show()


# Visualize a few samples
print("Visualizing sample detections...")
visualize_detections(model, coco_uvo, UVO_FRAMES_DIR, val_img_ids, num_images=5, conf_thresh=0.25)

In [None]:
# === Save results to file ===

import json
from datetime import datetime

results_summary = {
    "model": YOLO_MODEL,
    "dataset": "UVO Non-VOC Validation",
    "timestamp": datetime.now().isoformat(),
    "config": {
        "img_size": IMG_SIZE,
        "conf_threshold": CONF_THRESHOLD,
        "iou_threshold": IOU_THRESHOLD,
        "max_det": MAX_DET,
        "device": device,
    },
    "metrics": eval_results,
    "latency": latency_stats,
    "num_images": len(val_img_ids),
    "num_detections_bbox": len(dets_bbox),
    "num_detections_segm": len(dets_segm),
}

output_file = f"yolov11_baseline_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(output_file, "w") as f:
    json.dump(results_summary, f, indent=2)

print(f"Results saved to: {output_file}")

In [None]:
# === Benchmark different YOLO model variants (optional) ===

# Uncomment to run benchmarks across different model sizes

# YOLO_VARIANTS = [
#     "yolo11n-seg.pt",  # Nano
#     "yolo11s-seg.pt",  # Small
#     "yolo11m-seg.pt",  # Medium
#     "yolo11l-seg.pt",  # Large
#     "yolo11x-seg.pt",  # Extra-large
# ]
# 
# all_results = []
# 
# for variant in YOLO_VARIANTS:
#     print(f"\n{'='*60}")
#     print(f"Evaluating: {variant}")
#     print(f"{'='*60}")
#     
#     model_variant = YOLO(variant)
#     if device == "cuda":
#         model_variant.to(device)
#     
#     dets_b, dets_s, lat = run_yolo_inference(
#         model=model_variant,
#         coco=coco_nonvoc,
#         img_root=NONVOC_IMG_DIR,
#         img_ids=val_img_ids,
#         conf_thresh=CONF_THRESHOLD,
#         iou_thresh=IOU_THRESHOLD,
#         max_det=MAX_DET,
#         img_size=IMG_SIZE,
#     )
#     
#     results_v = evaluate_detections(coco_nonvoc, dets_b, dets_s)
#     
#     all_results.append({
#         "model": variant,
#         "metrics": results_v,
#         "latency": lat,
#     })
# 
# # Print comparison table
# print("\n" + "="*100)
# print("MODEL COMPARISON")
# print("="*100)
# print(f"{'Model':<20} {'AR^b_10':>10} {'AR^b_100':>10} {'AR^m_10':>10} {'AR^m_100':>10} {'Latency (ms)':>12} {'FPS':>8}")
# print("-"*100)
# for r in all_results:
#     if r["metrics"] is not None:
#         print(
#             f"{r['model']:<20} "
#             f"{r['metrics']['ARb_10']*100:>10.1f} "
#             f"{r['metrics']['ARb_100']*100:>10.1f} "
#             f"{r['metrics']['ARm_10']*100:>10.1f} "
#             f"{r['metrics']['ARm_100']*100:>10.1f} "
#             f"{r['latency']['mean_ms']:>12.1f} "
#             f"{r['latency']['fps']:>8.1f}"
#         )