# YOLO Compression Benchmark on MOT17

Benchmarks **inference speed** and **detection quality** for three compression
strategies applied to YOLO person detectors, all evaluated on the same
MOT17-02-SDP pedestrian tracking sequence.

## Experiments
| # | Experiment | Description |
|---|------------|-------------|
| 1 | **Baseline** | YOLOv8 / YOLO11 / YOLO26 × nano/small/medium — raw comparison |
| 2 | **Quantization** | yolov8m: FP32 → ONNX FP32 → ONNX FP16 → TRT variants (Jetson) |
| 3 | **Structured pruning** | yolov8m: L1 channel pruning at 0 %, 30 %, 50 % sparsity |

## Metrics
| Metric | Description |
|--------|-------------|
| `mean_ms` | Mean inference time per image (ms) |
| `std_ms` | Standard deviation of inference time (ms) |
| `total_s` | Total benchmark wall time (s) |
| `f1` | F1 score @ IoU = 0.5 vs. MOT17 ground truth |

> **Platform note** — TRT experiments require CUDA (Jetson).
> ONNX FP16 uses `CUDAExecutionProvider` on Jetson (full fp16 speedup) and falls back
> to `CPUExecutionProvider` on CPU where ONNXRuntime may upcast to fp32 internally.


In [2]:
import time
import warnings
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import torch
from ultralytics import YOLO

warnings.filterwarnings("ignore")
%matplotlib inline
plt.rcParams.update({
    "figure.dpi": 120,
    "font.size": 11,
    "axes.spines.top": False,
    "axes.spines.right": False,
})

_dev_auto = "0" if torch.cuda.is_available() else "cpu"
print(f"PyTorch        : {torch.__version__}")
print(f"Auto device    : {_dev_auto}")
print(f"CUDA available : {torch.cuda.is_available()}")


PyTorch        : 2.10.0
Auto device    : mps
CUDA available : False


In [3]:
# ============================================================
# CONFIG — edit here to adapt to your setup
# ============================================================

# Single MOT17 sequence used for ALL experiments
SEQUENCE    = "data/MOT17/train/MOT17-02-SDP"

# Inference device: "cpu" | "0" (first CUDA GPU on Jetson)
DEVICE      = _dev_auto

# Warm-up frames discarded before timing starts
WARMUP      = 10

# Max frames per run (keeps notebook fast; set None for all 600)
FRAME_LIMIT = 200

# Common inference settings
IMGSZ       = 640
CONF        = 0.25

# IoU threshold used for GT matching (detection quality)
IOU_THRESH  = 0.5

# Nine baseline models (3 versions × 3 sizes)
MODELS = [
    "yolov8n.pt", "yolov8s.pt", "yolov8m.pt",
    "yolo11n.pt", "yolo11s.pt", "yolo11m.pt",
    "yolo26n.pt", "yolo26s.pt", "yolo26m.pt",
]

# Pruning sparsity ratios to evaluate
PRUNE_RATIOS = [0.0, 0.3, 0.5]


In [4]:
# ── Frame collection ──────────────────────────────────────────────────────

def collect_frames(sequence_dir: str, limit: int | None = None) -> list[str]:
    """Return sorted list of .jpg frame paths from a single MOT17 sequence."""
    pattern = str(Path(sequence_dir) / "img1" / "*.jpg")
    frames = sorted(glob(pattern))
    if not frames:
        raise FileNotFoundError(f"No frames found at {pattern!r}")
    if limit:
        frames = frames[:limit]
    return frames


# ── Ground-truth loading ──────────────────────────────────────────────────

def load_gt(sequence_dir: str) -> dict[int, list]:
    """
    Parse MOT17 gt.txt.
    Format: frame, track_id, x, y, w, h, conf, class_id, visibility
      conf == 1  → annotation is active in this frame
      class_id == 1  → pedestrian (the only class YOLO detects here)
    Returns {frame_id: [[x1,y1,x2,y2], ...]}
    """
    gt_path = Path(sequence_dir) / "gt" / "gt.txt"
    gt: dict[int, list] = {}
    with open(gt_path) as fh:
        for line in fh:
            p = line.strip().split(",")
            frame_id  = int(p[0])
            conf_flag = int(p[6])      # 1 = active
            class_id  = int(p[7])      # 1 = pedestrian
            if conf_flag != 1 or class_id != 1:
                continue
            x, y, w, h = float(p[2]), float(p[3]), float(p[4]), float(p[5])
            if w <= 0 or h <= 0:
                continue
            gt.setdefault(frame_id, []).append([x, y, x + w, y + h])
    return gt


# ── IoU & matching ────────────────────────────────────────────────────────

def _box_iou(b1: list, b2: list) -> float:
    """Axis-aligned IoU for two boxes in xyxy format."""
    xi1 = max(b1[0], b2[0]); yi1 = max(b1[1], b2[1])
    xi2 = min(b1[2], b2[2]); yi2 = min(b1[3], b2[3])
    inter = max(0.0, xi2 - xi1) * max(0.0, yi2 - yi1)
    a1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    a2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    union = a1 + a2 - inter
    return inter / union if union > 0 else 0.0


def _match(pred_boxes: np.ndarray, gt_boxes: list, iou_thresh: float) -> tuple[int, int, int]:
    """Greedy IoU matching → (TP, FP, FN)."""
    n_gt, n_pred = len(gt_boxes), len(pred_boxes)
    if n_gt == 0 and n_pred == 0:
        return 0, 0, 0
    if n_gt == 0:
        return 0, n_pred, 0
    if n_pred == 0:
        return 0, 0, n_gt

    matched_gt: set[int] = set()
    tp = 0
    for pred in pred_boxes:
        best_iou, best_j = iou_thresh, -1
        for j, gt in enumerate(gt_boxes):
            if j in matched_gt:
                continue
            iou = _box_iou(pred.tolist(), gt)
            if iou > best_iou:
                best_iou, best_j = iou, j
        if best_j >= 0:
            tp += 1
            matched_gt.add(best_j)

    return tp, n_pred - tp, n_gt - tp


# ── Core benchmark routine ────────────────────────────────────────────────

def run_benchmark(
    model: YOLO,
    frames: list[str],
    gt_by_frame: dict,
    imgsz: int = IMGSZ,
    conf: float = CONF,
    warmup: int = WARMUP,
) -> dict:
    """
    Warm-up then time frame-by-frame inference.
    Simultaneously accumulates TP/FP/FN against MOT17 ground truth.

    Returns a dict with keys:
        mean_ms, std_ms, total_s   – timing
        det_precision, det_recall, f1  – detection quality
        n_images                   – number of benchmarked frames
    """
    assert len(frames) > warmup, f"Need > {warmup} frames (got {len(frames)})."

    # Warm-up (not timed)
    for img in frames[:warmup]:
        model.predict(img, imgsz=imgsz, conf=conf, device=DEVICE, verbose=False)

    bench = frames[warmup:]
    times_ms: list[float] = []
    tp_total = fp_total = fn_total = 0

    for img_path in bench:
        frame_id = int(Path(img_path).stem)
        gt_boxes = gt_by_frame.get(frame_id, [])

        t0 = time.perf_counter()
        results = model.predict(img_path, imgsz=imgsz, conf=conf, device=DEVICE, verbose=False)
        times_ms.append((time.perf_counter() - t0) * 1e3)

        boxes_obj = results[0].boxes
        if boxes_obj is not None and len(boxes_obj):
            cls  = boxes_obj.cls.cpu().numpy()
            xyxy = boxes_obj.xyxy.cpu().numpy()
            pred_boxes = xyxy[cls == 0]          # COCO class 0 = person
        else:
            pred_boxes = np.zeros((0, 4))

        tp, fp, fn = _match(pred_boxes, gt_boxes, IOU_THRESH)
        tp_total += tp; fp_total += fp; fn_total += fn

    mean_ms = float(np.mean(times_ms))
    std_ms  = float(np.std(times_ms))
    total_s = sum(times_ms) / 1000.0

    denom_p = tp_total + fp_total
    denom_r = tp_total + fn_total
    det_prec = tp_total / denom_p if denom_p > 0 else 0.0
    det_rec  = tp_total / denom_r if denom_r > 0 else 0.0
    f1 = (2 * det_prec * det_rec / (det_prec + det_rec)
          if (det_prec + det_rec) > 0 else 0.0)

    return dict(
        mean_ms=round(mean_ms, 3),
        std_ms=round(std_ms, 3),
        total_s=round(total_s, 3),
        det_precision=round(det_prec, 4),
        det_recall=round(det_rec, 4),
        f1=round(f1, 4),
        n_images=len(bench),
    )


---
## Data

All experiments use the same 200-frame window from **MOT17-02-SDP**
(600 frames total; capped at `FRAME_LIMIT` for speed).
Ground-truth boxes are filtered to active pedestrian annotations (`conf=1, class=1`).


In [5]:
frames      = collect_frames(SEQUENCE, limit=FRAME_LIMIT)
gt_by_frame = load_gt(SEQUENCE)

n_bench = len(frames) - WARMUP
gt_frames_in_bench = {fid: boxes for fid, boxes in gt_by_frame.items()
                      if fid <= int(Path(frames[-1]).stem)}
n_gt_boxes = sum(len(v) for v in gt_frames_in_bench.values())

print(f"Sequence   : {Path(SEQUENCE).name}")
print(f"Frames     : {len(frames)} total  |  {WARMUP} warm-up  |  {n_bench} benchmarked")
print(f"GT boxes   : {n_gt_boxes} pedestrian boxes across "
      f"{len(gt_frames_in_bench)} frames")


Sequence   : MOT17-02-SDP
Frames     : 200 total  |  10 warm-up  |  190 benchmarked
GT boxes   : 5354 pedestrian boxes across 200 frames


---
## Experiment 1 — Baseline Model Comparison

All nine models are evaluated with default settings (`imgsz=640`, `conf=0.25`).

| Version | nano | small | medium |
|---------|------|-------|--------|
| YOLOv8  | yolov8n | yolov8s | yolov8m |
| YOLO11  | yolo11n | yolo11s | yolo11m |
| YOLO26  | yolo26n | yolo26s | yolo26m |


In [None]:
rows_baseline = []

for model_name in MODELS:
    print(f"  {model_name:<14}", end=" ", flush=True)
    try:
        yolo = YOLO(model_name)
        params_m = sum(p.numel() for p in yolo.model.parameters()) / 1e6
        m = run_benchmark(yolo, frames, gt_by_frame)
        rows_baseline.append({"model": model_name, "params_M": round(params_m, 2), **m})
        print(f"{m['mean_ms']:>6.1f} ± {m['std_ms']:>4.1f} ms   "
              f"F1={m['f1']:.3f}   {params_m:.1f}M params")
    except Exception as exc:
        print(f"SKIP: {exc}")
    finally:
        try:
            del yolo
        except NameError:
            pass
        gc.collect()
        if torch.cuda.is_available(): torch.cuda.empty_cache()

df_baseline = pd.DataFrame(rows_baseline)
Path("results").mkdir(exist_ok=True)
if df_baseline.empty:
    print("\n[ERROR] No results — all models failed. Check DEVICE and re-run.")
else:
    df_baseline.to_csv("results/baseline_nb.csv", index=False)
    display(df_baseline[["model","params_M","mean_ms","std_ms","total_s","det_precision","det_recall","f1"]])


In [None]:
if df_baseline.empty:
    print("No baseline results to plot.")
else:
    _PALETTE = {"yolov8": "#4C72B0", "yolo11": "#DD8452", "yolo26": "#55A868"}
    
    def _bar_color(model_name: str) -> str:
        for key, col in _PALETTE.items():
            if key in model_name:
                return col
        return "#888"
    
    labels = [m.replace(".pt", "") for m in df_baseline["model"]]
    bar_cols = [_bar_color(m) for m in df_baseline["model"]]
    legend_patches = [mpatches.Patch(color=c, label=v) for v, c in _PALETTE.items()]
    
    fig, axes = plt.subplots(1, 3, figsize=(17, 5))
    fig.suptitle("Experiment 1 — Baseline Model Comparison", fontsize=14, fontweight="bold")
    
    # ── Speed ──────────────────────────────────────────────────────────────────
    ax = axes[0]
    ax.bar(labels, df_baseline["mean_ms"], color=bar_cols,
           yerr=df_baseline["std_ms"], capsize=4, alpha=0.85, ecolor="#444")
    ax.set_ylabel("Mean inference time (ms / img)")
    ax.set_title("Inference Speed")
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.legend(handles=legend_patches, fontsize=9)
    
    # ── F1 score ───────────────────────────────────────────────────────────────
    ax = axes[1]
    ax.bar(labels, df_baseline["f1"], color=bar_cols, alpha=0.85)
    ax.set_ylabel("F1 @ IoU = 0.5")
    ax.set_title("Detection Quality")
    ax.set_ylim(0, 1)
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.legend(handles=legend_patches, fontsize=9)
    
    # ── Speed–accuracy scatter ─────────────────────────────────────────────────
    ax = axes[2]
    for version, color in _PALETTE.items():
        mask = df_baseline["model"].str.contains(version)
        sub  = df_baseline[mask]
        ax.scatter(sub["mean_ms"], sub["f1"], c=color, s=90, label=version, zorder=3)
        for _, r in sub.iterrows():
            size_tag = r["model"].replace(".pt", "").replace(version, "")[-1]  # n/s/m
            ax.annotate(f" {size_tag}", (r["mean_ms"], r["f1"]),
                        fontsize=8, va="center")
    ax.set_xlabel("Mean inference time (ms / img)")
    ax.set_ylabel("F1 @ IoU = 0.5")
    ax.set_title("Speed–Accuracy Trade-off")
    ax.legend(fontsize=9)
    
    plt.tight_layout()
    plt.savefig("results/baseline_comparison.png", dpi=150, bbox_inches="tight")
    plt.show()
    print("Saved → results/baseline_comparison.png")


In [None]:
if df_baseline.empty:
    print("No baseline results to plot.")
else:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.bar(labels, df_baseline["params_M"], color=bar_cols, alpha=0.85)
    ax.set_ylabel("Parameters (M)")
    ax.set_title("Model Size (Parameter Count)")
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.legend(handles=legend_patches, fontsize=9)
    plt.tight_layout()
    plt.savefig("results/baseline_params.png", dpi=150, bbox_inches="tight")
    plt.show()


---
## Experiment 2 — Quantization

Tests **yolov8m** at progressively reduced numerical precision.

| Format | Runtime | Available |
|--------|---------|-----------|
| `fp32` | PyTorch | ✓ always |
| `onnx_fp32` | ONNXRuntime | ✓ CPU + Jetson |
| `onnx_fp16` | ONNXRuntime | ✓ CPU (may upcast) + Jetson CUDA (full fp16) |
| `trt_fp16` | TensorRT | ✗ Jetson only |
| `trt_int8` | TensorRT INT8 | ✗ Jetson only |

F1 is computed for all YOLO-loaded formats. Raw ONNX-only paths skip F1.


In [None]:
import gc

rows_quant = []
QUANT_MODEL = "yolov8m.pt"

# ── 1. FP32 PyTorch baseline ───────────────────────────────────────────────
print("[fp32]", flush=True)
yolo_fp32 = YOLO(QUANT_MODEL)
m = run_benchmark(yolo_fp32, frames, gt_by_frame)
rows_quant.append({"format": "fp32", **m})
print(f"  {m['mean_ms']:.1f} ± {m['std_ms']:.1f} ms   F1={m['f1']:.3f}")
del yolo_fp32; gc.collect()
if torch.cuda.is_available(): torch.cuda.empty_cache()

# ── 2. ONNX FP32 (measures ONNXRuntime overhead vs PyTorch) ───────────────
print("\n[onnx_fp32]", flush=True)
_onnx_fp32 = Path("results/quant_fp32.onnx")
try:
    if not _onnx_fp32.exists():
        YOLO(QUANT_MODEL).export(format="onnx", half=False,
                                  imgsz=IMGSZ, dynamic=False, verbose=False)
        Path(QUANT_MODEL.replace(".pt", ".onnx")).rename(_onnx_fp32)
    yolo_onnx_fp32 = YOLO(str(_onnx_fp32))
    m = run_benchmark(yolo_onnx_fp32, frames, gt_by_frame)
    rows_quant.append({"format": "onnx_fp32", **m})
    print(f"  {m['mean_ms']:.1f} ± {m['std_ms']:.1f} ms   F1={m['f1']:.3f}")
    del yolo_onnx_fp32
except Exception as exc:
    print(f"  SKIP: {exc}")
finally:
    gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

# ── 3. ONNX FP16 (half-precision weights; ort CPU may upcast internally) ──
print("\n[onnx_fp16]", flush=True)
_onnx_fp16 = Path("results/quant_fp16.onnx")
try:
    if not _onnx_fp16.exists():
        YOLO(QUANT_MODEL).export(format="onnx", half=True,
                                  imgsz=IMGSZ, dynamic=False, verbose=False)
        Path(QUANT_MODEL.replace(".pt", ".onnx")).rename(_onnx_fp16)
    yolo_onnx_fp16 = YOLO(str(_onnx_fp16))
    m = run_benchmark(yolo_onnx_fp16, frames, gt_by_frame)
    rows_quant.append({"format": "onnx_fp16", **m})
    print(f"  {m['mean_ms']:.1f} ± {m['std_ms']:.1f} ms   F1={m['f1']:.3f}")
    del yolo_onnx_fp16
except Exception as exc:
    print(f"  SKIP: {exc}")
finally:
    gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

# ── 4. TRT variants (CUDA / Jetson only) ──────────────────────────────────
if torch.cuda.is_available():
    for label, half, int8 in [("trt_fp16", True, False), ("trt_int8", False, True)]:
        print(f"\n[{label}]", flush=True)
        _eng = Path(f"results/quant_{label}.engine")
        try:
            if not _eng.exists():
                YOLO(QUANT_MODEL).export(format="engine", imgsz=IMGSZ,
                                         half=half, int8=int8,
                                         data="config/mot17.yaml", verbose=False)
                Path(QUANT_MODEL.replace(".pt", ".engine")).rename(_eng)
            yolo_trt = YOLO(str(_eng))
            m = run_benchmark(yolo_trt, frames, gt_by_frame)
            rows_quant.append({"format": label, **m})
            print(f"  {m['mean_ms']:.1f} ± {m['std_ms']:.1f} ms   F1={m['f1']:.3f}")
            del yolo_trt
        except Exception as exc:
            print(f"  SKIP: {exc}")
        finally:
            gc.collect()
            torch.cuda.empty_cache()
else:
    print("\n[INFO] No CUDA → trt_fp16 and trt_int8 skipped.")
    print("       Run on Jetson with DEVICE='0' to include TRT results.")

df_quant = pd.DataFrame(rows_quant)
if df_quant.empty:
    print("\n[ERROR] No quantization results recorded.")
else:
    df_quant.to_csv("results/quantized_nb.csv", index=False)
    display(df_quant[["format","mean_ms","std_ms","total_s","f1"]])


In [None]:
if df_quant.empty:
    print("No quantization results to plot.")
else:
    _q_colors = {
        "fp32":       "#4C72B0",
        "onnx_fp32":  "#6fa8dc",
        "onnx_fp16":  "#4db6ac",
        "trt_fp16":   "#ff7043",
        "trt_int8":   "#e53935",
    }
    
    present = df_quant["format"].tolist()
    q_cols  = [_q_colors.get(f, "#aaa") for f in present]
    
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle("Experiment 2 — Quantization (yolov8m)", fontsize=14, fontweight="bold")
    
    # ── Speed ──────────────────────────────────────────────────────────────────
    ax = axes[0]
    ax.bar(present, df_quant["mean_ms"], color=q_cols, alpha=0.85,
           yerr=df_quant["std_ms"], capsize=5, ecolor="#444")
    ax.set_ylabel("Mean inference time (ms / img)")
    ax.set_title("Inference Speed")
    ax.set_xticklabels(present, rotation=30, ha="right")
    # Annotate speedup relative to fp32
    fp32_ms = df_quant.loc[df_quant["format"] == "fp32", "mean_ms"].values
    if len(fp32_ms):
        for i, (_, row) in enumerate(df_quant.iterrows()):
            ratio = fp32_ms[0] / row["mean_ms"] if row["mean_ms"] > 0 else 1
            ax.text(i, row["mean_ms"] + df_quant["std_ms"].max() * 0.1,
                    f"{ratio:.2f}×", ha="center", va="bottom", fontsize=9)
    
    # ── F1 ─────────────────────────────────────────────────────────────────────
    ax = axes[1]
    f1_vals = df_quant["f1"].fillna(0)
    ax.bar(present, f1_vals, color=q_cols, alpha=0.85)
    ax.set_ylabel("F1 @ IoU = 0.5")
    ax.set_title("Detection Quality")
    ax.set_ylim(0, 1)
    ax.set_xticklabels(present, rotation=30, ha="right")
    ax.axhline(float(df_quant.loc[df_quant["format"] == "fp32", "f1"].values[0]),
               color="red", linestyle="--", linewidth=1, label="FP32 baseline")
    ax.legend(fontsize=9)
    
    plt.tight_layout()
    plt.savefig("results/quantization_comparison.png", dpi=150, bbox_inches="tight")
    plt.show()
    print("Saved → results/quantization_comparison.png")


---
## Experiment 3 — Structured Pruning

Applies **L1-norm channel pruning** (`torch-pruning` library) to yolov8m at
three sparsity ratios.  No fine-tuning is performed — this isolates the
raw effect of pruning on both speed and accuracy.

| Ratio | Effect |
|-------|--------|
| 0 % | Baseline — no pruning |
| 30 % | 30 % of channels removed |
| 50 % | 50 % of channels removed |


In [None]:
import gc

try:
    import torch_pruning as tp
    _tp_available = True
except ImportError:
    print("torch-pruning not installed — pip install torch-pruning")
    _tp_available = False

PRUNE_MODEL = "yolov8m.pt"


def _apply_pruning(torch_model: torch.nn.Module, ratio: float) -> torch.nn.Module:
    """L1 structured channel pruning at the given sparsity ratio."""
    example_input = torch.randn(1, 3, IMGSZ, IMGSZ)

    # Protect the detection head layers from being pruned
    ignored = []
    for m in torch_model.modules():
        if isinstance(m, torch.nn.Linear):
            ignored.append(m)
        if hasattr(m, "cv3") or hasattr(m, "dfl"):
            ignored.append(m)

    pruner = tp.pruner.MagnitudePruner(
        torch_model,
        example_input,
        importance=tp.importance.MagnitudeImportance(p=1),
        iterative_steps=1,
        pruning_ratio=ratio,
        ignored_layers=ignored,
    )
    pruner.step()
    return torch_model


rows_pruning = []

if _tp_available:
    for ratio in PRUNE_RATIOS:
        label = f"{int(ratio * 100)}%"
        print(f"[ratio={label}]", end=" ", flush=True)
        try:
            yolo_p = YOLO(PRUNE_MODEL)
            params_before = sum(p.numel() for p in yolo_p.model.parameters()) / 1e6

            if ratio > 0.0:
                yolo_p.model = _apply_pruning(yolo_p.model, ratio)

            params_after = sum(p.numel() for p in yolo_p.model.parameters()) / 1e6
            m = run_benchmark(yolo_p, frames, gt_by_frame)

            rows_pruning.append({
                "ratio":           ratio,
                "label":           label,
                "params_before_M": round(params_before, 2),
                "params_after_M":  round(params_after, 2),
                **m,
            })
            print(f"{m['mean_ms']:.1f} ± {m['std_ms']:.1f} ms   "
                  f"F1={m['f1']:.3f}   "
                  f"{params_before:.1f}M → {params_after:.1f}M params")
        except Exception as exc:
            print(f"SKIP: {exc}")
        finally:
            try:
                del yolo_p
            except NameError:
                pass
            gc.collect()
            if torch.cuda.is_available(): torch.cuda.empty_cache()
else:
    print("Pruning skipped.")

df_pruning = pd.DataFrame(rows_pruning) if rows_pruning else pd.DataFrame()
if not df_pruning.empty:
    df_pruning.to_csv("results/pruning_nb.csv", index=False)
    display(df_pruning[["label","params_before_M","params_after_M",
                         "mean_ms","std_ms","total_s","f1"]])


In [None]:
if not df_pruning.empty:
    _p_colors = ["#4C72B0", "#DD8452", "#C44E52"]
    labels_p  = df_pruning["label"].tolist()
    p_cols    = _p_colors[:len(labels_p)]

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    fig.suptitle("Experiment 3 — Structured Pruning (yolov8m)", fontsize=14, fontweight="bold")

    # ── Speed ────────────────────────────────────────────────────────────────
    ax = axes[0]
    ax.bar(labels_p, df_pruning["mean_ms"], color=p_cols, alpha=0.85,
           yerr=df_pruning["std_ms"], capsize=5, ecolor="#444")
    ax.set_ylabel("Mean inference time (ms / img)")
    ax.set_title("Inference Speed")
    # Annotate speedup vs baseline (0 %)
    base_ms = df_pruning["mean_ms"].iloc[0]
    for i, row in df_pruning.iterrows():
        ratio_ = base_ms / row["mean_ms"] if row["mean_ms"] > 0 else 1
        ax.text(i, row["mean_ms"] + df_pruning["std_ms"].max() * 0.1,
                f"{ratio_:.2f}×", ha="center", va="bottom", fontsize=9)

    # ── F1 ───────────────────────────────────────────────────────────────────
    ax = axes[1]
    ax.bar(labels_p, df_pruning["f1"], color=p_cols, alpha=0.85)
    ax.set_ylabel("F1 @ IoU = 0.5")
    ax.set_title("Detection Quality")
    ax.set_ylim(0, 1)
    ax.axhline(float(df_pruning["f1"].iloc[0]),
               color="red", linestyle="--", linewidth=1, label="Unpruned baseline")
    ax.legend(fontsize=9)

    # ── Params ───────────────────────────────────────────────────────────────
    ax = axes[2]
    x = range(len(labels_p))
    w = 0.35
    ax.bar([xi - w / 2 for xi in x], df_pruning["params_before_M"],
           width=w, alpha=0.6, label="Before pruning", color="#4C72B0")
    ax.bar([xi + w / 2 for xi in x], df_pruning["params_after_M"],
           width=w, alpha=0.85, label="After pruning", color="#DD8452")
    ax.set_xticks(list(x)); ax.set_xticklabels(labels_p)
    ax.set_ylabel("Parameters (M)")
    ax.set_title("Parameter Reduction")
    ax.legend(fontsize=9)

    plt.tight_layout()
    plt.savefig("results/pruning_comparison.png", dpi=150, bbox_inches="tight")
    plt.show()
    print("Saved → results/pruning_comparison.png")
else:
    print("No pruning results to plot.")


---
## Summary

The table below merges all three experiments into a single view ordered by
`mean_ms` (fastest first).  Use it to compare the speed–accuracy frontier
across model families, quantization formats, and pruning levels.


In [None]:
summary_rows = []

# Baseline models
for _, r in df_baseline.iterrows():
    summary_rows.append({
        "experiment": "baseline",
        "label":      r["model"].replace(".pt", ""),
        "params_M":   r["params_M"],
        "mean_ms":    r["mean_ms"],
        "std_ms":     r["std_ms"],
        "total_s":    r["total_s"],
        "f1":         r["f1"],
    })

# Quantization
for _, r in df_quant.iterrows():
    summary_rows.append({
        "experiment": "quantization",
        "label":      f"yolov8m [{r['format']}]",
        "params_M":   None,
        "mean_ms":    r["mean_ms"],
        "std_ms":     r["std_ms"],
        "total_s":    r["total_s"],
        "f1":         r.get("f1", None),
    })

# Pruning
if not df_pruning.empty:
    for _, r in df_pruning.iterrows():
        summary_rows.append({
            "experiment": "pruning",
            "label":      f"yolov8m pruned {r['label']}",
            "params_M":   r["params_after_M"],
            "mean_ms":    r["mean_ms"],
            "std_ms":     r["std_ms"],
            "total_s":    r["total_s"],
            "f1":         r["f1"],
        })

df_summary = pd.DataFrame(summary_rows).sort_values("mean_ms").reset_index(drop=True)
df_summary.to_csv("results/summary_nb.csv", index=False)
df_summary.style.background_gradient(subset=["mean_ms"], cmap="RdYlGn_r") \
               .background_gradient(subset=["f1"],      cmap="RdYlGn") \
               .format(precision=3)


In [None]:
_exp_colors = {
    "baseline":     "#4C72B0",
    "quantization": "#DD8452",
    "pruning":      "#55A868",
}

fig, ax = plt.subplots(figsize=(11, 6))
fig.suptitle("Speed–Accuracy Frontier — All Experiments", fontsize=14, fontweight="bold")

for exp, grp in df_summary.groupby("experiment"):
    valid = grp.dropna(subset=["f1"])
    ax.scatter(valid["mean_ms"], valid["f1"],
               c=_exp_colors.get(exp, "#888"), s=90, label=exp,
               alpha=0.85, zorder=3)
    for _, r in valid.iterrows():
        ax.annotate(f"  {r['label']}", (r["mean_ms"], r["f1"]),
                    fontsize=7, va="center", alpha=0.8)

ax.set_xlabel("Mean inference time (ms / img)", fontsize=12)
ax.set_ylabel("F1 @ IoU = 0.5", fontsize=12)
ax.set_ylim(0, 1)
ax.legend(title="Experiment", fontsize=10)
ax.grid(True, linestyle="--", alpha=0.4)

plt.tight_layout()
plt.savefig("results/summary_frontier.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved → results/summary_frontier.png")
