In [None]:
%%bash
# Environment setup for the YOLO11 export / ONNX quantization benchmarks below.
#
# Abort on the first failing command. Without this, the cell's exit status is
# that of the last command only, so a failed pinned install earlier in the cell
# would go unnoticed and surface later as an obscure import/version error.
set -e

# Best-effort removal of the spaCy stack; `|| true` keeps the cell going when
# some of these packages are not installed.
# NOTE(review): presumably removed to avoid conflicts with the numpy pin below — confirm.
pip uninstall -y -q spacy thinc catalogue cymem preshed murmurhash srsly blis \
                    pathy typer wasabi pydantic || true
pip install -q --force-reinstall "numpy==1.26.4"

# Pinned ONNX toolchain used for export, simplification, FP16 conversion and
# static quantization.
pip install -q \
  onnx==1.17.0 \
  onnxruntime==1.22.0 \
  onnxslim==0.1.53 \
  onnxconverter-common==1.14.0

# NOTE(review): these are unpinned, so results may drift between runs — consider pinning.
pip install -q ultralytics pillow requests pandas tabulate

In [None]:
from ultralytics import YOLO
from pathlib import Path

# Model size tags for which a `yolo11<tag>.pt` checkpoint and a 640x640
# `yolo11<tag>.onnx` export should exist in the working directory.
PT_MODELS_TO_ENSURE_DOWNLOADED = ["n", "s", "m", "l", "x"]

# Tags whose ONNX file is still missing once the loop has finished. Collected so
# that a failed export is reported at the end instead of being buried in the
# per-model log lines above an unconditional "done" message.
missing_onnx_tags = []

for tag in PT_MODELS_TO_ENSURE_DOWNLOADED:
    pt_filename = f"yolo11{tag}.pt"
    if not Path(pt_filename).exists():
        print(f"Downloading {pt_filename}...")
        # Constructing YOLO with a missing checkpoint name is relied on here to
        # fetch the weights; the returned model object itself is not needed.
        _ = YOLO(pt_filename)

    onnx_640_path = Path(f"yolo11{tag}.onnx")
    if not onnx_640_path.exists() and Path(pt_filename).exists():
        print(f"Exporting {pt_filename} to {onnx_640_path} (640x640)...")
        try:
            YOLO(pt_filename).export(format="onnx", imgsz=640, simplify=True, verbose=False)
        except Exception as e:
            # Best-effort: one failed export must not stop the remaining sizes.
            print(f"Error exporting {pt_filename} to 640x640 ONNX: {e}")

    # Check the file itself rather than trusting the export call: the later
    # benchmark cells look the model up by this exact path.
    if not onnx_640_path.exists():
        missing_onnx_tags.append(tag)

if missing_onnx_tags:
    missing_files = ", ".join(f"yolo11{t}.onnx" for t in missing_onnx_tags)
    print(f"WARNING: ONNX export missing for: {missing_files}")
print("Base PyTorch models checked/downloaded.")

In [None]:
import os, io, time, random, requests, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image
import onnx, onnxruntime as ort
from onnxruntime.quantization import (
    quantize_static, CalibrationDataReader,
    QuantType, QuantFormat
)
from tabulate import tabulate
import torch
from ultralytics import YOLO

# Size tag of the single model benchmarked in this cell.
TARGET_MODEL_TAG_640 = "n"

INPUT_SIZE_640 = 640
picsum_url_640 = "https://picsum.photos/seed/{}/640/640.jpg"
calibration_images_640 = []
num_calib_images_640 = 128
# Hard cap on download attempts. Without it the loop below never terminates when
# the network or the image service is unavailable, because every failure is
# swallowed and the length condition can never become true.
max_download_attempts_640 = num_calib_images_640 * 5
download_attempts_640 = 0
last_download_error_640 = None
print(f"Collecting images ({INPUT_SIZE_640}x{INPUT_SIZE_640})...")
while (len(calibration_images_640) < num_calib_images_640
       and download_attempts_640 < max_download_attempts_640):
    download_attempts_640 += 1
    try:
        response_640 = requests.get(picsum_url_640.format(random.randint(0, 99999)), timeout=8)
        # Fail fast on HTTP errors instead of trying to decode an error page as an image.
        response_640.raise_for_status()
        img_bytes = response_640.content
        img_pil = Image.open(io.BytesIO(img_bytes)).convert("RGB").resize((INPUT_SIZE_640, INPUT_SIZE_640), Image.LANCZOS)
        # HWC uint8 -> CHW float32 scaled to [0, 1], then a leading batch axis.
        img_arr = np.asarray(img_pil, np.float32).transpose(2, 0, 1) / 255.0
        calibration_images_640.append(img_arr[np.newaxis])
    except Exception as e:
        # Best-effort: skip this image, but remember why so it can be reported.
        last_download_error_640 = e

if len(calibration_images_640) < num_calib_images_640:
    print(
        f"  WARNING: collected only {len(calibration_images_640)}/{num_calib_images_640} images "
        f"after {download_attempts_640} attempts (last error: {last_download_error_640!r})"
    )

class CalibrationPicsReader(CalibrationDataReader):
    """Calibration data reader that feeds a list of arrays under one input name.

    Each call to ``get_next`` returns ``{input_name: array_copy}`` for the next
    array in ``calib_data_list``, or ``None`` once the list is exhausted.
    ``rewind`` restarts iteration from the first array.

    Copies are produced lazily, one per ``get_next`` call, so the reader never
    holds a second full copy of the calibration set in memory.

    Args:
        input_name_str: Name of the model input the arrays are fed to.
        calib_data_list: Sequence of arrays exposing ``.copy()``; it is stored
            by reference and only read.
    """

    def __init__(self, input_name_str: str, calib_data_list: list):
        self.input_name = input_name_str
        self.data_list = calib_data_list
        self.rewind()  # single place that builds the iterator

    def get_next(self):
        """Return the next feed dict, or ``None`` when all data was consumed."""
        return next(self.data_iter, None)

    def rewind(self):
        """Restart iteration from the first calibration array."""
        # Generator expression: each array is copied only when it is requested.
        self.data_iter = ({self.input_name: data.copy()} for data in self.data_list)

def measure_latency_onnx(ort_session, input_feed_dict, runs=100):
    """Return the mean wall-clock latency, in milliseconds, of one session run.

    One untimed warm-up call is made first, then ``ort_session.run(None,
    input_feed_dict)`` is called ``runs`` times back to back and the elapsed
    time is averaged.

    Args:
        ort_session: Object exposing ``run(output_names, input_feed)``.
        input_feed_dict: Feed dictionary passed unchanged to every call.
        runs: Number of timed calls; must be at least 1.

    Returns:
        Mean time per call in milliseconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    # Reject this up front: zero would divide by zero after the warm-up, and a
    # negative count would silently yield a meaningless (non-positive) latency.
    if runs < 1:
        raise ValueError(f"runs must be a positive integer, got {runs!r}")

    ort_session.run(None, input_feed_dict)  # warm-up, excluded from timing
    t_start = time.perf_counter()
    for _ in range(runs):
        ort_session.run(None, input_feed_dict)
    return (time.perf_counter() - t_start) * 1000 / runs

@torch.no_grad()
def measure_latency_pytorch(pt_model, torch_input_tensor, runs=100):
    """Return the mean wall-clock latency, in milliseconds, of one forward call.

    Runs with gradient tracking disabled. One untimed warm-up call is made
    first, then ``pt_model(torch_input_tensor)`` is called ``runs`` times and
    the elapsed time is averaged.

    Args:
        pt_model: Callable invoked with ``torch_input_tensor``.
        torch_input_tensor: Input passed unchanged to every call.
        runs: Number of timed calls; must be at least 1.

    Returns:
        Mean time per call in milliseconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    # Reject this up front: zero would divide by zero after the warm-up, and a
    # negative count would silently yield a meaningless (non-positive) latency.
    if runs < 1:
        raise ValueError(f"runs must be a positive integer, got {runs!r}")

    pt_model(torch_input_tensor)  # warm-up, excluded from timing
    t_start = time.perf_counter()
    for _ in range(runs):
        pt_model(torch_input_tensor)
    return (time.perf_counter() - t_start) * 1000 / runs

# Benchmark one model (TARGET_MODEL_TAG_640) at 640x640 on CPU: PyTorch FP32,
# ONNX FP32 and a statically quantized ONNX INT8 variant. Every stage is
# best-effort; a stage that fails leaves its latency as NaN in the final table.
benchmark_table_data_640 = []
if not calibration_images_640:
    raise ValueError("640x640 data collection failed.")
# The first calibration image doubles as the timing input for all three backends.
onnx_sample_input_np_640 = calibration_images_640[0].copy()
pytorch_sample_input_tensor_640 = torch.from_numpy(onnx_sample_input_np_640).float().cpu()

pt_filename_640 = f"yolo11{TARGET_MODEL_TAG_640}.pt"

onnx_fp32_filename_640 = f"yolo11{TARGET_MODEL_TAG_640}.onnx"
onnx_int8_filename_640 = f"yolo11{TARGET_MODEL_TAG_640}_int8_640.onnx"

print(f"\nProcessing 640x640: yolo11{TARGET_MODEL_TAG_640}")
latency_pt_fp32_640 = float('nan')
latency_onnx_fp32_640 = float('nan')
latency_onnx_int8_640 = float('nan')
onnx_input_name_str_640 = None

# --- PyTorch FP32 ---
if Path(pt_filename_640).exists():
    try:
        loaded_pytorch_model_640 = YOLO(pt_filename_640).model.cpu().eval()
        latency_pt_fp32_640 = measure_latency_pytorch(loaded_pytorch_model_640, pytorch_sample_input_tensor_640)
    except Exception as e:
        print(f"  Error PT {pt_filename_640}: {e}")
else:
    print(f"  {pt_filename_640} not found.")

# --- ONNX FP32 (also provides the input name needed for calibration) ---
if Path(onnx_fp32_filename_640).exists():
    try:
        session_onnx_fp32_640 = ort.InferenceSession(onnx_fp32_filename_640, providers=["CPUExecutionProvider"])
        onnx_input_name_str_640 = session_onnx_fp32_640.get_inputs()[0].name
        latency_onnx_fp32_640 = measure_latency_onnx(session_onnx_fp32_640, {onnx_input_name_str_640: onnx_sample_input_np_640})
    except Exception as e:
        print(f"  Error ONNX FP32 {onnx_fp32_filename_640}: {e}")
else:
    print(f"  {onnx_fp32_filename_640} (640x640) not found. Ensure it was exported in Cell 2.")

# --- ONNX INT8: only attempted when the FP32 baseline loaded and ran ---
if onnx_input_name_str_640 and not np.isnan(latency_onnx_fp32_640):
    # An existing INT8 file is treated as a cache and reused as-is.
    if not Path(onnx_int8_filename_640).exists():
        try:
            # The reader copies each array when it hands it out, so the list is
            # passed directly; deep-copying the whole set first only duplicated
            # every calibration image in memory.
            calib_reader_int8_640 = CalibrationPicsReader(onnx_input_name_str_640, calibration_images_640)
            quantize_static(
                model_input=onnx_fp32_filename_640, model_output=onnx_int8_filename_640,
                calibration_data_reader=calib_reader_int8_640,
                quant_format=QuantFormat.QOperator, activation_type=QuantType.QUInt8,
                weight_type=QuantType.QInt8, op_types_to_quantize=["Conv","MatMul","Add","Mul"]
            )
        except Exception as e:
            print(f"  Error quantizing {onnx_int8_filename_640}: {e}")
    if Path(onnx_int8_filename_640).exists():
        try:
            session_onnx_int8_640 = ort.InferenceSession(onnx_int8_filename_640, providers=["CPUExecutionProvider"])
            latency_onnx_int8_640 = measure_latency_onnx(session_onnx_int8_640, {onnx_input_name_str_640: onnx_sample_input_np_640})
        except Exception as e:
            print(f"  Error benchmarking ONNX INT8 {onnx_int8_filename_640}: {e}")
    else:
        print(f"  {onnx_int8_filename_640} not created.")
else:
    # Make the skip visible; otherwise the INT8 column is NaN with no explanation.
    print(f"  Skipping INT8 for yolo11{TARGET_MODEL_TAG_640}: ONNX FP32 baseline unavailable.")

benchmark_table_data_640.append({
    'Model': f"11{TARGET_MODEL_TAG_640} (640)", 'PT FP32': latency_pt_fp32_640,
    'ONNX FP32': latency_onnx_fp32_640, 'ONNX INT8': latency_onnx_int8_640,
})

df_results_640 = pd.DataFrame(benchmark_table_data_640)
print("\nLatency summary (640x640 CPU, ms/image):")
print(tabulate(df_results_640, headers='keys', tablefmt='github', floatfmt=".2f", showindex=False))

In [None]:
import os, io, time, random, requests, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image

import onnx, onnxruntime as ort
from onnxconverter_common.float16 import convert_float_to_float16
from onnxruntime.quantization import (
    quantize_static, CalibrationDataReader,
    QuantType, QuantFormat
)
from tabulate import tabulate

import torch
from ultralytics import YOLO

picsum = "https://picsum.photos/seed/{}/640/640.jpg"
calib = []
calib_target = 300
# Hard cap on download attempts. Without it the loop never terminates when the
# network or the image service is unavailable, because every failure is
# swallowed and the length condition can never become true.
calib_max_attempts = calib_target * 5
calib_attempts = 0
calib_last_error = None
while len(calib) < calib_target and calib_attempts < calib_max_attempts:
    calib_attempts += 1
    try:
        calib_response = requests.get(picsum.format(random.randint(0, 99999)), timeout=8)
        # Fail fast on HTTP errors instead of trying to decode an error page as an image.
        calib_response.raise_for_status()
        img_b = calib_response.content
        img   = Image.open(io.BytesIO(img_b)).convert("RGB")
        img_resized = img.resize((640, 640), Image.LANCZOS)
        # HWC uint8 -> CHW float32 scaled to [0, 1], then a leading batch axis.
        arr   = np.asarray(img_resized, np.float32).transpose(2, 0, 1) / 255.0
        calib.append(arr[np.newaxis])
        if len(calib) % 50 == 0:
            print(f"   ...collected {len(calib)}")
    except Exception as e:
        # Best-effort: skip this image, but remember why so it can be reported.
        calib_last_error = e

if len(calib) < calib_target:
    print(
        f"   WARNING: collected only {len(calib)}/{calib_target} calibration images "
        f"after {calib_attempts} attempts (last error: {calib_last_error!r})"
    )

class Pics(CalibrationDataReader):
    """Calibration data reader that serves a fixed sequence under one input name.

    ``get_next`` returns ``{name: item}`` for each item in order and ``None``
    once all items were served. ``rewind`` restarts from the first item, so the
    same reader can be consumed more than once.

    Args:
        name: Name of the model input the items are fed to.
        data: Iterable of calibration items; it is materialized into a private
            list, so the caller's container is never modified.
    """

    def __init__(self, name, data):
        self.name, self.data = name, list(data)
        # Read cursor. Advancing an index keeps the data intact for rewind()
        # and avoids the O(n) shift that popping from the front of a list costs.
        self._pos = 0

    def get_next(self):
        """Return the next feed dict, or ``None`` when the data is exhausted."""
        if self._pos >= len(self.data):
            return None
        item = self.data[self._pos]
        self._pos += 1
        return {self.name: item}

    def rewind(self):
        """Restart iteration from the first item."""
        self._pos = 0

def latency_onnx(sess, inp_name, sample_np, runs=100):
    """Return the mean wall-clock latency, in milliseconds, of one session run.

    One untimed warm-up call is made first, then
    ``sess.run(None, {inp_name: sample_np})`` is called ``runs`` times and the
    elapsed time is averaged.

    Args:
        sess: Object exposing ``run(output_names, input_feed)``.
        inp_name: Key under which ``sample_np`` is fed.
        sample_np: Input value passed unchanged to every call.
        runs: Number of timed calls; must be at least 1.

    Returns:
        Mean time per call in milliseconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    # Reject this up front: zero would divide by zero after the warm-up, and a
    # negative count would silently yield a meaningless (non-positive) latency.
    if runs < 1:
        raise ValueError(f"runs must be a positive integer, got {runs!r}")

    feed = {inp_name: sample_np}
    sess.run(None, feed)  # warm-up, excluded from timing
    t0 = time.perf_counter()
    for _ in range(runs):
        sess.run(None, feed)
    return (time.perf_counter() - t0) / runs * 1000

@torch.no_grad()
def latency_pytorch(model, sample_tensor, runs=100):
    """Return the mean wall-clock latency, in milliseconds, of one forward call.

    Runs with gradient tracking disabled. One untimed warm-up call is made
    first, then ``model(sample_tensor)`` is called ``runs`` times and the
    elapsed time is averaged.

    Args:
        model: Callable invoked with ``sample_tensor``.
        sample_tensor: Input passed unchanged to every call.
        runs: Number of timed calls; must be at least 1.

    Returns:
        Mean time per call in milliseconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    # Reject this up front: zero would divide by zero after the warm-up, and a
    # negative count would silently yield a meaningless (non-positive) latency.
    if runs < 1:
        raise ValueError(f"runs must be a positive integer, got {runs!r}")

    model(sample_tensor)  # warm-up, excluded from timing
    t0 = time.perf_counter()
    for _ in range(runs):
        model(sample_tensor)
    return (time.perf_counter() - t0) / runs * 1000

# Benchmark every YOLO11 size on CPU: PyTorch FP32, ONNX FP32, ONNX FP16 and a
# statically quantized ONNX INT8 variant. Each stage is best-effort; a stage that
# fails leaves its latency as NaN so the remaining models are still measured.
sizes  = ["n", "s", "m", "l", "x"]
table  = []

if not calib:
    raise ValueError("Calibration data is empty. Cannot proceed with benchmarking.")
# The first calibration image doubles as the timing input for all backends.
sample_np_onnx = calib[0]
sample_torch_tensor = torch.from_numpy(sample_np_onnx).float().cpu()


for tag in sizes:
    pt_model_filename = f"yolo11{tag}.pt"
    onnx_fp32_filename = f"yolo11{tag}.onnx"
    onnx_fp16_filename = f"yolo11{tag}_fp16.onnx"
    onnx_int8_filename = f"yolo11{tag}_int8.onnx"
    print(f"\n── yolo11{tag} ──────────────────────────")

    ms_pt_fp32 = float('nan')
    ms_onnx_fp32 = float('nan')
    ms_onnx_fp16 = float('nan')
    ms_onnx_int8 = float('nan')
    # Reset per model: otherwise a failed FP32 load would leave this name either
    # undefined (first model) or silently carried over from the previous model.
    onnx_input_name = None

    # --- PyTorch FP32 ---
    if Path(pt_model_filename).exists():
        try:
            pytorch_model = YOLO(pt_model_filename).model.cpu().eval()
            ms_pt_fp32 = latency_pytorch(pytorch_model, sample_torch_tensor)
            print(f"  PT FP32 latency: {ms_pt_fp32:.2f} ms")
        except Exception as e:
            print(f"  Error benchmarking PyTorch model {pt_model_filename}: {e}")
    else:
        print(f"  {pt_model_filename} not found. Skipping PyTorch benchmark.")


    if not Path(onnx_fp32_filename).exists():
        print(f"  {onnx_fp32_filename} not found. Skipping ONNX benchmarks for yolo11{tag}.")
        table.append(dict(model=f"11{tag}",
                          pt_fp32=ms_pt_fp32,
                          onnx_fp32=ms_onnx_fp32, onnx_fp16=ms_onnx_fp16, onnx_int8=ms_onnx_int8,
                          f16x=float('nan'), i8x=float('nan')))
        continue

    # --- ONNX FP32 (also provides the input name used by the later stages) ---
    try:
        sess_fp = ort.InferenceSession(onnx_fp32_filename, providers=["CPUExecutionProvider"])
        onnx_input_name = sess_fp.get_inputs()[0].name
        ms_onnx_fp32 = latency_onnx(sess_fp, onnx_input_name, sample_np_onnx)
        print(f"  ONNX FP32 latency: {ms_onnx_fp32:.2f} ms")
    except Exception as e:
        print(f"  Error loading/benchmarking ONNX FP32 model {onnx_fp32_filename}: {e}")

    if onnx_input_name is None:
        # FP16 and INT8 both need the FP32 model's input name; without it they
        # could only fail with a misleading error, so record NaNs and move on.
        print(f"  Skipping ONNX FP16/INT8 for yolo11{tag}: FP32 model could not be loaded.")
        table.append(dict(model=f"11{tag}",
                          pt_fp32=ms_pt_fp32,
                          onnx_fp32=ms_onnx_fp32, onnx_fp16=ms_onnx_fp16, onnx_int8=ms_onnx_int8,
                          f16x=float('nan'), i8x=float('nan')))
        continue

    # --- ONNX FP16 (converted from the FP32 file; an existing file is reused) ---
    try:
        if not Path(onnx_fp16_filename).exists():
            print(f"  Creating {onnx_fp16_filename}...")
            model_fp32_loaded = onnx.load(onnx_fp32_filename)
            # keep_io_types=True is passed so the float32 sample can still be fed.
            model_fp16 = convert_float_to_float16(model_fp32_loaded, keep_io_types=True)
            onnx.save(model_fp16, onnx_fp16_filename)
        sess_f16 = ort.InferenceSession(onnx_fp16_filename, providers=["CPUExecutionProvider"])
        ms_onnx_fp16 = latency_onnx(sess_f16, onnx_input_name, sample_np_onnx) # Use same input name
        print(f"  ONNX FP16 latency: {ms_onnx_fp16:.2f} ms")
    except Exception as e:
        print(f"  Error creating/benchmarking ONNX FP16 model {onnx_fp16_filename}: {e}")

    # --- ONNX INT8 (static quantization; an existing file is reused) ---
    try:
        if not Path(onnx_int8_filename).exists():
            print(f"  Quantizing to {onnx_int8_filename}...")
            quantize_static(
                model_input=onnx_fp32_filename,
                model_output=onnx_int8_filename,
                # Pics keeps its own list of the items, so `calib` itself is never
                # modified. Passing it directly avoids deep-copying all calibration
                # arrays once per model (even when the INT8 file already existed).
                calibration_data_reader=Pics(onnx_input_name, calib),
                activation_type=QuantType.QUInt8,
                weight_type=QuantType.QInt8,
                quant_format=QuantFormat.QOperator,
                op_types_to_quantize=["Conv", "MatMul", "Add", "Mul"]
            )
        sess_i8 = ort.InferenceSession(onnx_int8_filename, providers=["CPUExecutionProvider"])
        ms_onnx_int8 = latency_onnx(sess_i8, onnx_input_name, sample_np_onnx)
        print(f"  ONNX INT8 latency: {ms_onnx_int8:.2f} ms")
    except Exception as e:
        print(f"  Error quantizing/benchmarking ONNX INT8 model {onnx_int8_filename}: {e}")

    # Speed-ups are relative to ONNX FP32. A NaN latency fails the `> 0` test and
    # therefore yields NaN instead of raising.
    table.append(dict(model=f"11{tag}",
                      pt_fp32=ms_pt_fp32,
                      onnx_fp32=ms_onnx_fp32,
                      onnx_fp16=ms_onnx_fp16,
                      onnx_int8=ms_onnx_int8,
                      f16x=ms_onnx_fp32/ms_onnx_fp16 if ms_onnx_fp16 > 0 else float('nan'),
                      i8x=ms_onnx_fp32/ms_onnx_int8 if ms_onnx_int8 > 0 else float('nan')))

df = pd.DataFrame(table)
print("\n🕒  Latency summary (CPU, ms / image)")
# Rename the columns and let tabulate take the headers from the frame. Passing a
# dict as `headers` for a DataFrame does not map column -> title: the dict is
# consumed as a plain sequence, and with fewer entries than columns the header
# row ends up shifted and the two speed-up columns unlabeled.
column_titles = {"model": "Model",
                 "pt_fp32": "PT FP32",
                 "onnx_fp32": "ONNX FP32",
                 "onnx_fp16": "ONNX FP16",
                 "onnx_int8": "ONNX INT8",
                 "f16x": "FP16 speed-up (x)",
                 "i8x": "INT8 speed-up (x)"}
print(tabulate(df.rename(columns=column_titles),
               headers="keys",
               floatfmt=".2f", showindex=False, tablefmt="github"))

📸  Collecting calibration images (target: 300)...
   ...collected 50
   ...collected 100
   ...collected 150
   ...collected 200
   ...collected 250
   ...collected 300
📸  Collected 300 calibration images.

── yolo11n ──────────────────────────
  PT FP32 latency: 269.71 ms
  yolo11n.onnx not found. Skipping ONNX benchmarks for yolo11n.

── yolo11s ──────────────────────────
  PT FP32 latency: 623.90 ms
  ONNX FP32 latency: 514.79 ms
  ONNX FP16 latency: 646.26 ms
  ONNX INT8 latency: 343.08 ms

── yolo11m ──────────────────────────
  PT FP32 latency: 1611.84 ms
  ONNX FP32 latency: 1443.63 ms
  ONNX FP16 latency: 1733.25 ms




  Quantizing to yolo11m_int8.onnx...




  ONNX INT8 latency: 900.38 ms

── yolo11l ──────────────────────────
  PT FP32 latency: 2038.66 ms
  ONNX FP32 latency: 1938.46 ms
  Creating yolo11l_fp16.onnx...


KeyboardInterrupt: 