<a href="https://colab.research.google.com/github/SANS-Surya-o/YOLOV5-optimization/blob/main/YOLO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision torchinfo # import torch
!git clone https://github.com/ultralytics/yolov5
!cd yolov5 && pip install -r requirements.txt
%cd yolov5

Cloning into 'yolov5'...
remote: Enumerating objects: 17372, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 17372 (delta 42), reused 20 (delta 20), pack-reused 17313 (from 3)[K
Receiving objects: 100% (17372/17372), 16.25 MiB | 17.28 MiB/s, done.
Resolving deltas: 100% (11910/11910), done.
/content/yolov5/yolov5


In [None]:
# import packages
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from PIL import Image
from pathlib import Path
from tqdm.notebook import tqdm
import cv2
import yaml
import subprocess
import re
from torchinfo import summary

# Add YOLOv5 to path
yolov5_path = 'yolov5'
sys.path.append(yolov5_path)


from yolov5.models.common import DetectMultiBackend
from yolov5.utils.torch_utils import select_device
from yolov5.utils.general import non_max_suppression, scale_boxes
from yolov5.utils.augmentations import letterbox

In [None]:
# COCO Dataset
def download_coco_subset():
    """Download a subset of COCO validation images"""
    os.makedirs('coco_subset', exist_ok=True)

    # Download COCO val2017 zip file
    !wget -q http://images.cocodataset.org/zips/val2017.zip -O coco_val.zip
    !unzip -q coco_val.zip -d temp_coco

    # Select first 100 images
    import shutil
    coco_images = list(Path('temp_coco/val2017').glob('*.jpg'))[:100]
    for i, img_path in enumerate(coco_images):
        shutil.copy(img_path, f'coco_subset/img_{i:03d}.jpg')

    # Clean up
    !rm -rf temp_coco coco_val.zip

    return 'coco_subset'


In [None]:
dataset_path = download_coco_subset()
# Get all image paths
image_paths = list(Path(dataset_path).glob('*.jpg')) + list(Path(dataset_path).glob('*.png'))
print(f"Using {len(image_paths)} images from {dataset_path}")

Using 100 images from coco_subset


In [None]:
# Load YOLOv5 Models
device = select_device('0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)



Using device: cuda:0


# Task 1

In [None]:
model_variants = ['yolov5n', 'yolov5s', 'yolov5m', 'yolov5l', 'yolov5x']
results = []
model_benchmark_results = {}


for model in model_variants:
  print(f"Benchmarking {model}")

  # Run detection with timing
  start_time = time.time()
  cmd = f"python3 detect.py --weights {model}.pt --source {dataset_path} --device {device} --save-txt"
  result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
  output = result.stderr
  end_time = time.time()

  # Calculate FPS
  fps = len(image_paths) / (end_time - start_time)

  # Calculate latency - search for the string in the given format from the output of the command detect.py
  match = re.search(r"Speed:\s+([\d.]+)ms pre-process,\s+([\d.]+)ms inference,\s+([\d.]+)ms NMS", output)


  if match:
      pre = float(match.group(1))
      infer = float(match.group(2))
      nms = float(match.group(3))
      total_latency = round(pre + infer + nms, 2)
      results.append({
          "model": model,
          "latency": total_latency,
          "fps": fps
      })
      model_benchmark_results[model] = {
          "latency": total_latency,
          "fps": fps
      }
  else:
      print("Could not find timing info in output.")
      print(output)


# Create performance dataframe
benchmark_df = pd.DataFrame(results)
print("\nPerformance comparison:")
print(benchmark_df)



Benchmarking yolov5n
Benchmarking yolov5s
Benchmarking yolov5m
Benchmarking yolov5l
Benchmarking yolov5x

Performance comparison:
     model  latency        fps
0  yolov5n     15.4   9.935050
1  yolov5s     16.2  10.070236
2  yolov5m     21.6   9.407299
3  yolov5l     24.0   8.738181
4  yolov5x     27.3   8.572814


# Task 2

In [None]:
model_stats = []

# Used T4 GPU on google colab which has peak gflops of 8100
gpu_peak_gflops = 8100

for model_name in model_variants:
    # Load the model
    model = torch.hub.load('ultralytics/yolov5', model_name, pretrained=True)
    model.to(device)

    # Get model size in MB
    model_size_mb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024)

    # Get param count
    param_count = sum(p.numel() for p in model.parameters())

    # Use torchinfo to get stats
    stats = summary(model, input_size=(1, 3, 640, 640), verbose=0)

    # Extract FLOPS (multiply-accumulates)
    # For YOLOv5, each MAC operation is ~2 FLOPS
    macs = stats.total_mult_adds
    gflops = macs * 2 / 1e9

    latency_ms = benchmark_df[benchmark_df['model'] == model_name]['latency'].values[0]
    actual_gflops_per_sec = gflops / (latency_ms / 1000)

    utilization = (actual_gflops_per_sec / gpu_peak_gflops)

    if utilization == 1:
      bound_type = "compute"
    elif utilization < 1:
      bound_type = "memory"
    else:
      bound_type = "none"

    model_stats.append({
        'Model': model_name,
        'Parameters (M)': param_count / 1e6,
        'Model Size (MB)': model_size_mb,
        'GFLOPS per inference': gflops,
        'GFLOPS/sec': actual_gflops_per_sec,
        'Utilization (%)': utilization,
        'Bound Type': bound_type
    })


model_stats_df = pd.DataFrame(model_stats)
model_stats_df = model_stats_df.round(2)

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /root/.cache/torch/hub/master.zip
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5m summary: 290 layers, 21172173 parameters, 0 gradients, 48.9 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
Using cache found in /root/.cache/torch/hub/ultralyti

In [None]:
# Print the results
print("The GPU used has a peak flop performance of 8.1 TFLOPS/s - (T4 GPU on google colab)")
print(model_stats_df)

The GPU used has a peak flop performance of 8.1 TFLOPS/s - (T4 GPU on google colab)
     Model  Parameters (M)  Model Size (MB)  GFLOPS per inference  GFLOPS/sec  \
0  yolov5n            1.87             7.12                  4.50      223.68   
1  yolov5s            7.23            27.56                 16.49     1084.55   
2  yolov5m           21.17            80.77                 48.97     2766.52   
3  yolov5l           46.53           177.51                109.15     3816.26   
4  yolov5x           86.71           330.75                205.67     4932.11   

   Utilization (%) Bound Type  
0             0.03     memory  
1             0.13     memory  
2             0.34     memory  
3             0.47     memory  
4             0.61     memory  


### Layer wise analysis

In [None]:
!pip install fvcore.nn

In [None]:
from fvcore.nn import FlopCountAnalysis
def analyze_model_layers(model_name):
    model = torch.hub.load('ultralytics/yolov5', model_name, pretrained=True, device='cpu')
    model.eval()
    params = sum(p.numel() for p in model.parameters())

    # Create sample input on CPU
    sample_input = torch.rand(1, 3, 640, 640, device='cpu')

    # Use fvcore for FLOP counting
    flop_counter = FlopCountAnalysis(model, sample_input)

    # Get total FLOPs
    total_flops = flop_counter.total()

    # Get per-module FLOP counts
    module_flops = flop_counter.by_module()

    # Get per-module-and-operator FLOP counts for more detailed analysis
    module_op_flops = flop_counter.by_module_and_operator()

    # Convert to a more analyzable format
    layer_data = []
    for module_name, flops in module_flops.items():
        # Skip the empty module name (represents the whole model)
        if module_name == '':
            continue

        # Get the module's percentage of total computation
        percentage = (flops / total_flops) * 100

        # Get operator breakdown if available
        op_breakdown = module_op_flops.get(module_name, {})

        layer_data.append({
            'Layer': module_name,
            'FLOPs': flops,
            'GFLOPs': flops / 1e9,
            'Percentage': percentage,
            'Op_Breakdown': op_breakdown,
        })

    # Sort by FLOPs in descending order
    layer_data = sorted(layer_data, key=lambda x: x['FLOPs'], reverse=True)

    # Measure CPU inference time
    start_time = time.time()
    with torch.no_grad():
        for _ in range(5):  # Run multiple times for more stable measurement
            _ = model(sample_input)
    end_time = time.time()
    inference_time = (end_time - start_time) / 5  # Average time per inference

    return {
        'model_name': model_name,
        'total_flops': total_flops,
        'total_gflops': total_flops / 1e9,
        'layer_data': layer_data,
        'inference_time': inference_time,
        'params': params,
    }

def print_top_layers(analysis_result, max_gflops):
    """
    Print details for all layers in the model (not just top N)
    """
    model_name = analysis_result['model_name']
    total_gflops = analysis_result['total_gflops']
    layer_data = analysis_result['layer_data']
    inference_time = analysis_result['inference_time']

    print(f"\n==== {model_name} Model Analysis (CPU) ====")
    print(f"Total GFLOPs: {total_gflops:.4f}")
    print(f"Inference Time: {inference_time*1000:.2f} ms")
    print(f"Throughput: {1/inference_time:.2f} FPS")
    print(f"Average Performance: {total_gflops/inference_time:.2f} GFLOPS/s")
    print(f"Utilization : {(total_gflops / (max_gflops*inference_time)) * 100:.2f}%")

    print(f"\nAll Layers Details:")
    print("-" * 120)
    print(f"{'Rank':<5} {'Layer':<40} {'GFLOPs':<12} {'Percentage':<10} {'Time(ms)':<12} {'GFLOPS/s':<12} {'Utilization':<12}")
    print("-" * 120)

    layer_data = sorted(layer_data, key=lambda x: x.get('Utilization', 0), reverse=True)

    for i, layer in enumerate(layer_data):
        utilization = layer.get('Utilization', 0)
        measured_time = layer.get('Measured_Time_ms', 0)
        gflops_per_second = layer.get('GFLOPS_per_second', 0)

        print(f"{i+1:<5} {layer['Layer']:<40} {layer['GFLOPs']:<12.4f} {layer['Percentage']:<10.2f}% "
              f"{measured_time:<12.2f} {gflops_per_second:<12.2f} {utilization:<12.2f}%")

def compute_utilization(analysis_result, peak_gflops):
    """
    Compute utilization for each layer based on direct inference time measurements
    """
    layer_data = analysis_result['layer_data']
    model = torch.hub.load('ultralytics/yolov5', analysis_result['model_name'], pretrained=True, device='cpu')
    model.eval()

    # Create sample input on CPU
    sample_input = torch.rand(1, 3, 640, 640, device='cpu')

    # Define a hook to measure layer execution time
    layer_times = {}

    def measure_time_hook(name):
        def hook(module, input, output):
            if name not in layer_times:
                layer_times[name] = []
            layer_times[name].append(time.time())
            return None
        return hook

    # Register hooks for all modules
    hooks = []
    for name, module in model.named_modules():
        if name != '':  # Skip the empty module name (represents the whole model)
            pre_hook = module.register_forward_pre_hook(
                lambda m, inp, name=name: layer_times.setdefault(name, []).append(time.time())
            )
            post_hook = module.register_forward_hook(measure_time_hook(name))
            hooks.append(pre_hook)
            hooks.append(post_hook)

    # Warm-up run
    with torch.no_grad():
        _ = model(sample_input)

    # Clear times from warm-up
    layer_times.clear()

    # Actual measurement runs
    with torch.no_grad():
        for _ in range(10):  # Run multiple times for more stable measurement
            _ = model(sample_input)

    # Remove hooks
    for hook in hooks:
        hook.remove()

    # Calculate average execution time for each layer
    layer_execution_times = {}
    for name, timestamps in layer_times.items():
        # Each layer should have pairs of timestamps (start and end)
        if len(timestamps) % 2 == 0:
            start_times = timestamps[::2]  # Every other element starting from 0
            end_times = timestamps[1::2]   # Every other element starting from 1

            # Calculate time differences and average
            time_diffs = [end - start for start, end in zip(start_times, end_times)]
            layer_execution_times[name] = sum(time_diffs) / len(time_diffs)

    # Update layer data with measured times
    for layer in layer_data:
        layer_name = layer['Layer']
        if layer_name in layer_execution_times:
            layer_time = layer_execution_times[layer_name]

            # Calculate actual GFLOPS/s for this layer
            if layer_time > 0:
                actual_gflops_per_second = layer['GFLOPs'] / layer_time
                # Calculate utilization percentage
                utilization = (actual_gflops_per_second / peak_gflops) * 100
            else:
                actual_gflops_per_second = 0
                utilization = 0

            # Add to the layer data
            layer['Measured_Time_ms'] = layer_time * 1000
            layer['GFLOPS_per_second'] = actual_gflops_per_second
            layer['Utilization'] = utilization
        else:
            # If layer not found in measurements, use zeros
            layer['Measured_Time_ms'] = 0
            layer['GFLOPS_per_second'] = 0
            layer['Utilization'] = 0

    return analysis_result

def run():
    # Estimate peak CPU performance
    print("Estimating peak CPU performance...")
    peak_gflops = 300  # Example value, adjust based on your CPU
    print(f"Estimated peak CPU performance: {peak_gflops:.2f} GFLOPS")

    # Analyze different YOLOv5 variants
    model_variants = ['yolov5x']
    analysis_results = []

    for variant in model_variants:
        print(f"Analyzing {variant} on CPU...")
        result = analyze_model_layers(variant)

        # Compute utilization based on measured inference time and estimated peak performance
        result = compute_utilization(result, peak_gflops)

        analysis_results.append(result)
        print_top_layers(result, peak_gflops)

run()

# Task 3

### Torch.profiler (used for CPU / GPU split)

In [None]:
import torch
import torchvision
from torch.profiler import profile, record_function, ProfilerActivity

dummy_input = torch.randn((1, 3, 640, 640)).to(device)
# model = DetectMultiBackend('yolov5s.pt', device=device).to(device).eval()
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(device).eval()


with profile(
    activities=[
        ProfilerActivity.CPU,
        ProfilerActivity.CUDA] if torch.cuda.is_available() else [ProfilerActivity.CPU],
    record_shapes=True,
    with_stack=True,
    profile_memory=True
) as prof:
    with torch.no_grad():
        with record_function("model_inference"):
            model(dummy_input)

# Get profiler output as string
prof_output = prof.key_averages().table(sort_by="cpu_time_total", row_limit=25)

print(prof_output)


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.11 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        42.32%      13.154ms        99.94%      31.064ms      31.064ms       0.000us         0.00%      11.566ms      11.566ms           0 b           0 b     -92.50 Kb    -195.08 M

### torch.utils.bottleneck (used for memory and threading issues)

In [None]:
!python -m torch.utils.bottleneck detect.py --weights yolov5s.pt --source data/images/zidane.jpg --device 0 --save-txt



`bottleneck` is a tool that can be used as an initial step for debugging
bottlenecks in your program.

It summarizes runs of your script with the Python profiler and PyTorch's
autograd profiler. Because your script will be profiled, please ensure that it
exits in a finite amount of time.

For more complicated uses of the profilers, please see
https://docs.python.org/3/library/profile.html and
https://pytorch.org/docs/main/autograd.html#profiler for more information.
Running environment analysis...
Running your script with cProfile
[34m[1mdetect: [0mweights=['yolov5s.pt'], source=yolov5/data/images/zidane.jpg, data=yolov5/data/coco128.yaml, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=0, view_img=False, save_txt=True, save_format=0, save_csv=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=yolov5/runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=

In [None]:
!python -m torch.utils.bottleneck detect.py --weights yolov5s.pt --source data/images/zidane.jpg --device 0 --save-txt

Also adding yolov5 to every path is getting tedious. I'll cd into that directory


In [None]:
%cd yolov5

/content/yolov5


In [None]:
!nvidia-smi

Fri Apr 11 12:42:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Line Profiler for time spent on each pipeline stage

In [None]:
# INstall Line profiler
!pip install line_profiler
%load_ext line_profiler



In [None]:
# Adds profiling directives to detect.py
!sed -i 's/^def run/@profile\ndef run/' detect.py

In [None]:
!kernprof -l detect.py --weights yolov5s.pt --source data/images/zidane.jpg
!python -m line_profiler detect.py.lprof

[34m[1mdetect: [0mweights=['yolov5s.pt'], source=data/images/zidane.jpg, data=data/coco128.yaml, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=False, save_format=0, save_csv=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False, vid_stride=1
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
image 1/1 /content/yolov5/data/images/zidane.jpg: 384x640 2 persons, 2 ties, 30.7ms
Speed: 0.5ms pre-process, 30.7ms inference, 135.1ms NMS per image at shape (1, 3, 640, 640)
Results saved to [1mruns/detect/exp4[0m
Wrote profile results to detect.py.lprof
Inspect results with:
python3 -m line_profiler -rmt "dete

Clearly, loading the model is taking up too much time. Modify detect.py directly to print the times for each stage. The modification was done on my device and the output obtained is displayed in the cell below.

# NOTE:
Kindly look at the saved output of the cell below before running it as it was obtained after a modification to a file on my device. Running it again without changing the `detect.py` file accordinly will not print the time percentage details..


In [None]:
# !kernprof -l detect.py --weights yolov5s.pt --source data/images/zidane.jpg
# !python -m line_profiler detect.py.lprof
!python3 detect.py --weights yolov5s.pt --source data/images/zidane.jpg --save-txt

[34m[1mdetect: [0mweights=['yolov5s.pt'], source=data/images/zidane.jpg, data=data/coco128.yaml, imgsz=[640, 640], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=True, save_format=0, save_csv=False, save_conf=False, save_crop=False, nosave=False, classes=None, agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False, vid_stride=1
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
image 1/1 /content/yolov5/data/images/zidane.jpg: 384x640 2 persons, 2 ties, 31.5ms

--- Pipeline Breakdown (after model load) ---
Pre-processing:  0.20%
Inference:       14.51%
Post-processing: 85.29%
Speed: 0.4ms pre-process, 31.5ms inference, 185.2ms NMS per image at shape (1, 3, 640, 640)
Results saved to [1mru

# Task 4

In [None]:
def benchmark(model):
  # Run detection with timing
  model = model.strip()
  if not os.path.exists(model):
        raise FileNotFoundError(f"Model file {model} not found")
  start_time = time.time()
  cmd = f"python3 detect.py --weights {model} --source coco_subset --device {device} --save-txt"
  result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
  output = result.stderr
  end_time = time.time()

  # Calculate FPS
  fps = 100 / (end_time - start_time)
  # print(output)
  # Calculate latency - search for the string in the given format from the output of the command detect.py
  match = re.search(r"Speed:\s+([\d.]+)ms pre-process,\s+([\d.]+)ms inference,\s+([\d.]+)ms NMS", output)
  if match:
      pre = float(match.group(1))
      infer = float(match.group(2))
      nms = float(match.group(3))
      total_latency = round(pre + infer + nms, 2)
      return {
          "model": model,
          "latency": total_latency,
          "fps": fps
      }
  else:
    print("Something wrong")



### Approach 1: Change to FP16

In [None]:
# FP32 s
print(benchmark('yolov5s.pt'))

{'model': 'yolov5s.pt', 'latency': 16.0, 'fps': 9.979759735068718}


In [None]:
# Modify Source code to use FP16

#FP16 s
print(benchmark('yolov5s.pt'))

{'model': 'yolov5s.pt', 'latency': 15.0, 'fps': 10.27355132176902}


In [None]:
# FP16 X
print(benchmark('yolov5x.pt'))

{'model': 'yolov5x.pt', 'latency': 44.0, 'fps': 7.551323448153921}


In [None]:
#FP32 X
print(benchmark('yolov5x.pt'))

{'model': 'yolov5x.pt', 'latency': 27.6, 'fps': 8.411387551539327}


### Approach 2: Batch Processing



In [None]:
# yolov5s Batch size: 1
print(benchmark('yolov5s.pt'))

{'model': 'yolov5s.pt', 'latency': 13.6, 'fps': 11.033102854472979}


In [None]:
# Modify source code
# yolov5s Batch Size: 32
print(benchmark('yolov5s.pt'))

{'model': 'yolov5s.pt', 'latency': 13.5, 'fps': 11.21276663544928}


In [None]:
# yolov5s Batch size 100
print(benchmark('yolov5s.pt'))

{'model': 'yolov5s.pt', 'latency': 13.4, 'fps': 11.534726277463598}


In [None]:
# yolov5x Batch size 1
print(benchmark('yolov5x.pt'))

{'model': 'yolov5x.pt', 'latency': 41.2, 'fps': 7.6264871867664255}


In [None]:
# yolov5x Batch size 32
print(benchmark('yolov5x.pt'))

{'model': 'yolov5x.pt', 'latency': 41.0, 'fps': 7.659538643945381}


In [None]:
# yolov5x Batch size 100
print(benchmark('yolov5x'))

{'model': 'yolov5x', 'latency': 44.0, 'fps': 7.343453094493132}


### Custom script to bypass detect.py and directly measure inference time

In [None]:
from pathlib import Path
from torchvision import transforms

# Load model once and reuse
model = torch.hub.load('ultralytics/yolov5', 'yolov5x', device=device)
model.eval().half()

# Preprocessing
transform = transforms.Compose([
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
])

# Load all image paths once (modify path if needed)
image_dir = 'coco_subset'  # Change to your folder
image_paths = list(Path(image_dir).glob('*.jpg'))

# Define batched benchmark function
def benchmark_batch(batch_size):
    if batch_size > len(image_paths):
        print(f"Only {len(image_paths)} images available, reducing batch size.")
        batch_size = len(image_paths)

    imgs = []
    for path in image_paths[:batch_size]:
        img = Image.open(path).convert('RGB')
        img = transform(img)
        imgs.append(img)

    batch = torch.stack(imgs).to('cuda').half()

    # Inference and timing
    start = time.time()
    with torch.no_grad():
        _ = model(batch)
    end = time.time()

    latency = (end - start) * 1000  # ms
    fps = batch_size / (end - start)

    return {
        'batch_size': batch_size,
        'latency (ms)': round(latency, 2),
        'fps': round(fps, 2)
    }

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 v7.0-411-gf4d8a84c Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)

Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients, 205.5 GFLOPs
Adding AutoShape... 


In [None]:
# yolov5s
print(benchmark_batch(1))
print(benchmark_batch(16))
print(benchmark_batch(32))
print(benchmark_batch(64))
print(benchmark_batch(100))

  with amp.autocast(autocast):
  with amp.autocast(autocast):


{'batch_size': 1, 'latency (ms)': 8.96, 'fps': 111.57}
{'batch_size': 16, 'latency (ms)': 7.75, 'fps': 2065.27}


  with amp.autocast(autocast):


{'batch_size': 32, 'latency (ms)': 7.85, 'fps': 4074.12}


  with amp.autocast(autocast):


{'batch_size': 64, 'latency (ms)': 8.2, 'fps': 7801.54}
{'batch_size': 100, 'latency (ms)': 8.12, 'fps': 12320.61}


  with amp.autocast(autocast):


In [None]:
# Modify the benchmark_batch() to change the model from yolov5s to yolov5x

# yolov5x
print(benchmark_batch(1))
print(benchmark_batch(16))
print(benchmark_batch(32))
print(benchmark_batch(64))
print(benchmark_batch(100))

  with amp.autocast(autocast):
  with amp.autocast(autocast):


{'batch_size': 1, 'latency (ms)': 15.26, 'fps': 65.55}
{'batch_size': 16, 'latency (ms)': 15.27, 'fps': 1047.74}


  with amp.autocast(autocast):


{'batch_size': 32, 'latency (ms)': 13.74, 'fps': 2328.23}


  with amp.autocast(autocast):


{'batch_size': 64, 'latency (ms)': 13.84, 'fps': 4623.73}
{'batch_size': 100, 'latency (ms)': 13.77, 'fps': 7261.48}


  with amp.autocast(autocast):


### Approach 3: ONNX/TensorRT

In [None]:
!pip install onnxruntime-gpu

Collecting onnxruntime-gpu
  Using cached onnxruntime_gpu-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Using cached onnxruntime_gpu-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (280.8 MB)
Installing collected packages: onnxruntime-gpu
Successfully installed onnxruntime-gpu-1.21.0


In [None]:
def benchmark_onnx(model):
  # Run detection with timing
  model = model.strip()
  # if not os.path.exists(model):
  #       raise FileNotFoundError(f"Model file {model} not found")
  start_time = time.time()
  cmd = f"python3 detect.py --weights {model} --source coco_subset --device {device} --half --save-txt"
  result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
  output = result.stderr
  end_time = time.time()
  print(output)
  # Calculate FPS
  fps = 100 / (end_time - start_time)
  # print(output)
  # Calculate latency - search for the string in the given format from the output of the command detect.py
  match = re.search(r"Speed:\s+([\d.]+)ms pre-process,\s+([\d.]+)ms inference,\s+([\d.]+)ms NMS", output)
  if match:
      pre = float(match.group(1))
      infer = float(match.group(2))
      nms = float(match.group(3))
      total_latency = round(pre + infer + nms, 2)
      return {
          "model": model,
          "latency": total_latency,
          "fps": fps
      }
  else:
    print("Something wrong")



In [None]:
# Modify detect.py here to include onnx
# YOLOV5s
# With onnx
print(benchmark_onnx('yolov5s.onnx'))

In [None]:
# YOLOV5s
# Without ONNX
print(benchmark('yolov5s.onnx'))

{'model': 'yolov5s.onnx', 'latency': 228.8, 'fps': 3.2865758685683084}


In [None]:
# YOLOV5x
# Without ONNX
print(benchmark('yolov5x.pt'))

{'model': 'yolov5x.pt', 'latency': 41.7, 'fps': 7.83334640975327}


In [None]:
# YOLOV5x
# With ONNX
print(benchmark('yolov5x.onnx'))

{'model': 'yolov5x.onnx', 'latency': 2129.1, 'fps': 0.44623302141253224}
