# Q1

In [6]:
# Import necessary libraries
import os
import time
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
from tqdm import tqdm



import sys
sys.path.append('./yolov5')

# Import YOLOv5 modules
from models.experimental import attempt_load
from utils.general import check_img_size, non_max_suppression
from utils.torch_utils import select_device
from utils.augmentations import letterbox

In [7]:
# Set device (GPU if available, otherwise CPU)
device = select_device('0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Download and load all YOLOv5 model variants
models = {}
model_names = ['yolov5n', 'yolov5s', 'yolov5m', 'yolov5l', 'yolov5x']

for model_name in model_names:
    model_path = f'yolov5/weights/{model_name}.pt'
    
    # Download model if it doesn't exist
    if not os.path.exists(model_path):
        os.makedirs('yolov5/weights', exist_ok=True)
        !wget -P yolov5/weights/ https://github.com/ultralytics/yolov5/releases/download/v6.2/{model_name}.pt
    
    # Load model
    model = attempt_load(model_path, device=device)
    model.eval()  # Set to evaluation mode
    models[model_name] = model
    print(f"Loaded {model_name}")

fatal: cannot change to '/home/ved_maurya/sem4/Software': No such file or directory
YOLOv5 🚀 2025-4-9 Python-3.12.7 torch-2.6.0+cu124 CPU



Using device: cpu


Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Fusing layers... 


Loaded yolov5n


YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs


Loaded yolov5s


Fusing layers... 
YOLOv5m summary: 290 layers, 21172173 parameters, 0 gradients, 48.9 GFLOPs


Loaded yolov5m


Fusing layers... 
YOLOv5l summary: 367 layers, 46533693 parameters, 0 gradients, 109.0 GFLOPs


Loaded yolov5l


Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients, 205.5 GFLOPs


Loaded yolov5x


In [107]:


# Get list of test images
test_images = [os.path.join('test_images', f) for f in os.listdir('test_images') 
               if f.endswith(('.jpg', '.jpeg', '.png'))]

# Make sure we have at least 100 images
if len(test_images) < 100:
    print(f"Warning: Only {len(test_images)} images found. The assignment requires at least 100 images.")
    # If you don't have enough images, you can duplicate the existing ones
    while len(test_images) < 100:
        test_images.append(test_images[len(test_images) % len(test_images)])

# Limit to exactly 100 images
test_images = test_images[:100]
print(f"Using {len(test_images)} test images")

Using 100 test images


In [8]:
# Helper functions for inference
from PIL import Image
import numpy as np
import torch
from yolov5.utils.augmentations import letterbox

def preprocess_image(img_path, img_size=640, device='cpu', half=False):
    """
    Preprocess an image for YOLOv5 inference.

    Args:
        img_path (str): Path to the image file.
        img_size (int): Target size for YOLOv5 model input.
        device (str): Device to move the image to ("cpu" or "cuda").
        half (bool): Whether to convert to float16.

    Returns:
        torch.Tensor: Preprocessed image tensor of shape (1, 3, H, W)
    """
    # Load image as RGB
    img = Image.open(img_path).convert('RGB')

    # Convert to NumPy array
    img = np.array(img)

    # Resize and pad
    img = letterbox(img, new_shape=img_size)[0]

    # HWC to CHW
    img = img.transpose(2, 0, 1)

    # Normalize and convert to tensor
    img = np.ascontiguousarray(img, dtype=np.float32) / 255.0
    img = torch.from_numpy(img).unsqueeze(0).to(device)

    if half:
        img = img.half()

    return img


def run_inference(model, img_tensor, conf_thres=0.25, iou_thres=0.45):
    """
    Run YOLOv5 inference and apply non-max suppression.

    Args:
        model (torch.nn.Module): The YOLOv5 model.
        img_tensor (torch.Tensor): Preprocessed input image (1, 3, H, W).
        conf_thres (float): Confidence threshold.
        iou_thres (float): IoU threshold for NMS.

    Returns:
        list: Detections after NMS.
    """
    with torch.no_grad():
        # Forward pass
        output = model(img_tensor)[0]  # [0] is needed for YOLOv5-style models

        # Apply NMS
        detections = non_max_suppression(output, conf_thres=conf_thres, iou_thres=iou_thres)

    return detections

In [9]:
def benchmark_model(model_name, model, image_paths, num_runs=1):
    """
    Benchmark a model on multiple images
    
    Args:
        model_name: Name of the model
        model: PyTorch model
        image_paths: List of paths to test images
        num_runs: Number of benchmark runs
        
    Returns:
        Average latency per image (ms)
        Throughput (FPS)
    """
    img_size = 640  # Default image size for YOLOv5
    latencies = []
    
    # Warmup runs (important for accurate benchmarking)
    print(f"Warming up {model_name}...")
    for _ in range(5):
        img = preprocess_image(image_paths[0], img_size)
        _ = run_inference(model, img)
    
    # Perform benchmark runs
    for run in range(num_runs):
        print(f"Benchmark run {run+1}/{num_runs} for {model_name}")
        
        start_time = time.time()
        for img_path in tqdm(image_paths, desc=f"Processing images"):
            img = preprocess_image(img_path, img_size)
            _ = run_inference(model, img)
        end_time = time.time()
        
        # Calculate metrics
        total_time = end_time - start_time
        avg_latency_ms = (total_time * 1000) / len(image_paths)  # Convert to milliseconds
        throughput_fps = len(image_paths) / total_time  # Frames per second
        
        latencies.append((avg_latency_ms, throughput_fps))
        
        print(f"Run {run+1}: Latency = {avg_latency_ms:.2f} ms, FPS = {throughput_fps:.2f}")
    
    # Calculate average results across runs
    avg_latency = np.mean([l[0] for l in latencies])
    avg_fps = np.mean([l[1] for l in latencies])
    
    return avg_latency, avg_fps

In [110]:
# Run benchmarks for all models
results = {}

for model_name, model in models.items():
    print(f"\n===== Benchmarking {model_name} =====")
    latency, fps = benchmark_model(model_name, model, test_images)
    results[model_name] = {'Latency (ms)': latency, 'FPS': fps}
    print(f"\nFinal results for {model_name}:")
    print(f"Average Latency: {latency:.2f} ms")
    print(f"Average Throughput: {fps:.2f} FPS")
    
# Create result table as required by the assignment
results_df = pd.DataFrame.from_dict(results, orient='index')
print("\n===== Benchmark Results =====")
print(results_df)

# Save results to CSV
results_df.to_csv('yolov5_benchmark_results.csv')

# Visualize results
plt.figure(figsize=(12, 5))

# Plot latency
plt.subplot(1, 2, 1)
plt.bar(results_df.index, results_df['Latency (ms)'])
plt.title('Inference Latency (ms)')
plt.ylabel('Milliseconds')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Plot throughput
plt.subplot(1, 2, 2)
plt.bar(results_df.index, results_df['FPS'])
plt.title('Throughput (FPS)')
plt.ylabel('Frames Per Second')
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('yolov5_benchmark_results.png')
plt.show()


===== Benchmarking yolov5n =====
Warming up yolov5n...
Benchmark run 1/1 for yolov5n


Processing images: 100%|██████████| 100/100 [00:05<00:00, 18.78it/s]


Run 1: Latency = 53.27 ms, FPS = 18.77

Final results for yolov5n:
Average Latency: 53.27 ms
Average Throughput: 18.77 FPS

===== Benchmarking yolov5s =====
Warming up yolov5s...
Benchmark run 1/1 for yolov5s


Processing images: 100%|██████████| 100/100 [00:11<00:00,  8.78it/s]


Run 1: Latency = 113.95 ms, FPS = 8.78

Final results for yolov5s:
Average Latency: 113.95 ms
Average Throughput: 8.78 FPS

===== Benchmarking yolov5m =====
Warming up yolov5m...
Benchmark run 1/1 for yolov5m


Processing images: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Run 1: Latency = 297.69 ms, FPS = 3.36

Final results for yolov5m:
Average Latency: 297.69 ms
Average Throughput: 3.36 FPS

===== Benchmarking yolov5l =====
Warming up yolov5l...
Benchmark run 1/1 for yolov5l


Processing images: 100%|██████████| 100/100 [00:59<00:00,  1.69it/s]


Run 1: Latency = 591.16 ms, FPS = 1.69

Final results for yolov5l:
Average Latency: 591.16 ms
Average Throughput: 1.69 FPS

===== Benchmarking yolov5x =====
Warming up yolov5x...
Benchmark run 1/1 for yolov5x


Processing images: 100%|██████████| 100/100 [01:44<00:00,  1.04s/it]


Run 1: Latency = 1044.80 ms, FPS = 0.96

Final results for yolov5x:
Average Latency: 1044.80 ms
Average Throughput: 0.96 FPS

===== Benchmark Results =====
         Latency (ms)        FPS
yolov5n     53.272398  18.771447
yolov5s    113.950291   8.775756
yolov5m    297.689378   3.359206
yolov5l    591.162312   1.691583
yolov5x   1044.802024   0.957119


In [13]:
# Function to visualize detection on an image
def visualize_detection(img_path, model, model_name):
    import cv2
    
    # Preprocess image
    img = preprocess_image(img_path).half()  # Convert input to FP16
    orig_img = cv2.imread(img_path)
    orig_img = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
    
    # Run inference
    pred = run_inference(model, img)
    
    # Draw bounding boxes
    if len(pred[0]) > 0:
        for *xyxy, conf, cls in pred[0]:
            # Convert tensors to integers
            x1, y1, x2, y2 = [int(coord) for coord in xyxy]
            
            # Draw rectangle
            cv2.rectangle(orig_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
            
            # Add label
            label = f"{conf:.2f}"
            cv2.putText(orig_img, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    
    # Display image
    plt.figure(figsize=(10, 8))
    plt.imshow(orig_img)
    plt.title(f"{model_name} Detection Example")
    plt.axis('off')
    plt.show()

# Visualize a sample detection with each model
sample_img = test_images[0]
for model_name, model in models.items():
    visualize_detection(sample_img, model, model_name)

RuntimeError: Input type (c10::Half) and bias type (float) should be the same

# Q2


In [None]:
# Part 1: Calculate model parameters, size, and GFLOPs

import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import time


from torchinfo import summary

# Import YOLOv5 modules (assuming you've already run your first notebook)
sys.path.append('./yolov5')
from models.experimental import attempt_load
from utils.general import check_img_size
from utils.torch_utils import select_device

# Set device
device = select_device('0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load models and collect model information
models = {}
model_names = ['yolov5n', 'yolov5s', 'yolov5m', 'yolov5l', 'yolov5x']
model_info = {}

def get_model_size_mb(model):
    """Calculate model size in MB"""
    # Save model to a temporary file to get its size
    temp_path = 'temp_model.pt'
    torch.save(model.state_dict(), temp_path)
    size_bytes = os.path.getsize(temp_path)
    size_mb = size_bytes / (1024 * 1024)  # Convert bytes to MB
    os.remove(temp_path)  # Clean up
    return size_mb

for model_name in model_names:
    model_path = f'yolov5/weights/{model_name}.pt'
    
    # Load model if not already done
    if not os.path.exists(model_path):
        os.makedirs('yolov5/weights', exist_ok=True)
        !wget -P yolov5/weights/ https://github.com/ultralytics/yolov5/releases/download/v6.2/{model_name}.pt
    
    # Load model
    model = attempt_load(model_path, device=device)
    model.eval()  # Set to evaluation mode
    models[model_name] = model
    
    # Get model info
    print(f"\n===== Model Information for {model_name} =====")
    input_size = (1, 3, 640, 640)  # Standard YOLOv5 input size
    model_summary = summary(model, input_size=input_size, verbose=0)
    
    # Extract information
    total_params = model_summary.total_params
    model_size = get_model_size_mb(model)
    
    # The macs (multiply-accumulate operations) need to be converted to FLOPS
    # 1 mac = 2 FLOPS (1 multiplication + 1 addition)
    # and then convert to GFLOPS (divide by 10^9)
    gflops = model_summary.total_mult_adds * 2 / 1e9
    
    model_info[model_name] = {
        'Total Parameters': total_params,
        'Model Size (MB)': model_size,
        'GFLOPS per inference': gflops
    }
    
    print(f"Total Parameters: {total_params:,}")
    print(f"Model Size: {model_size:.2f} MB")
    print(f"GFLOPS per inference: {gflops:.2f}")

# Create a table with model information
info_df = pd.DataFrame.from_dict(model_info, orient='index')
print("\n===== Model Information =====")
print(info_df)

# Save model information to CSV
info_df.to_csv('yolov5_model_info.csv')

# Part 2: Determine if models are compute-bound or memory-bound

# First, let's rerun the benchmark to measure actual GFLOPS/sec
# Load the test images from the previous run
test_images = [os.path.join('test_images', f) for f in os.listdir('test_images') 
               if f.endswith(('.jpg', '.jpeg', '.png'))]
test_images = test_images[:100]  # Limit to 100 images

# Reuse preprocessing and inference functions from the previous notebook
def preprocess_image(img_path, img_size=640):
    """Preprocess an image for YOLOv5 inference"""
    # Load and convert to RGB
    img = Image.open(img_path).convert('RGB')
    
    # Convert to numpy array first
    img = np.array(img)
    
    # Resize and pad
    img = letterbox(img, img_size, stride=32)[0]
    
    # Convert from HWC to CHW format (height, width, channels) -> (channels, height, width)
    img = img.transpose(2, 0, 1)
    
    # Make contiguous in memory
    img = np.ascontiguousarray(img)
    
    # Convert to PyTorch tensor and move to device
    img = torch.from_numpy(img).to(device)
    
    # Normalize pixel values to 0-1
    img = img.float() / 255.0
    
    # Add batch dimension if needed
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
        
    return img

def run_inference(model, img):
    """Run inference with a preprocessed image"""
    with torch.no_grad():  # Disable gradient calculation
        # Forward pass through model
        pred = model(img)[0]
        
        # Apply non-maximum suppression to filter detections
        pred = non_max_suppression(pred)
        
    return pred

# Benchmark function that measures GFLOPS/sec
def benchmark_model_gflops(model_name, model, gflops_per_inference, image_paths, num_runs=1):
    """
    Benchmark a model and calculate GFLOPS/sec
    
    Args:
        model_name: Name of the model
        model: PyTorch model
        gflops_per_inference: GFLOPS required for one inference
        image_paths: List of paths to test images
        num_runs: Number of benchmark runs
        
    Returns:
        Average GFLOPS/sec
    """
    img_size = 640  # Default image size for YOLOv5
    gflops_per_sec_list = []
    
    # Warmup runs
    print(f"Warming up {model_name}...")
    for _ in range(5):
        img = preprocess_image(image_paths[0], img_size)
        _ = run_inference(model, img)
    
    # Perform benchmark runs
    for run in range(num_runs):
        print(f"Benchmark run {run+1}/{num_runs} for {model_name}")
        
        start_time = time.time()
        for img_path in tqdm(image_paths, desc=f"Processing images"):
            img = preprocess_image(img_path, img_size)
            _ = run_inference(model, img)
        end_time = time.time()
        
        # Calculate metrics
        total_time = end_time - start_time
        total_gflops = gflops_per_inference * len(image_paths)
        gflops_per_sec = total_gflops / total_time
        
        gflops_per_sec_list.append(gflops_per_sec)
        
        print(f"Run {run+1}: GFLOPS/sec = {gflops_per_sec:.2f}")
    
    # Calculate average results across runs
    avg_gflops_per_sec = np.mean(gflops_per_sec_list)
    
    return avg_gflops_per_sec

# Estimate peak GFLOPS of the hardware
def estimate_peak_gflops():
    """Estimate peak GFLOPS based on simple matrix multiplication benchmark"""
    # For CPU, this is a rough estimate
    print("Estimating peak GFLOPS of the hardware...")
    
    # Create large matrices for multiplication
    matrix_size = 2000
    a = torch.randn(matrix_size, matrix_size, device=device)
    b = torch.randn(matrix_size, matrix_size, device=device)
    
    # Warmup
    for _ in range(3):
        _ = torch.matmul(a, b)
    
    # Benchmark
    n_iter = 5
    start_time = time.time()
    for _ in range(n_iter):
        _ = torch.matmul(a, b)
    torch.cuda.synchronize() if torch.cuda.is_available() else None
    end_time = time.time()
    
    # Calculate GFLOPS (matrix multiplication is 2*n^3 operations)
    elapsed_time = end_time - start_time
    operations = 2 * matrix_size**3 * n_iter
    peak_gflops = operations / elapsed_time / 1e9
    
    print(f"Estimated peak GFLOPS: {peak_gflops:.2f}")
    return peak_gflops

# Measure peak memory bandwidth
def estimate_peak_memory_bandwidth():
    """Estimate peak memory bandwidth in GB/s"""
    print("Estimating peak memory bandwidth...")
    
    # Create large arrays for memory bandwidth test
    array_size = 10**8  # 100M elements
    a = torch.ones(array_size, device=device)
    b = torch.ones(array_size, device=device)
    c = torch.zeros(array_size, device=device)
    
    # Warmup
    for _ in range(3):
        c = a + b
    
    # Benchmark
    n_iter = 10
    start_time = time.time()
    for _ in range(n_iter):
        c = a + b
    torch.cuda.synchronize() if torch.cuda.is_available() else None
    end_time = time.time()
    
    # Calculate bandwidth (bytes read + bytes written per second)
    elapsed_time = end_time - start_time
    bytes_processed = (2 * a.element_size() + c.element_size()) * array_size * n_iter
    bandwidth_gb_s = bytes_processed / elapsed_time / 1e9
    
    print(f"Estimated peak memory bandwidth: {bandwidth_gb_s:.2f} GB/s")
    return bandwidth_gb_s

# Determine whether the model is compute-bound or memory-bound
# Get peak performance numbers
peak_gflops = estimate_peak_gflops()
peak_bandwidth = estimate_peak_memory_bandwidth()

# Initialize results dictionary
roofline_results = {}

for model_name, model in models.items():
    print(f"\n===== Roofline Analysis for {model_name} =====")
    
    # Get model's GFLOPS per inference
    gflops_per_inference = model_info[model_name]['GFLOPS per inference']
    
    # Get model's GFLOPS/sec from benchmarking
    actual_gflops_per_sec = benchmark_model_gflops(
        model_name, model, gflops_per_inference, test_images
    )
    
    # Calculate operational intensity (FLOPS/Byte)
    # We'll estimate this as GFLOPS / model size in GB
    model_size_gb = model_info[model_name]['Model Size (MB)'] / 1024
    operational_intensity = gflops_per_inference / model_size_gb
    
    # Calculate peak performance based on operational intensity
    # If it's memory-bound, peak = operational_intensity * peak_bandwidth
    # If it's compute-bound, peak = peak_gflops
    compute_bound_peak = peak_gflops
    memory_bound_peak = operational_intensity * peak_bandwidth
    theoretical_peak = min(compute_bound_peak, memory_bound_peak)
    
    # Calculate utilization
    utilization = actual_gflops_per_sec / peak_gflops * 100  # as percentage
    
    # Determine if the model is compute-bound or memory-bound
    is_compute_bound = compute_bound_peak <= memory_bound_peak
    bound_type = "Compute-bound" if is_compute_bound else "Memory-bound"
    
    # Store the results
    roofline_results[model_name] = {
        'GFLOPS per inference': gflops_per_inference,
        'Actual GFLOPS/sec': actual_gflops_per_sec,
        'Operational Intensity (FLOPS/Byte)': operational_intensity,
        'Bound Type': bound_type,
        'Peak GFLOPS Utilization (%)': utilization
    }
    
    print(f"GFLOPS per inference: {gflops_per_inference:.2f}")
    print(f"Actual GFLOPS/sec: {actual_gflops_per_sec:.2f}")
    print(f"Operational Intensity: {operational_intensity:.2f} FLOPS/Byte")
    print(f"Bound Type: {bound_type}")
    print(f"Peak GFLOPS Utilization: {utilization:.2f}%")

# Create a table with roofline analysis results
roofline_df = pd.DataFrame.from_dict(roofline_results, orient='index')
print("\n===== Roofline Analysis Results =====")
print(roofline_df)

# Save roofline analysis to CSV
roofline_df.to_csv('yolov5_roofline_analysis.csv')

# Visualize roofline model
plt.figure(figsize=(12, 8))

# Plot the roofline
x_range = np.logspace(-1, 3, 100)  # Operational intensity range
y_compute = np.ones_like(x_range) * peak_gflops
y_memory = x_range * peak_bandwidth
y_roof = np.minimum(y_compute, y_memory)

plt.loglog(x_range, y_compute, 'b-', linewidth=2, label='Compute Roof')
plt.loglog(x_range, y_memory, 'r-', linewidth=2, label='Memory Roof')
plt.loglog(x_range, y_roof, 'k--', linewidth=2, label='Roofline')

# Plot each model
for model_name, info in roofline_results.items():
    op_intensity = info['Operational Intensity (FLOPS/Byte)']
    gflops_per_sec = info['Actual GFLOPS/sec']
    plt.loglog(op_intensity, gflops_per_sec, 'o', markersize=10, label=model_name)

plt.grid(True, which="both", ls="-", alpha=0.5)
plt.xlabel('Operational Intensity (FLOPS/Byte)', fontsize=12)
plt.ylabel('Performance (GFLOPS/sec)', fontsize=12)
plt.title('Roofline Model for YOLOv5 Variants', fontsize=14)
plt.legend(fontsize=10)
plt.tight_layout()
plt.savefig('yolov5_roofline_model.png')
plt.show()

# Extra Credit: Per-layer analysis
def analyze_layers(model, model_name):
    """Analyze the layers of a model to identify bottlenecks"""
    print(f"\n===== Per-Layer Analysis for {model_name} =====")
    
    # Prepare a sample input
    input_tensor = torch.randn(1, 3, 640, 640).to(device)
    
    # Dictionary to store layer info
    layer_info = {}
    
    # Register hooks to collect information for each layer
    hooks = []
    
    def hook_fn(m, i, o):
        # Skip non-compute layers
        if isinstance(m, (torch.nn.BatchNorm2d, torch.nn.ReLU)):
            return
        
        # Calculate input and output sizes
        if isinstance(i, (tuple, list)) and len(i) > 0:
            input_size = sum(inp.numel() * inp.element_size() for inp in i if isinstance(inp, torch.Tensor))
        elif isinstance(i, torch.Tensor):
            input_size = i.numel() * i.element_size()
        else:
            input_size = 0
            
        if isinstance(o, (tuple, list)):
            output_size = sum(out.numel() * out.element_size() for out in o if isinstance(out, torch.Tensor))
        elif isinstance(o, torch.Tensor):
            output_size = o.numel() * o.element_size()
        else:
            output_size = 0
        
        memory_access = input_size + output_size  # bytes
        
        # Estimate FLOPs (very rough estimation)
        flops = 0
        if isinstance(m, torch.nn.Conv2d):
            # For each output element: kernel_h * kernel_w * in_channels multiplications
            # and kernel_h * kernel_w * in_channels - 1 additions
            out_h, out_w = o.shape[2], o.shape[3]
            kernel_h, kernel_w = m.kernel_size
            in_channels = m.in_channels
            out_channels = m.out_channels
            groups = m.groups
            
            flops = 2 * out_h * out_w * kernel_h * kernel_w * in_channels * out_channels / groups
        elif isinstance(m, torch.nn.Linear):
            flops = 2 * m.in_features * m.out_features
        
        # Convert to GFLOPS
        gflops = flops / 1e9
        
        # Calculate operational intensity
        if memory_access > 0:
            operational_intensity = flops / memory_access
        else:
            operational_intensity = 0
        
        # Store layer information
        layer_name = str(m.__class__.__name__)
        if layer_name not in layer_info:
            layer_info[layer_name] = []
        
        layer_info[layer_name].append({
            'GFLOPs': gflops,
            'Memory Access (MB)': memory_access / 1e6,
            'Operational Intensity (FLOPS/Byte)': operational_intensity
        })
    
    # Register hook for each module
    for name, module in model.named_modules():
        hooks.append(module.register_forward_hook(hook_fn))
    
    # Run a forward pass
    with torch.no_grad():
        _ = model(input_tensor)
    
    # Remove hooks
    for hook in hooks:
        hook.remove()
    
    # Aggregate information for each layer type
    aggregated_info = {}
    for layer_name, data in layer_info.items():
        if not data:  # Skip empty lists
            continue
        
        total_gflops = sum(item['GFLOPs'] for item in data)
        total_memory = sum(item['Memory Access (MB)'] for item in data)
        avg_oi = sum(item['Operational Intensity (FLOPS/Byte)'] for item in data) / len(data)
        
        aggregated_info[layer_name] = {
            'Count': len(data),
            'Total GFLOPs': total_gflops,
            'Total Memory Access (MB)': total_memory,
            'Average Operational Intensity': avg_oi
        }
    
    # Sort layers by GFLOPs
    sorted_layers = sorted(
        aggregated_info.items(), 
        key=lambda x: x[1]['Total GFLOPs'], 
        reverse=True
    )
    
    # Print top 5 compute-intensive layers
    print("Top 5 compute-intensive layers:")
    for i, (layer_name, info) in enumerate(sorted_layers[:5], 1):
        print(f"{i}. {layer_name}: {info['Total GFLOPs']:.4f} GFLOPs, OI: {info['Average Operational Intensity']:.2f}")
    
    # Find layers with highest and lowest compute utilization
    # Assuming compute utilization is proportional to operational intensity
    max_oi_layer = max(aggregated_info.items(), key=lambda x: x[1]['Average Operational Intensity'])
    min_oi_layer = min(aggregated_info.items(), key=lambda x: x[1]['Average Operational Intensity'])
    
    print(f"\nHighest compute utilization: {max_oi_layer[0]} (OI: {max_oi_layer[1]['Average Operational Intensity']:.2f})")
    print(f"Lowest compute utilization: {min_oi_layer[0]} (OI: {min_oi_layer[1]['Average Operational Intensity']:.2f})")
    
    return aggregated_info

# Run per-layer analysis for each model (Extra Credit)
layer_analysis = {}
for model_name, model in models.items():
    layer_analysis[model_name] = analyze_layers(model, model_name)

Defaulting to user installation because normal site-packages is not writeable


fatal: cannot change to '/home/ved_maurya/sem4/Software': No such file or directory
YOLOv5 🚀 2025-4-9 Python-3.12.7 torch-2.6.0+cu124 CPU



Using device: cpu


Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs



===== Model Information for yolov5n =====


Fusing layers... 


Total Parameters: 1,867,405
Model Size: 7.17 MB
GFLOPS per inference: 4.50


YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs



===== Model Information for yolov5s =====


Fusing layers... 


Total Parameters: 7,225,885
Model Size: 27.61 MB
GFLOPS per inference: 16.49


YOLOv5m summary: 290 layers, 21172173 parameters, 0 gradients, 48.9 GFLOPs



===== Model Information for yolov5m =====
Total Parameters: 21,172,173
Model Size: 80.82 MB
GFLOPS per inference: 48.97


Fusing layers... 
YOLOv5l summary: 367 layers, 46533693 parameters, 0 gradients, 109.0 GFLOPs



===== Model Information for yolov5l =====
Total Parameters: 46,533,693
Model Size: 177.59 MB
GFLOPS per inference: 109.15


Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients, 205.5 GFLOPs



===== Model Information for yolov5x =====
Total Parameters: 86,705,005
Model Size: 330.85 MB
GFLOPS per inference: 205.67

===== Model Information =====
         Total Parameters  Model Size (MB)  GFLOPS per inference
yolov5n           1867405         7.166678              4.496060
yolov5s           7225885        27.607780             16.485154
yolov5m          21172173        80.824740             48.967356
yolov5l          46533693       177.587478            109.145071
yolov5x          86705005       330.845027            205.669052
Estimating peak GFLOPS of the hardware...
Estimated peak GFLOPS: 316.14
Estimating peak memory bandwidth...
Estimated peak memory bandwidth: 11.36 GB/s

===== Roofline Analysis for yolov5n =====
Warming up yolov5n...
Benchmark run 1/1 for yolov5n


Processing images: 100%|██████████| 100/100 [00:05<00:00, 17.52it/s]


Run 1: GFLOPS/sec = 78.74
GFLOPS per inference: 4.50
Actual GFLOPS/sec: 78.74
Operational Intensity: 642.41 FLOPS/Byte
Bound Type: Compute-bound
Peak GFLOPS Utilization: 24.91%

===== Roofline Analysis for yolov5s =====
Warming up yolov5s...
Benchmark run 1/1 for yolov5s


Processing images: 100%|██████████| 100/100 [00:14<00:00,  6.88it/s]


Run 1: GFLOPS/sec = 113.34
GFLOPS per inference: 16.49
Actual GFLOPS/sec: 113.34
Operational Intensity: 611.45 FLOPS/Byte
Bound Type: Compute-bound
Peak GFLOPS Utilization: 35.85%

===== Roofline Analysis for yolov5m =====
Warming up yolov5m...
Benchmark run 1/1 for yolov5m


Processing images: 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Run 1: GFLOPS/sec = 177.45
GFLOPS per inference: 48.97
Actual GFLOPS/sec: 177.45
Operational Intensity: 620.39 FLOPS/Byte
Bound Type: Compute-bound
Peak GFLOPS Utilization: 56.13%

===== Roofline Analysis for yolov5l =====
Warming up yolov5l...
Benchmark run 1/1 for yolov5l


Processing images: 100%|██████████| 100/100 [00:53<00:00,  1.87it/s]


Run 1: GFLOPS/sec = 204.55
GFLOPS per inference: 109.15
Actual GFLOPS/sec: 204.55
Operational Intensity: 629.35 FLOPS/Byte
Bound Type: Compute-bound
Peak GFLOPS Utilization: 64.70%

===== Roofline Analysis for yolov5x =====
Warming up yolov5x...
Benchmark run 1/1 for yolov5x


Processing images: 100%|██████████| 100/100 [01:30<00:00,  1.10it/s]


Run 1: GFLOPS/sec = 227.04
GFLOPS per inference: 205.67
Actual GFLOPS/sec: 227.04
Operational Intensity: 636.57 FLOPS/Byte
Bound Type: Compute-bound
Peak GFLOPS Utilization: 71.82%

===== Roofline Analysis Results =====
         GFLOPS per inference  Actual GFLOPS/sec  \
yolov5n              4.496060          78.743310   
yolov5s             16.485154         113.343251   
yolov5m             48.967356         177.452033   
yolov5l            109.145071         204.553289   
yolov5x            205.669052         227.043043   

         Operational Intensity (FLOPS/Byte)     Bound Type  \
yolov5n                          642.412728  Compute-bound   
yolov5s                          611.450773  Compute-bound   
yolov5m                          620.386435  Compute-bound   
yolov5l                          629.349290  Compute-bound   
yolov5x                          636.567251  Compute-bound   

         Peak GFLOPS Utilization (%)  
yolov5n                    24.907408  
yolov5s         

# Q3


In [None]:
# YOLOv5 Code Profiling and Hotspot Analysis
import os
import time
import cProfile
import pstats
import io
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
import pandas as pd
import sys

# Make sure YOLOv5 is in path
sys.path.append('./yolov5')

# Import YOLOv5 modules
from models.experimental import attempt_load
from utils.general import check_img_size, non_max_suppression
from utils.torch_utils import select_device
from utils.augmentations import letterbox

# ======= Part 1: Define the full pipeline with timing for each component =======

def load_image(img_path):
    """Load an image from path"""
    start_time = time.time()
    img = Image.open(img_path).convert('RGB')
    img_np = np.array(img)
    load_time = time.time() - start_time
    return img_np, load_time

def preprocess_image(img_np, img_size=640, device='cpu'):
    """Preprocess an image for YOLOv5 inference"""
    start_time = time.time()
    
    # Resize and pad
    img = letterbox(img_np, img_size, stride=32)[0]
    
    # Convert from HWC to CHW format
    img = img.transpose(2, 0, 1)
    
    # Make contiguous in memory
    img = np.ascontiguousarray(img)
    
    # Convert to PyTorch tensor and move to device
    img = torch.from_numpy(img).to(device)
    
    # Normalize pixel values to 0-1
    img = img.float() / 255.0
    
    # Add batch dimension if needed
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    
    preprocess_time = time.time() - start_time
    return img, preprocess_time

def run_inference(model, img):
    """Run inference with PyTorch profiler for detailed analysis"""
    start_time = time.time()
    
    # Use PyTorch profiler to analyze the inference step
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA if torch.cuda.is_available() else torch.profiler.ProfilerActivity.CPU
        ],
        record_shapes=True,
        with_stack=False
    ) as prof:
        with torch.no_grad():  # Disable gradient calculation
            # Forward pass through model
            pred = model(img)[0]
    
    # Store the profiler results for later analysis
    profiler_results = prof
    
    # Apply non-maximum suppression
    pred = non_max_suppression(pred)
    
    inference_time = time.time() - start_time
    return pred, inference_time, profiler_results

def postprocess_results(pred, orig_shape, img_shape):
    """Post-process predictions to original image coordinates"""
    start_time = time.time()
    
    # Scale predictions back to original image size
    # This is a simplified version - in a real pipeline, we'd scale boxes back to original image coordinates
    results = []
    
    if len(pred[0]) > 0:
        # Get scaling factors
        scale_w = orig_shape[1] / img_shape[2]
        scale_h = orig_shape[0] / img_shape[1]
        
        # Scale each prediction
        for *xyxy, conf, cls in pred[0]:
            x1, y1, x2, y2 = [int(coord.item()) for coord in xyxy]
            x1, x2 = int(x1 * scale_w), int(x2 * scale_w)
            y1, y2 = int(y1 * scale_h), int(y2 * scale_h)
            
            results.append({
                'bbox': [x1, y1, x2, y2],
                'confidence': conf.item(),
                'class': int(cls.item())
            })
    
    postprocess_time = time.time() - start_time
    return results, postprocess_time

def process_single_image(model, img_path, img_size=640, device='cpu'):
    """Process a single image through the full pipeline with timing"""
    # Load image
    img_np, load_time = load_image(img_path)
    orig_shape = img_np.shape
    
    # Preprocess
    img, preprocess_time = preprocess_image(img_np, img_size, device)
    
    # Inference
    pred, inference_time, profiler_results = run_inference(model, img)
    
    # Postprocess
    results, postprocess_time = postprocess_results(pred, orig_shape, img.shape)
    
    # Total time
    total_time = load_time + preprocess_time + inference_time + postprocess_time
    
    # Return timing breakdown
    timing = {
        'load_time': load_time,
        'preprocess_time': preprocess_time,
        'inference_time': inference_time,
        'postprocess_time': postprocess_time,
        'total_time': total_time
    }
    
    return results, timing, profiler_results

# ======= Part 2: Setup profiling run =======

def main():
    """Main function to profile YOLOv5 inference pipeline"""
    # Set device
    device = select_device('0' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load model (using yolov5s for profiling)
    model_name = 'yolov5s'
    model_path = f'yolov5/weights/{model_name}.pt'
    
    if not os.path.exists(model_path):
        os.makedirs('yolov5/weights', exist_ok=True)
        !wget -P yolov5/weights/ https://github.com/ultralytics/yolov5/releases/download/v6.2/{model_name}.pt
    
    model = attempt_load(model_path, device=device)
    model.eval()
    
    # Get test images
    test_images = [os.path.join('test_images', f) for f in os.listdir('test_images') 
                  if f.endswith(('.jpg', '.jpeg', '.png'))]
    test_images = test_images[:20]  # Limit to 20 images for profiling
    
    if len(test_images) == 0:
        print("No test images found. Please run the setup code from Problem 1 first.")
        return
    
    # Run profiling with detailed timing
    all_timings = []
    detailed_profiler = None
    
    for i, img_path in enumerate(test_images):
        results, timing, profiler = process_single_image(model, img_path, device=device)
        all_timings.append(timing)
        
        # Store detailed profiler results for the first image only to avoid too much data
        if i == 0:
            detailed_profiler = profiler
    
    # Calculate average timings
    avg_timings = {
        'load_time': np.mean([t['load_time'] for t in all_timings]),
        'preprocess_time': np.mean([t['preprocess_time'] for t in all_timings]),
        'inference_time': np.mean([t['inference_time'] for t in all_timings]),
        'postprocess_time': np.mean([t['postprocess_time'] for t in all_timings]),
        'total_time': np.mean([t['total_time'] for t in all_timings])
    }
    
    # Calculate percentages
    total = avg_timings['total_time']
    percentages = {
        'load_time': (avg_timings['load_time'] / total) * 100,
        'preprocess_time': (avg_timings['preprocess_time'] / total) * 100,
        'inference_time': (avg_timings['inference_time'] / total) * 100,
        'postprocess_time': (avg_timings['postprocess_time'] / total) * 100
    }
    
    # Report results
    print("\n===== Pipeline Timing Breakdown =====")
    print(f"Average time over {len(test_images)} images:")
    print(f"Loading: {avg_timings['load_time']*1000:.2f} ms ({percentages['load_time']:.1f}%)")
    print(f"Preprocessing: {avg_timings['preprocess_time']*1000:.2f} ms ({percentages['preprocess_time']:.1f}%)")
    print(f"Inference: {avg_timings['inference_time']*1000:.2f} ms ({percentages['inference_time']:.1f}%)")
    print(f"Postprocessing: {avg_timings['postprocess_time']*1000:.2f} ms ({percentages['postprocess_time']:.1f}%)")
    print(f"Total: {avg_timings['total_time']*1000:.2f} ms")
    
    # Visualize the breakdown
    plt.figure(figsize=(10, 6))
    components = ['Loading', 'Preprocessing', 'Inference', 'Postprocessing']
    values = [percentages['load_time'], percentages['preprocess_time'], 
              percentages['inference_time'], percentages['postprocess_time']]
    
    plt.bar(components, values)
    plt.title('Time Breakdown by Pipeline Component (%)')
    plt.ylabel('Percentage of Total Time')
    plt.savefig('pipeline_breakdown.png')
    
    # Print PyTorch profiler results
    print("\n===== PyTorch Profiler Results (Top 10 Operations) =====")
    print(detailed_profiler.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    
    # Export CPU vs GPU time if available
    if torch.cuda.is_available():
        cpu_time = detailed_profiler.key_averages().total_average().cpu_time
        cuda_time = detailed_profiler.key_averages().total_average().cuda_time
        print(f"\nCPU Time: {cpu_time:.2f} us")
        print(f"GPU Time: {cuda_time:.2f} us")
        print(f"CPU/GPU Time Ratio: {cpu_time/cuda_time if cuda_time else 'N/A'}")
    
    # Save full profiler results to file
    with open('pytorch_profiler_results.txt', 'w') as f:
        f.write(detailed_profiler.key_averages().table(sort_by="cpu_time_total"))
    
    return "Profiling complete"

# ======= Part 3: Run the profiling =======

# Method 1: Use cProfile
print("Running cProfile analysis...")
cProfile.run('main()', 'profile_output')

# Print formatted cProfile results
p = pstats.Stats('profile_output')
p.strip_dirs().sort_stats('cumulative').print_stats(20)  # Print top 20 functions by cumulative time

# Save detailed cProfile results
with open('cprofile_results.txt', 'w') as f:
    stats = pstats.Stats('profile_output', stream=f)
    stats.strip_dirs().sort_stats('cumulative').print_stats()

# Create a human-readable summary from cProfile
s = io.StringIO()
ps = pstats.Stats('profile_output', stream=s).strip_dirs().sort_stats('cumulative')
ps.print_stats(30)  # Top 30 functions
cprofile_output = s.getvalue()

# Parse cProfile output to extract key information
def parse_cprofile_output(output):
    lines = output.split('\n')
    parsed_data = []
    
    # Skip header lines
    data_lines = [line for line in lines if line.strip() and not line.startswith('ncalls') and not line.startswith('   ')]
    
    for line in data_lines:
        if '{method' in line or '<built-in' in line:
            continue
        parts = line.strip().split()
        if len(parts) >= 5:
            try:
                function_name = ' '.join(parts[5:])
                if '/' in parts[0]:  # Handle case with primitive/total calls
                    total_calls = parts[0].split('/')[0]
                else:
                    total_calls = parts[0]
                
                total_time = float(parts[1])
                per_call = float(parts[2])
                cumulative = float(parts[3])
                
                parsed_data.append({
                    'function': function_name,
                    'calls': total_calls,
                    'total_time': total_time,
                    'per_call': per_call,
                    'cumulative': cumulative
                })
            except (ValueError, IndexError):
                pass  # Skip lines that don't match expected format
    
    return parsed_data

parsed_cprofile = parse_cprofile_output(cprofile_output)

# Create summary visualization of cProfile results
if parsed_cprofile:
    # Get top 10 functions by total time
    top_functions = sorted(parsed_cprofile, key=lambda x: x['total_time'], reverse=True)[:10]
    
    plt.figure(figsize=(12, 6))
    plt.barh([f['function'][:50] for f in top_functions], [f['total_time'] for f in top_functions])
    plt.xlabel('Total Time (seconds)')
    plt.title('Top 10 Time-Consuming Functions (cProfile)')
    plt.tight_layout()
    plt.savefig('cprofile_top_functions.png')
    
    # Save parsed data to CSV
    df = pd.DataFrame(parsed_cprofile)
    df.to_csv('cprofile_functions.csv', index=False)

print("\nAnalysis complete. Check the generated files for detailed results.")

fatal: cannot change to '/home/ved_maurya/sem4/Software': No such file or directory
YOLOv5 🚀 2025-4-9 Python-3.12.7 torch-2.6.0+cu124 CPU



Fusing layers... 


Running cProfile analysis...
Using device: cpu


YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs



===== Pipeline Timing Breakdown =====
Average time over 20 images:
Loading: 2.63 ms (2.2%)
Preprocessing: 2.03 ms (1.7%)
Inference: 117.07 ms (96.0%)
Postprocessing: 0.16 ms (0.1%)
Total: 121.89 ms

===== PyTorch Profiler Results (Top 10 Operations) =====
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::conv2d         1.16%       1.011ms        82.60%      72.312ms       1.205ms            60  
                aten::convolution         0.66%     575.811us        81.45%      71.300ms       1.188ms            60  
               aten::_convolution         0.91%     794.962us        80.79%      70.725ms       1.179ms            60  
         aten::mkldnn_c

# Q4

In [1]:
import torch
import time
import numpy as np
import cv2
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
import concurrent.futures
from tqdm import tqdm

# FIXED VERSION: Guaranteed consistent size preprocessing
def preprocess_image_fixed(img_path, img_size=640, to_half=False):
    """
    Preprocess an image with guaranteed consistent dimensions
    """
    # Read image
    img = cv2.imread(img_path)
    if img is None:
        raise ValueError(f"Could not read image: {img_path}")
    
    # Convert BGR to RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    # Resize and pad to square with consistent dimensions
    h, w = img.shape[:2]
    r = img_size / max(h, w)  # Resize ratio
    if r != 1:  # Only resize if necessary
        interp = cv2.INTER_LINEAR
        new_size = (int(w * r), int(h * r))
        img = cv2.resize(img, new_size, interpolation=interp)
    
    # Create square canvas
    new_img = np.full((img_size, img_size, 3), 114, dtype=np.uint8)
    
    # Center image on canvas
    h, w = img.shape[:2]
    offset_h, offset_w = (img_size - h) // 2, (img_size - w) // 2
    new_img[offset_h:offset_h + h, offset_w:offset_w + w] = img
    
    # Convert HWC to CHW format
    img = new_img.transpose(2, 0, 1)
    
    # Convert to tensor and normalize
    img = torch.from_numpy(img).float() / 255.0
    
    # Add batch dimension
    img = img.unsqueeze(0)
    
    # Convert to half precision if requested
    if to_half:
        img = img.half()
        
    return img

# Proper inference function
def run_inference_fixed(model, img, conf_thres=0.25, iou_thres=0.45):
    """Run inference with proper type handling"""
    with torch.no_grad():
        # Forward pass
        output = model(img)
        
        # Non-maximum suppression
        if isinstance(output, tuple):
            output = output[0]  # Handle models that return multiple outputs
        
        pred = non_max_suppression(output, conf_thres=conf_thres, iou_thres=iou_thres)
    
    return pred

# 1. BASELINE BENCHMARK
def benchmark_baseline(model_name, model, image_paths, num_runs=3):
    """Benchmark baseline performance"""
    model = model.float()  # Ensure model is in FP32
    
    total_time = 0
    total_images = len(image_paths)
    total_detections = 0
    
    # Warmup
    print(f"Warming up baseline {model_name}...")
    for img_path in image_paths[:3]:
        img = preprocess_image_fixed(img_path)
        _ = run_inference_fixed(model, img)
    
    # Benchmark
    start_time = time.time()
    
    for _ in range(num_runs):
        for img_path in tqdm(image_paths, desc=f"Baseline {model_name}", leave=False):
            img = preprocess_image_fixed(img_path)
            pred = run_inference_fixed(model, img)
            
            # Count detections
            for det in pred:
                total_detections += len(det)
    
    end_time = time.time()
    total_time = end_time - start_time
    
    # Calculate metrics
    avg_time = total_time / (total_images * num_runs)
    fps = (total_images * num_runs) / total_time
    avg_detections = total_detections / (total_images * num_runs)
    
    return {
        'model_name': model_name,
        'total_time': total_time,
        'avg_time_per_image': avg_time,
        'fps': fps,
        'avg_detections': avg_detections,
        'optimization': 'Baseline'
    }

# 2. FP16 BENCHMARK
def benchmark_fp16(model_name, base_model, image_paths, num_runs=3):
    """Benchmark FP16 performance (only beneficial with GPU)"""
    # Check hardware FP16 support
    has_gpu = torch.cuda.is_available()
    
    if has_gpu:
        device = torch.device('cuda')
        model = base_model.to(device).half()
    else:
        print("WARNING: Running FP16 on CPU may be slower than baseline FP32")
        model = base_model.half()
    
    total_time = 0
    total_images = len(image_paths)
    total_detections = 0
    
    # Warmup
    print(f"Warming up FP16 {model_name}...")
    for img_path in image_paths[:3]:
        img = preprocess_image_fixed(img_path, to_half=True)
        if has_gpu:
            img = img.to(device)
        _ = run_inference_fixed(model, img)
    
    # Benchmark
    start_time = time.time()
    
    for _ in range(num_runs):
        for img_path in tqdm(image_paths, desc=f"FP16 {model_name}", leave=False):
            img = preprocess_image_fixed(img_path, to_half=True)
            if has_gpu:
                img = img.to(device)
            pred = run_inference_fixed(model, img)
            
            # Count detections
            for det in pred:
                total_detections += len(det)
    
    end_time = time.time()
    total_time = end_time - start_time
    
    return {
        'model_name': f"{model_name} (FP16)",
        'total_time': total_time,
        'avg_time_per_image': total_time / (total_images * num_runs),
        'fps': (total_images * num_runs) / total_time,
        'avg_detections': total_detections / (total_images * num_runs),
        'optimization': 'FP16'
    }

# 3. BATCH PROCESSING BENCHMARK
def benchmark_batch(model_name, base_model, image_paths, batch_size=4, num_runs=3):
    """Benchmark batch processing"""
    model = base_model.float()
    
    total_time = 0
    total_images = len(image_paths)
    total_detections = 0
    
    # Prepare batched data
    print(f"Preparing batches for {model_name}...")
    batches = []
    
    # Process full batches
    for i in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[i:i+batch_size]
        if len(batch_paths) == batch_size:  # Only use full batches
            # Pre-process all images to same size
            batch_imgs = torch.cat([preprocess_image_fixed(path) for path in batch_paths], dim=0)
            batches.append((batch_imgs, batch_paths))
    
    # Process individual images for remainder
    remainder_paths = image_paths[len(batches) * batch_size:]
    
    # Warmup
    print(f"Warming up batch {model_name}...")
    if batches:
        batch_imgs, _ = batches[0]
        _ = model(batch_imgs)
    
    # Benchmark
    start_time = time.time()
    
    for _ in range(num_runs):
        # Process batches
        for batch_imgs, batch_paths in tqdm(batches, desc=f"Batch {model_name}", leave=False):
            with torch.no_grad():
                # Forward pass
                outputs = model(batch_imgs)
                
                # Apply NMS
                preds = non_max_suppression(outputs)
                
                # Count detections
                for det in preds:
                    total_detections += len(det)
        
        # Process remainder individually
        for img_path in remainder_paths:
            img = preprocess_image_fixed(img_path)
            pred = run_inference_fixed(model, img)
            
            for det in pred:
                total_detections += len(det)
    
    end_time = time.time()
    total_time = end_time - start_time
    
    # Account for remainder in metrics
    processed_images = len(batches) * batch_size + len(remainder_paths)
    
    return {
        'model_name': f"{model_name} (Batch Size={batch_size})",
        'total_time': total_time,
        'avg_time_per_image': total_time / (processed_images * num_runs),
        'fps': (processed_images * num_runs) / total_time,
        'avg_detections': total_detections / (processed_images * num_runs),
        'optimization': 'Batch Processing'
    }

# 4. MULTI-THREADING BENCHMARK
def process_single_image(model, img_path):
    """Process a single image for threading"""
    try:
        img = preprocess_image_fixed(img_path)
        pred = run_inference_fixed(model, img)
        
        # Count detections
        total = 0
        for det in pred:
            total += len(det)
        
        return total
    except Exception as e:
        print(f"Error processing {img_path}: {e}")
        return 0

def benchmark_threading(model_name, base_model, image_paths, num_threads=4, num_runs=3):
    """Benchmark multi-threading"""
    # Clone model to avoid any shared state issues
    model = base_model.float()
    
    # Calculate optimal thread count if not specified
    if num_threads <= 0:
        import multiprocessing
        num_threads = min(multiprocessing.cpu_count(), 8)  # Limit to 8 max
    
    total_time = 0
    total_images = len(image_paths)
    total_detections = 0
    
    # Warmup
    print(f"Warming up threaded {model_name}...")
    process_single_image(model, image_paths[0])
    
    # Benchmark
    start_time = time.time()
    
    for run in range(num_runs):
        print(f"Running threaded inference (Run {run+1}/{num_runs})...")
        
        # Process images in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Map function to all images
            futures = {executor.submit(process_single_image, model, path): path for path in image_paths}
            
            # Process results as they complete
            completed = 0
            for future in concurrent.futures.as_completed(futures):
                detections = future.result()
                total_detections += detections
                completed += 1
                
                if completed % 10 == 0:
                    print(f"  Completed {completed}/{total_images} images")
    
    end_time = time.time()
    total_time = end_time - start_time
    
    return {
        'model_name': f"{model_name} (Threads={num_threads})",
        'total_time': total_time,
        'avg_time_per_image': total_time / (total_images * num_runs),
        'fps': (total_images * num_runs) / total_time,
        'avg_detections': total_detections / (total_images * num_runs),
        'optimization': 'Threading'
    }

# Run all benchmarks
def run_benchmarks(model_name, model, image_paths, num_runs=3, batch_size=4, num_threads=4):
    """Run all benchmarks and compare results"""
    print(f"\n===== Performance Optimization Benchmarks for {model_name} =====")
    results = []
    
    # 1. Baseline
    try:
        baseline = benchmark_baseline(model_name, model, image_paths, num_runs)
        results.append(baseline)
    except Exception as e:
        print(f"Baseline benchmark failed: {e}")
    
    # 2. FP16
    try:
        fp16 = benchmark_fp16(model_name, model, image_paths, num_runs)
        results.append(fp16)
    except Exception as e:
        print(f"FP16 benchmark failed: {e}")
    
    # 3. Batch Processing
    try:
        batch = benchmark_batch(model_name, model, image_paths, batch_size, num_runs)
        results.append(batch)
    except Exception as e:
        print(f"Batch processing benchmark failed: {e}")
    
    # 4. Threading
    try:
        threaded = benchmark_threading(model_name, model, image_paths, num_threads, num_runs)
        results.append(threaded)
    except Exception as e:
        print(f"Threading benchmark failed: {e}")
    
    # Create DataFrame and calculate speedups
    if results:
        df = pd.DataFrame(results)
        
        # Sort by optimization type
        df = df.sort_values('optimization')
        
        # Calculate speedups relative to baseline
        baseline_fps = df[df['optimization'] == 'Baseline']['fps'].values[0]
        df['speedup'] = df['fps'] / baseline_fps
        
        # Print results
        print("\n===== Optimization Results =====")
        print(df[['model_name', 'avg_time_per_image', 'fps', 'avg_detections', 'speedup']])
        
        # Plot results
        plt.figure(figsize=(12, 6))
        plt.bar(df['model_name'], df['fps'])
        plt.title('Performance Comparison (FPS)')
        plt.ylabel('Frames per Second')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'{model_name}_fps_comparison.png')
        plt.show()
        
        return df
    else:
        print("No successful benchmarks to report.")
        return None

In [12]:
# Define benchmark parameters
model_name = 'yolov5s'
model = models[model_name]
num_runs = 1  # Increase for more reliable results
batch_size = 4  # Experiment with different batch sizes
num_threads = 4  # Set to number of CPU cores for best results

# Define test images if not already defined
if 'test_images' not in locals():
    test_images = [os.path.join('test_images', f) for f in os.listdir('test_images') 
                   if f.endswith(('.jpg', '.jpeg', '.png'))]

    # Ensure at least 100 images are available
    if len(test_images) < 100:
        while len(test_images) < 100:
            test_images.append(test_images[len(test_images) % len(test_images)])
    test_images = test_images[:100]

# Use a smaller set of images for faster testing
benchmark_images = test_images[:20]

# Run all benchmarks
results = run_benchmarks(
    model_name, 
    model, 
    benchmark_images, 
    num_runs=num_runs,
    batch_size=batch_size,
    num_threads=num_threads
)

# Save results
if results is not None:
    results.to_csv(f'{model_name}_optimization_results.csv')


===== Performance Optimization Benchmarks for yolov5s =====
Warming up baseline yolov5s...


                                                                 

Warming up FP16 yolov5s...


                                                             

Preparing batches for yolov5s...
Warming up batch yolov5s...


                                                            

Warming up threaded yolov5s...
Running threaded inference (Run 1/1)...
  Completed 10/20 images
  Completed 20/20 images

===== Optimization Results =====
               model_name  avg_time_per_image       fps  avg_detections  \
0                 yolov5s            0.436522  2.290837             4.9   
2  yolov5s (Batch Size=4)            0.163529  6.115108             4.9   
1          yolov5s (FP16)           11.643311  0.085886             4.9   
3     yolov5s (Threads=4)            0.117560  8.506299             4.9   

    speedup  
0  1.000000  
2  2.669376  
1  0.037491  
3  3.713183  
