# YOLO Object Detection Algorithm - Step-by-Step Visualization

This notebook demonstrates each step of the YOLO (You Only Look Once) object detection algorithm. We'll use a pre-trained YOLOv5 model to visualize:

1. **Grid Structure** - How YOLO divides images into detection grids
2. **Anchor Boxes** - Predefined box shapes used for detection
3. **Raw Predictions** - All initial model outputs before filtering
4. **Class Predictions** - What the model thinks each box contains
5. **Filtering** - Confidence thresholding and Non-Maximum Suppression
6. **Final Results** - Clean, labeled bounding boxes


## 1. Setup & Model Loading

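Run the following cell first. It selects matplotlib's non-interactive `Agg` backend, so figures are saved as PNG files under `yolo_figures/` instead of being displayed inline: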


In [None]:
# Notebook-wide plotting setup: figures are written to disk, not shown
import os

import matplotlib

matplotlib.use('Agg')  # non-interactive backend

# Directory that receives every saved figure
os.makedirs('yolo_figures', exist_ok=True)

# Running index that showfig() increments to number its output files
_fig_num = 0

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torchvision.ops import nms
from scipy.ndimage import zoom

# Helper function to save figures instead of showing
def showfig(title="figure"):
    """Save the current matplotlib figure as a numbered PNG and close it.

    The file is written to ``yolo_figures/fig_<NN>_<title>.png``, where
    ``<NN>`` is the module-level counter ``_fig_num`` (incremented on every
    call) and spaces in ``title`` are replaced by underscores.
    """
    global _fig_num
    _fig_num += 1

    slug = title.replace(' ', '_')
    filename = f"yolo_figures/fig_{_fig_num:02d}_{slug}.png"

    plt.savefig(filename, dpi=150, bbox_inches='tight')
    print(f"Saved: {filename}")
    plt.close()

# Set device (use MPS if available on Mac, then CUDA, otherwise CPU).
# `torch.backends.mps` is looked up defensively: PyTorch builds that do not
# ship this attribute would otherwise raise AttributeError here instead of
# falling through to CUDA/CPU.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
    print("Using device: MPS (Apple Silicon GPU)")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using device: CUDA")
else:
    device = torch.device('cpu')
    print("Using device: CPU")


In [None]:
# Fetch the pretrained YOLOv5s model through PyTorch Hub, move it to the
# selected device and switch it to evaluation mode.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
model = model.to(device)
model.eval()

# Total number of scalar parameters across all model tensors
param_count = sum(p.numel() for p in model.parameters())

print("Model loaded successfully!")
print(f"Number of parameters: {param_count:,}")


In [None]:
# Load test image and resize to exactly 640x640 to match YOLO's expected input.
# The location can be overridden with the YOLO_IMAGE_PATH environment variable
# so the notebook is not tied to a single machine; the previous hard-coded
# path remains the default.
image_path = os.environ.get('YOLO_IMAGE_PATH', '/Users/alex/Downloads/food.webp')

# Open inside a context manager so the file handle is released right after
# the pixels have been converted to RGB.
with Image.open(image_path) as source_image:
    image = source_image.convert('RGB')

# Resize to exactly 640x640 to avoid scaling issues with YOLO predictions.
# Note: this is a plain resize, so non-square images are stretched.
image = image.resize((640, 640), Image.LANCZOS)
image_np = np.array(image)

# Display original image
plt.figure(figsize=(12, 8))
plt.imshow(image_np)
plt.title('Test Image (640x640)')
plt.axis('off')
showfig("test_image")

print(f"Image size: {image.size[0]}x{image.size[1]}")


## 2. Grid Visualization

YOLO divides the image into a grid and makes predictions for each cell. YOLOv5 uses three different grid scales:
- **80x80 grid** - Detects small objects (uses smaller cell size)
- **40x40 grid** - Detects medium objects
- **20x20 grid** - Detects large objects (uses larger cell size)


In [None]:
# The loaded image was already resized to 640x640, so it is used unchanged
# (this is the same array object as image_np, not a copy).
preprocessed_img = image_np

pre_height, pre_width = preprocessed_img.shape[:2]
print(f"Using preprocessed image: {pre_width}x{pre_height}")

def visualize_grid(image_np, grid_size, title):
    """Overlay a grid_size x grid_size detection grid on the image.

    Args:
        image_np: Image array whose first two dimensions are (height, width).
        grid_size: Number of cells along each side of the image.
        title: First line of the figure title.

    The figure is saved through showfig() as "grid_<grid_size>".
    """
    height, width = image_np.shape[:2]
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    ax.imshow(image_np)

    # Spacing between neighbouring grid lines along each axis
    step_x = width / grid_size
    step_y = height / grid_size
    line_style = dict(color='red', linewidth=0.5, alpha=0.5)

    # grid_size cells need grid_size + 1 boundary lines per direction
    for k in range(grid_size + 1):
        ax.axvline(step_x * k, **line_style)
        ax.axhline(step_y * k, **line_style)

    ax.set_title(f'{title}\nGrid Size: {grid_size}x{grid_size} cells')
    ax.axis('off')
    plt.tight_layout()
    showfig(f"grid_{grid_size}")

# Render the grid overlay once per detection scale
for scale_cells, scale_title in (
    (80, 'Small Objects Scale'),
    (40, 'Medium Objects Scale'),
    (20, 'Large Objects Scale'),
):
    visualize_grid(preprocessed_img, scale_cells, scale_title)


## 3. Anchor Boxes

Each grid cell predicts bounding boxes using **anchor boxes** - predefined box shapes. YOLOv5 uses 3 anchor boxes per scale, optimized for different object shapes. This allows the model to detect objects of various aspect ratios within the same grid cell.


In [None]:
# Anchor shapes per detection head, keyed by grid resolution.
# Every entry is a (width, height) pair; each head has three anchors.
anchors = {
    # Small objects scale
    '80x80': [
        (10, 13),
        (16, 30),
        (33, 23),
    ],
    # Medium objects scale
    '40x40': [
        (30, 61),
        (62, 45),
        (59, 119),
    ],
    # Large objects scale
    '20x20': [
        (116, 90),
        (156, 198),
        (373, 326),
    ],
}

def visualize_anchors(image_np, grid_size, anchors_list, title):
    """Draw the anchor boxes of one detection scale on a central grid cell.

    Args:
        image_np: Image array whose first two dimensions are (height, width).
        grid_size: Number of grid cells along each side of the image.
        anchors_list: Sequence of (width, height) anchor sizes. They are
            interpreted as pixel sizes relative to a 640-pixel reference
            input and rescaled to the actual image size.
        title: First line of the figure title.

    The sample cell is outlined in red and every anchor is drawn centred on
    it. The figure is saved through showfig() as "anchors_<grid_size>".
    """
    h, w = image_np.shape[:2]
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    ax.imshow(image_np)

    # Size of one grid cell in image pixels
    cell_w = w / grid_size
    cell_h = h / grid_size

    # Pick a grid cell near the center to visualize
    cell_x, cell_y = grid_size // 2, grid_size // 2
    cell_center_x = (cell_x + 0.5) * cell_w
    cell_center_y = (cell_y + 0.5) * cell_h

    # Draw the grid cell boundary
    cell_outline = plt.Rectangle((cell_x * cell_w, cell_y * cell_h), cell_w, cell_h,
                                 linewidth=3, edgecolor='red', facecolor='none')
    ax.add_patch(cell_outline)

    # Draw each anchor box centered on the cell. The palette is cycled so
    # that more than three anchors do not raise an IndexError.
    palette = ['blue', 'green', 'yellow']
    for i, (anchor_w, anchor_h) in enumerate(anchors_list):
        color = palette[i % len(palette)]

        # Anchors are given in pixels of a 640-pixel reference input, so
        # rescale them to the size of the displayed image.
        box_w = (anchor_w / 640.0) * w
        box_h = (anchor_h / 640.0) * h

        # Top-left corner that centers the box on the cell
        box_x = cell_center_x - box_w / 2
        box_y = cell_center_y - box_h / 2

        anchor_patch = plt.Rectangle((box_x, box_y), box_w, box_h,
                                     linewidth=2, edgecolor=color,
                                     facecolor=color, alpha=0.3, label=f'Anchor {i+1}')
        ax.add_patch(anchor_patch)

    # Report the real number of anchors instead of a hard-coded "3"
    ax.set_title(f'{title}\n{len(anchors_list)} anchor boxes per grid cell',
                 fontsize=14, fontweight='bold')
    ax.legend(loc='upper right')
    ax.axis('off')
    plt.tight_layout()
    showfig(f"anchors_{grid_size}")

# Render the anchor overlay once per detection scale
for scale_cells, size_label in ((80, 'Small'), (40, 'Medium'), (20, 'Large')):
    scale_key = f'{scale_cells}x{scale_cells}'
    visualize_anchors(
        preprocessed_img,
        scale_cells,
        anchors[scale_key],
        f'{size_label} Objects ({scale_key} grid)',
    )


## 4. Raw Predictions (Before Filtering)

The model produces thousands of raw predictions - one for each anchor box in every grid cell. Here we lower the confidence threshold to 0.01 and effectively disable NMS, so we see every prediction above that minimal threshold rather than only the final detections. Notice how densely packed and noisy these predictions are!


In [None]:
from matplotlib.patches import Patch

# Configure model to return as many predictions as possible
model.conf = 0.01  # Very low confidence threshold
model.iou = 1.0    # Disable NMS (set IoU threshold to 1.0)
# NOTE(review): the hub wrapper may additionally cap the number of returned
# boxes (a `max_det` setting) — confirm if the exact count matters.

# Run inference on the SAME 640x640 array that is drawn below. Passing
# image_path here would make the model read the original, un-resized file and
# return boxes in that file's coordinate system, which do not line up with the
# stretched 640x640 image used for plotting.
results = model(preprocessed_img)
raw_predictions = results.xyxy[0].cpu().numpy()

print(f"Total raw predictions: {len(raw_predictions):,}")

# Get class names for color-coding
class_names = model.names

# Visualize all raw predictions
plt.figure(figsize=(14, 14))
ax = plt.gca()
ax.imshow(preprocessed_img)

# Color map for classes
colors = plt.cm.tab20(np.linspace(0, 1, len(class_names)))

# Track which classes are used for legend
used_classes = set()

# Draw all predicted boxes; each row is (x1, y1, x2, y2, confidence, class)
for pred in raw_predictions:
    x1, y1, x2, y2, conf, cls = pred
    cls_id = int(cls)
    w, h = x2 - x1, y2 - y1

    # Color by predicted class
    color = colors[cls_id % len(colors)]

    # Track this class for legend
    used_classes.add(cls_id)

    # Line thickness follows the confidence column of the results
    # (scaled from 0.5 to 3 for confidences in 0-1)
    linewidth = 0.5 + conf * 2.5

    rect = plt.Rectangle((x1, y1), w, h, linewidth=linewidth,
                         edgecolor=color, facecolor='none', alpha=0.6)
    ax.add_patch(rect)

ax.set_title('All Raw Predictions\n'
             'Color: predicted class | Line thickness: confidence',
             fontsize=14, fontweight='bold')

# Add legend for visible classes
legend_elements = [
    Patch(facecolor=colors[cls_id % len(colors)], edgecolor='black',
          label=class_names.get(cls_id, f"class_{cls_id}"))
    for cls_id in sorted(used_classes)
]

ax.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.15, 1),
          fontsize=9, framealpha=0.9)

ax.axis('off')
plt.tight_layout()
plt.savefig('yolo_figures/raw_predictions.png')
plt.close()  # release the figure, as showfig() does for the other plots


In [None]:
# Get class names
class_names = model.names

# Confidence column of the raw predictions
confidence_scores = raw_predictions[:, 4]

# Histogram of the confidence scores with the default threshold marked
plt.figure(figsize=(12, 5))
plt.hist(confidence_scores, bins=50, edgecolor='black')
plt.xlabel('Confidence Score')
plt.ylabel('Number of Predictions')
plt.title('Distribution of Confidence Scores\n(Before Filtering)')
plt.axvline(0.25, color='red', linestyle='--', label='Default threshold (0.25)')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('yolo_figures/confidence_cutoff.png')
plt.close()  # release the figure, as showfig() does for the other plots

print("Statistics:")
print(f"  Total predictions: {len(confidence_scores):,}")
if len(confidence_scores) > 0:
    print(f"  Mean confidence: {confidence_scores.mean():.4f}")
    print(f"  Max confidence: {confidence_scores.max():.4f}")
else:
    # .max() raises on an empty array, so report the absence explicitly
    print("  No predictions available for mean/max statistics")


## 5. Class Predictions

For each predicted box, YOLO outputs:
1. **Objectness score**: Confidence that an object exists (regardless of class)
2. **Class probabilities**: A probability distribution over all 80 COCO classes
3. **Final confidence**: objectness × max(class_probability)

Let's examine the top predictions and the class the model assigned to each of them.


In [None]:
# Sort predictions by confidence (descending) and look at top 5
sorted_preds = raw_predictions[np.argsort(raw_predictions[:, 4])[::-1]]
top_preds = sorted_preds[:5]

print("Top 5 Predictions (by confidence):")
print("=" * 80)
for rank, pred in enumerate(top_preds, start=1):
    x1, y1, x2, y2, conf, cls_id = pred
    cls_id = int(cls_id)
    class_name = class_names.get(cls_id, f"class_{cls_id}")
    print(f"\n{rank}. {class_name}")
    print(f"   Confidence: {conf:.4f}")
    print(f"   Bounding box: ({x1:.0f}, {y1:.0f}) to ({x2:.0f}, {y2:.0f})")

# Visualize top predictions with labels
plt.figure(figsize=(14, 14))
ax = plt.gca()
ax.imshow(preprocessed_img)

# One distinct color per ranked prediction. A separate name is used so the
# per-class `colors` palette of the other cells is not overwritten.
top5_colors = plt.cm.tab10(np.linspace(0, 1, 5))
for rank, (pred, color) in enumerate(zip(top_preds, top5_colors), start=1):
    x1, y1, x2, y2, conf, cls_id = pred
    cls_id = int(cls_id)
    class_name = class_names.get(cls_id, f"class_{cls_id}")
    w, h = x2 - x1, y2 - y1

    rect = plt.Rectangle((x1, y1), w, h, linewidth=3,
                         edgecolor=color, facecolor='none', alpha=0.8)
    ax.add_patch(rect)

    # Add label just above the box's top-left corner
    label = f'{rank}. {class_name} ({conf:.3f})'
    ax.text(x1, y1-5, label,
            bbox=dict(boxstyle='round,pad=0.3', facecolor=color, alpha=0.7),
            fontsize=10, fontweight='bold', color='white')

ax.set_title('Top 5 Predictions by Confidence Score', fontsize=16, fontweight='bold')
ax.axis('off')
plt.tight_layout()
# The notebook runs on the non-interactive Agg backend, where plt.show()
# neither displays nor stores anything; save the figure like the others.
showfig("top5_predictions")


## 6a. Confidence Filtering

The first filtering step removes predictions with low confidence. We apply a threshold (typically 0.25) to keep only confident detections.


In [None]:
# Apply confidence thresholding: keep rows whose confidence column (index 4)
# reaches the threshold
confidence_threshold = 0.25
filtered_by_confidence = raw_predictions[raw_predictions[:, 4] >= confidence_threshold]

n_raw = len(raw_predictions)
n_confident = len(filtered_by_confidence)
# Guard the percentage against an empty prediction set (division by zero)
pct_removed = 100 * (1 - n_confident / n_raw) if n_raw else 0.0

print(f"Predictions before confidence filtering: {n_raw:,}")
print(f"Predictions after confidence filtering (threshold={confidence_threshold}): {n_confident:,}")
print(f"Reduction: {n_raw - n_confident:,} predictions removed "
      f"({pct_removed:.1f}%)")

# Visualize comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Before filtering: draw at most the first 100 rows to keep the plot readable
max_shown = 100
n_shown = min(max_shown, n_raw)
ax1.imshow(preprocessed_img)
for pred in raw_predictions[:max_shown]:
    x1, y1, x2, y2 = pred[:4]
    w, h = x2 - x1, y2 - y1
    rect = plt.Rectangle((x1, y1), w, h, linewidth=1.5,
                         edgecolor='cyan', facecolor='none', alpha=0.6)
    ax1.add_patch(rect)
# The caption states how many boxes were actually drawn
ax1.set_title(f'Before Confidence Filtering\n{n_raw:,} predictions (showing {n_shown})',
              fontsize=14, fontweight='bold')
ax1.axis('off')

# After filtering
ax2.imshow(preprocessed_img)
for pred in filtered_by_confidence:
    x1, y1, x2, y2 = pred[:4]
    w, h = x2 - x1, y2 - y1
    rect = plt.Rectangle((x1, y1), w, h, linewidth=2,
                         edgecolor='yellow', facecolor='none', alpha=0.7)
    ax2.add_patch(rect)
ax2.set_title(f'After Confidence Filtering (threshold={confidence_threshold})\n{n_confident:,} predictions',
              fontsize=14, fontweight='bold')
ax2.axis('off')

plt.tight_layout()
plt.savefig('yolo_figures/confident_boxes.png')
plt.close()  # release the figure, as showfig() does for the other plots


## 6b. Non-Maximum Suppression (NMS)

NMS removes duplicate detections of the same object by:
1. Sorting boxes by confidence
2. Keeping the highest-confidence box
3. Removing other boxes that overlap significantly (high IoU) with it
4. Repeating for remaining boxes

In [None]:
# Apply NMS to each class separately
nms_iou_threshold = 0.45
kept_per_class = []

for cls in np.unique(filtered_by_confidence[:, 5]):
    # Get predictions for this class (never empty: cls comes from the data)
    cls_predictions = filtered_by_confidence[filtered_by_confidence[:, 5] == cls]

    # Convert to tensors for NMS
    boxes = torch.from_numpy(cls_predictions[:, :4]).float()
    scores = torch.from_numpy(cls_predictions[:, 4]).float()

    # Indices of the boxes that survive suppression
    keep_indices = nms(boxes, scores, iou_threshold=nms_iou_threshold).cpu().numpy()

    # Indexing with an index array always yields a 2D (n_kept, 6) array
    kept_per_class.append(cls_predictions[keep_indices])

# Concatenate all class predictions; keep an empty (0, 6) array when nothing
# passed the confidence threshold
if kept_per_class:
    final_predictions = np.concatenate(kept_per_class, axis=0)
else:
    final_predictions = np.array([]).reshape(0, 6)

n_before_nms = len(filtered_by_confidence)
n_after_nms = len(final_predictions)
# Guard the percentage: no confident predictions would divide by zero
pct_duplicates = 100 * (1 - n_after_nms / n_before_nms) if n_before_nms else 0.0

print(f"Predictions before NMS: {n_before_nms}")
print(f"Predictions after NMS: {n_after_nms}")
print(f"Reduction: {n_before_nms - n_after_nms} duplicates removed "
      f"({pct_duplicates:.1f}%)")

# Visualize comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Before NMS: draw at most the first 50 rows to keep the plot readable
max_shown = 50
n_shown = min(max_shown, n_before_nms)
ax1.imshow(preprocessed_img)
for pred in filtered_by_confidence[:max_shown]:
    x1, y1, x2, y2 = pred[:4]
    w, h = x2 - x1, y2 - y1
    rect = plt.Rectangle((x1, y1), w, h, linewidth=2,
                         edgecolor='yellow', facecolor='none', alpha=0.7)
    ax1.add_patch(rect)
# The caption states how many boxes were actually drawn
ax1.set_title(f'Before NMS\n{n_before_nms} predictions (showing {n_shown})',
              fontsize=14, fontweight='bold')
ax1.axis('off')

# After NMS
ax2.imshow(preprocessed_img)
for pred in final_predictions:
    x1, y1, x2, y2 = pred[:4]
    w, h = x2 - x1, y2 - y1
    rect = plt.Rectangle((x1, y1), w, h, linewidth=2,
                         edgecolor='green', facecolor='none', alpha=0.7)
    ax2.add_patch(rect)
ax2.set_title(f'After NMS\n{n_after_nms} final predictions',
              fontsize=14, fontweight='bold')
ax2.axis('off')

plt.tight_layout()
plt.savefig('yolo_figures/nms_fig.png')
plt.close()  # release the figure, as showfig() does for the other plots


## 7. Final Results

Here are the final, cleaned detections with class labels and confidence scores:


In [None]:
# Visualize final results with labels
plt.figure(figsize=(14, 14))
ax = plt.gca()
ax.imshow(preprocessed_img)

# Color-code by class
colors = plt.cm.tab20(np.linspace(0, 1, len(class_names)))

# Each row is (x1, y1, x2, y2, confidence, class)
for pred in final_predictions:
    x1, y1, x2, y2, conf, cls_id = pred
    cls_id = int(cls_id)
    class_name = class_names.get(cls_id, f"class_{cls_id}")
    w, h = x2 - x1, y2 - y1
    color = colors[cls_id % len(colors)]

    # Draw bounding box
    rect = plt.Rectangle((x1, y1), w, h, linewidth=3,
                         edgecolor=color, facecolor='none', alpha=0.8)
    ax.add_patch(rect)

    # Draw label with confidence just above the box's top-left corner
    label = f'{class_name}: {conf:.2f}'
    ax.text(x1, y1-5, label,
            bbox=dict(boxstyle='round,pad=0.3', facecolor=color, alpha=0.6),
            fontsize=11, fontweight='bold', color='white')

ax.set_title(f'Final Detection Results\n{len(final_predictions)} objects detected',
             fontsize=16, fontweight='bold')
ax.axis('off')
plt.tight_layout()
plt.savefig('yolo_figures/final_predictions.png')
plt.close()  # release the figure, as showfig() does for the other plots


In [None]:
# Summary statistics
def _percent_removed(before, after):
    """Return the share of `before` that was dropped, in percent.

    Returns 0.0 when `before` is 0 so that an empty pipeline stage does not
    raise ZeroDivisionError.
    """
    return 100 * (1 - after / before) if before else 0.0


# `image` was resized to 640x640 when it was loaded, so its size no longer
# reflects the source file. Read the true dimensions from the file instead.
with Image.open(image_path) as source_image:
    original_width, original_height = source_image.size

n_raw = len(raw_predictions)
n_confident = len(filtered_by_confidence)
n_final = len(final_predictions)

print("=" * 80)
print("YOLO DETECTION SUMMARY")
print("=" * 80)
print(f"\nImage: {image_path}")
print(f"Original size: {original_width}x{original_height}")
print(f"Preprocessed size: {image.size[0]}x{image.size[1]}")

print("\nDetection Pipeline:")
print(f"  1. Raw predictions: {n_raw:,}")
print(f"  2. After confidence filtering (threshold={confidence_threshold}): {n_confident:,}")
print(f"  3. After Non-Maximum Suppression (IoU=0.45): {n_final:,}")

print("\nReduction Statistics:")
print(f"  Confidence filtering removed: {n_raw - n_confident:,} "
      f"predictions ({_percent_removed(n_raw, n_confident):.1f}%)")
print(f"  NMS removed: {n_confident - n_final:,} "
      f"duplicates ({_percent_removed(n_confident, n_final):.1f}%)")
print(f"  Total reduction: {n_raw - n_final:,} "
      f"predictions ({_percent_removed(n_raw, n_final):.1f}%)")

if n_final > 0:
    print("\nDetected Objects:")
    # Each row is (x1, y1, x2, y2, confidence, class)
    for i, pred in enumerate(final_predictions):
        x1, y1, x2, y2, conf, cls_id = pred
        cls_id = int(cls_id)
        class_name = class_names.get(cls_id, f"class_{cls_id}")
        print(f"  {i+1}. {class_name}: confidence={conf:.3f}, "
              f"bbox=({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")

print("\nKey Takeaways:")
print("- YOLO processes the entire image in one forward pass (single-stage detection)")
print("- Each grid cell predicts multiple boxes using anchor boxes")
print("- Confidence filtering removes low-quality predictions")
print("- NMS eliminates duplicate detections of the same object")
print("- Final results are clean, labeled bounding boxes")
print("=" * 80)
