.

## VAISHNAVI R 23MSD7028

# Object Detection with SSD (Single Shot Detector)¶
### Q)Train an SSD model on the Pascal VOC dataset to detect multiple objects in an image. Compare its performance with other object detection model

The Single Shot Multibox Detector (SSD) is an efficient and fast object detection model designed to predict object classes and bounding box locations in a single pass through the network. Unlike traditional object detection models that require multiple stages, SSD performs both tasks—classification and localization—simultaneously, using a single deep neural network. The model employs convolutional layers at different scales to predict objects of various sizes, making it particularly effective for detecting objects in images with varying dimensions. SSD is known for its balance between speed and accuracy, making it suitable for real-time applications such as video processing and autonomous driving. It can be trained on large-scale datasets like Pascal VOC and MS COCO and is capable of detecting multiple objects within an image.

### In this code, a custom object detection model is trained using the SSD (Single Shot Multibox Detector) architecture with MobileNetV3 as the backbone on the Pascal VOC dataset. The goal of the task is to train and evaluate a model that detects objects within images and outputs bounding boxes around the detected objects. This approach involves using PyTorch, torchvision, and a custom dataset class to load images and their corresponding annotations. The training process uses a standard optimization routine, and the model is evaluated using the Intersection over Union (IoU) metric to determine its performance.

In [1]:
import os
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision.models.detection import ssdlite320_mobilenet_v3_large
import xml.etree.ElementTree as ET
import numpy as np
from torchvision.transforms import functional as F

# Define a custom dataset to load VOC data
class VOCDataset(Dataset):
    def __init__(self, image_folder, label_folder, transform=None):
        self.image_folder = image_folder
        self.label_folder = label_folder
        self.image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_folder, img_name)
        label_path = os.path.join(self.label_folder, img_name.replace('.jpg', '.xml'))

        # Load the image
        image = Image.open(img_path).convert("RGB")

        # Parse the annotation file (Pascal VOC XML format)
        target = self.parse_voc_annotation(label_path)

        if self.transform:
            image, target = self.transform(image, target)

        return image, target

    def parse_voc_annotation(self, xml_path):
        tree = ET.parse(xml_path)
        root = tree.getroot()

        boxes = []
        labels = []
        for obj in root.iter('object'):
            label = obj.find('name').text
            bndbox = obj.find('bndbox')
            xmin = int(bndbox.find('xmin').text)
            ymin = int(bndbox.find('ymin').text)
            xmax = int(bndbox.find('xmax').text)
            ymax = int(bndbox.find('ymax').text)
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.get_class_index(label))

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)

        target = {'boxes': boxes, 'labels': labels}
        return target

    def get_class_index(self, label):
        classes = ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
                   'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant',
                   'sheep', 'sofa', 'train', 'tvmonitor']
        return classes.index(label)

# Define transformations for input
def transform(image, target):
    image = F.to_tensor(image)
    target['boxes'] = target['boxes'] / torch.tensor([image.shape[1], image.shape[0], image.shape[1], image.shape[0]])
    return image, target

# Define paths
image_train_dir = r'C:\Users\USER\Downloads\VOCtrainval_06-Nov-2007\VOCdevkit\VOC2007\JPEGImages'
label_train_dir = r'C:\Users\USER\Downloads\VOCtrainval_06-Nov-2007\VOCdevkit\VOC2007\Annotations'

# Initialize the datasets
train_dataset = VOCDataset(image_train_dir, label_train_dir, transform=transform)

# DataLoader setup
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

# Initialize SSD model without pretrained weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ssdlite320_mobilenet_v3_large(pretrained=False).to(device)

# Setup the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9, weight_decay=1e-4)

# Training loop
num_epochs = 10  # Define the number of epochs
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    for images, targets in train_loader:
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        loss_dict = model(images, targets)

        # Total loss
        losses = sum(loss for loss in loss_dict.values())
        epoch_loss += losses.item()

        # Backward pass and optimization
        losses.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader)}")

# Save the trained model
torch.save(model.state_dict(), "ssd_model_trained_from_scratch.pth")
print("Model trained from scratch and saved as 'ssd_model_trained_from_scratch.pth'")




Epoch [1/10], Loss: 14.860403864767752
Epoch [2/10], Loss: 8.635255828808749
Epoch [3/10], Loss: 8.09644184918685
Epoch [4/10], Loss: 7.7735957589826326
Epoch [5/10], Loss: 7.45649728592503
Epoch [6/10], Loss: 7.172491946669096
Epoch [7/10], Loss: 6.990846730495374
Epoch [8/10], Loss: 6.7214201523356465
Epoch [9/10], Loss: 6.463100740593966
Epoch [10/10], Loss: 6.229909229886969
Model trained from scratch and saved as 'ssd_model_trained_from_scratch.pth'


In [3]:
image_test_dir = r'C:\Users\USER\Downloads\VOCtest_06-Nov-2007\VOCdevkit\VOC2007\JPEGImages'
label_test_dir = r'C:\Users\USER\Downloads\VOCtest_06-Nov-2007\VOCdevkit\VOC2007\Annotations'

# Initialize the test dataset
test_dataset = VOCDataset(image_test_dir, label_test_dir, transform=transform)

# DataLoader setup for test
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))


In [4]:
model = ssdlite320_mobilenet_v3_large(pretrained=False).to(device)
model.load_state_dict(torch.load("ssd_model_trained_from_scratch.pth"))
model.eval()  # Set the model to evaluation mode


  model.load_state_dict(torch.load("ssd_model_trained_from_scratch.pth"))


SSD(
  (backbone): SSDLiteFeatureExtractorMobileNet(
    (features): Sequential(
      (0): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (2): Hardswish()
        )
        (1): InvertedResidual(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
              (2): ReLU(inplace=True)
            )
            (1): Conv2dNormActivation(
              (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            )
          )
        )
        (2): Invert

.

In [6]:
from torchvision.ops import box_iou
import numpy as np

# Function to evaluate the model
def evaluate_model(model, data_loader):
    model.eval()
    iou_scores = []
    with torch.no_grad():
        for images, targets in data_loader:
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Get predictions
            predictions = model(images)

            # Calculate IoU for each image in the batch
            for i, prediction in enumerate(predictions):
                pred_boxes = prediction['boxes']
                true_boxes = targets[i]['boxes']

                if len(pred_boxes) > 0 and len(true_boxes) > 0:
                    # Calculate IoU between predicted and true boxes
                    iou = box_iou(pred_boxes, true_boxes)

                    # Take the maximum IoU for each predicted box (best matching true box)
                    max_iou_per_pred_box = iou.max(dim=1).values
                    iou_scores.extend(max_iou_per_pred_box.cpu().numpy())

    # Compute mean IoU across all images and boxes
    return np.mean(iou_scores) if iou_scores else 0.0

# Evaluate the model
mean_iou = evaluate_model(model, test_loader)
print(f"Mean IoU on the test dataset: {mean_iou}")


Mean IoU on the test dataset: 0.037109922617673874


.

## CONCLUSION

### Dataset and DataLoader:
The VOCDataset class loads images and annotations from the Pascal VOC dataset.
Annotations in XML format are parsed to extract bounding boxes and object labels, which are then used to train the model.
The dataset is loaded in batches using DataLoader, which shuffles the data and prepares batches for training.
### Transformations:
Images are converted to tensors using F.to_tensor(), and bounding box coordinates are normalized by dividing by image width and height.
### Model Training:
The SSD model (ssdlite320_mobilenet_v3_large) is used for object detection, which has been initialized without pre-trained weights.
The model is trained for 10 epochs, and the loss is calculated for each batch. The loss is the sum of all losses (e.g., classification loss, localization loss).
The optimizer (SGD) updates the model's weights based on the loss.
### Model Evaluation:
After training, the model is evaluated using the test dataset.
The evaluation function computes IoU by comparing the predicted bounding boxes with the ground truth.
The IoU values for each image are averaged to compute the Mean IoU.

This code demonstrates how to train an SSD object detection model from scratch using the Pascal VOC dataset. By defining a custom dataset class and setting up an appropriate training loop with the SSD model, the code efficiently handles object detection tasks. After training, the model is evaluated using the mean IoU metric, which provides an indication of the accuracy of the predicted bounding boxes.

The evaluation result shows that the model's Mean Intersection over Union (IoU) on the test dataset is 0.0371. This is quite low, indicating that the model's predictions for bounding box overlap with the ground truth boxes are not very accurate.

## Compare SSD  performance with other object detection model

### SSD (Single Shot Multibox Detector)
 Speed SSD is known for its fast inference speed, capable of processing up to 50-60 frames per second (FPS) depending on the hardware and model configuration
 Accuracy:SSD typically achieves mean Average Precision (mAP) of around 20-30% on the VOC dataset and 25-35% on the COCO dataset. While it's efficient, it is less accurate than some of the other models, particularly when it comes to detecting small objects.
 Strengths:
Excellent for real-time applications due to its speed. Suitable for applications where inference speed is more critical than absolute accuracy, such as in video processing.
 Weaknesses:
Lower accuracy, especially for small objects, compared to models like Faster R-CNN or RetinaNet.

### Faster R-CNN
Speed:
Faster R-CNN is slower than SSD, typically achieving around 10-15 FPS due to the Region Proposal Network (RPN) used for object proposals.
 Accuracy:
Faster R-CNN typically achieves mAP of 30-40% on VOC and 35-45% on COCO. This model tends to have higher accuracy, especially for small objects, as it uses a two-stage process to refine the proposed regions.
 Strengths:
High accuracy, particularly for small objects, and good general object detection performance.
Weaknesses:
Slower inference time makes it less suitable for real-time applications.

###  YOLO (You Only Look Once)
 Speed:
YOLO is known for its extreme speed, often achieving 50-100 FPS depending on the version (YOLOv3, YOLOv4, etc.) and hardware.
Accuracy:
YOLO typically achieves an mAP of 30-40% on VOC and 35-45% on COCO. While its accuracy is good, especially for medium to large objects, it doesn't perform as well on small objects compared to Faster R-CNN or RetinaNet.
Strengths:
Extremely fast, making it ideal for real-time applications such as autonomous driving, robotics, and live video processing.
 Weaknesses: 
Struggles with detecting small objects, which is a disadvantage in some use cases.

###  RetinaNet
 Speed: 
RetinaNet is slower than SSD and YOLO, with inference speeds around 20-30 FPS, due to the feature pyramid network (FPN) used to detect objects at different scales.
Accuracy:
RetinaNet typically achieves mAP of 35-40% on VOC and 40-45% on COCO. It outperforms SSD in terms of accuracy, especially for small objects, due to its focal loss function that helps address the class imbalance issue.
 Strengths:
Higher accuracy, particularly for small objects. It performs better than SSD in terms of handling difficult cases, including objects of various scales.
 Weaknesses:
Slower inference speed compared to SSD and YOLO, making it less suitable for applications requiring real-time performance.

### Summary Comparison:
Speed: 
SSD and YOLO are the fastest models, making them suitable for real-time applications. YOLO is typically faster, but SSD offers a good trade-off between speed and accuracy.
 Accuracy:
Faster R-CNN and RetinaNet tend to outperform SSD in terms of accuracy, especially for small objects. RetinaNet stands out with its ability to handle small objects effectively due to its focal loss, while Faster R-CNN is very accurate overall but slower.
 Real-time Application: 
For tasks that require real-time object detection, SSD and YOLO are preferred due to their higher FPS. Faster R-CNN and RetinaNet are more accurate but not as efficient for real-time use cases.

#### SSD is a great model when real-time performance is the priority, but it may not be as accurate as Faster R-CNN, YOLO, or RetinaNet, especially for small object detection. If higher accuracy and better handling of small objects are required, Faster R-CNN and RetinaNet would be more appropriate, with RetinaNet being particularly useful for cases where small object detection is critical. YOLO balances both speed and accuracy but might not be the best at handling small objects.

## TensorFlow's SSD (Single Shot Multibox Detector) model
 
 In this code, I have used TensorFlow's SSD (Single Shot Multibox Detector) model, which is a fast and efficient object detection model, to detect objects within images. The SSD model has been pre-trained on a large dataset, specifically Open Images V4, and is available via TensorFlow Hub. By loading the SSD model, it can detect objects in any given image, draw bounding boxes around the detected objects, and display the results. This approach makes it suitable for various applications such as image classification, object tracking, and even real-time video analysis

In [2]:
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

In [3]:
from PIL import Image, ImageDraw, ImageColor, ImageOps, ImageFont
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np

# Load the SSD model from TensorFlow Hubr
module_handle = "https://tfhub.dev/google/openimages_v4/ssd/mobilenet_v2/1"
detector = hub.load(module_handle).signatures['default']

# Function to resize an image from a local path
def load_and_resize_image(path, new_width=369, new_height=215, display=False):
    pil_image = Image.open(path)
    pil_image = ImageOps.fit(pil_image, (new_width, new_height), Image.ANTIALIAS)
    pil_image_rgb = pil_image.convert("RGB")
    if display:
        display_image(pil_image_rgb)
    return np.array(pil_image_rgb)

# Function to display the image
def display_image(image):
    plt.figure(figsize=(20, 15))
    plt.grid(False)
    plt.imshow(image)
    plt.show()

# Function to draw bounding boxes on the image
def draw_bounding_box_on_image(image, ymin, xmin, ymax, xmax, color, font, thickness=4, display_str_list=()):
    draw = ImageDraw.Draw(image)
    im_width, im_height = image.size
    (left, right, top, bottom) = (xmin * im_width, xmax * im_width, ymin * im_height, ymax * im_height)
    draw.line([(left, top), (left, bottom), (right, bottom), (right, top), (left, top)], width=thickness, fill=color)

    for display_str in display_str_list:
        text_width, text_height = font.getsize(display_str)
        margin = np.ceil(0.05 * text_height)
        draw.rectangle([(left, top - text_height - 2 * margin), (left + text_width, top)], fill=color)
        draw.text((left + margin, top - text_height - margin), display_str, fill="black", font=font)

def draw_boxes(image, boxes, class_names, max_boxes=10, min_score=0.1):
    colors = list(ImageColor.colormap.values())
    font = ImageFont.load_default()
    for i in range(min(boxes.shape[0], max_boxes)):
        ymin, xmin, ymax, xmax = tuple(boxes[i])
        display_str = class_names[i].decode("ascii")
        color = colors[hash(class_names[i]) % len(colors)]
        image_pil = Image.fromarray(np.uint8(image)).convert("RGB")
        draw_bounding_box_on_image(image_pil, ymin, xmin, ymax, xmax, color, font, display_str_list=[display_str])
        np.copyto(image, np.array(image_pil))
    return image

# SSD function to perform detection on the resized image
def ssd(detector, path):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    converted_img = tf.image.convert_image_dtype(img, tf.float32)[tf.newaxis, ...]
    result = detector(converted_img)
    result = {key: value.numpy() for key, value in result.items()}
    result_image = draw_boxes(img.numpy(), result["detection_boxes"], result["detection_class_entities"])
    display_image(result_image)

# Specify the path to the image in Pascal VOC 2007
image_path = r'C:\Users\USER\Downloads\VOCtrainval_06-Nov-2007\VOCdevkit\VOC2007\JPEGImages\000030.jpg'
ssd(detector, image_path)




ModuleNotFoundError: No module named 'matplotlib.artist'