Dataset: https://www.kaggle.com/datasets/andrewmvd/road-sign-detection

# **Prerequisites**

In [None]:
# Mounting Google Drive, where the dataset is expected to have been downloaded and transferred beforehand
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Scripts containing some prerequisite functions
! wget "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py"
! wget "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py"
! wget "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py"
! wget "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py"
! wget "https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py"

In [None]:
# prerequisite libraries
!pip install torch torchvision albumentations
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image


In [None]:
# Converting XML annotation files to txt files (suitable for YOLO)
# The output folder "ann_YOLO" inside "Traffic_Sign_2" is created below if it does not exist yet.

import os
import xml.etree.ElementTree as ET

# Mapping categories to class IDs; the ID is written as the first field of every YOLO label line,
# and objects whose name is not listed here are skipped by convert_annotation.
categories = {'trafficlight': 0, 'stop': 1, 'speedlimit': 2, 'crosswalk': 3}

def convert(size, box):
    """Turn a pixel-space box into normalised YOLO coordinates.

    Args:
        size: ``(image_width, image_height)`` in pixels.
        box: ``(xmin, xmax, ymin, ymax)`` in pixels -- the x pair comes first.

    Returns:
        Tuple ``(x_center, y_center, width, height)``, each multiplied by the
        reciprocal of the matching image dimension.
    """
    x_lo, x_hi, y_lo, y_hi = box[0], box[1], box[2], box[3]
    inv_w = 1. / size[0]
    inv_h = 1. / size[1]
    centre_x = (x_lo + x_hi) / 2.0
    centre_y = (y_lo + y_hi) / 2.0
    span_x = x_hi - x_lo
    span_y = y_hi - y_lo
    return centre_x * inv_w, centre_y * inv_h, span_x * inv_w, span_y * inv_h

def convert_annotation(xml_file, output_dir):
    """Write the YOLO-format label file for one XML annotation.

    The image size is read from the ``<size>`` element, every ``<object>``
    whose ``<name>`` appears in ``categories`` is converted with ``convert``,
    and one ``"<class_id> <x> <y> <w> <h>"`` line per object is written to
    ``output_dir`` under the XML's base name with ``.xml`` replaced by ``.txt``.
    Objects with an unlisted name are skipped.

    Args:
        xml_file: Path of the XML annotation to read.
        output_dir: Directory that receives the ``.txt`` file.
    """
    root = ET.parse(xml_file).getroot()
    size_node = root.find('size')
    img_dims = (int(size_node.find('width').text), int(size_node.find('height').text))

    txt_name = os.path.basename(xml_file).replace('.xml', '.txt')
    with open(os.path.join(output_dir, txt_name), 'w') as out_file:
        for obj in root.iter('object'):
            cls = obj.find('name').text
            if cls in categories:
                box_node = obj.find('bndbox')
                # convert() expects the x pair first, then the y pair
                corners = tuple(
                    float(box_node.find(tag).text)
                    for tag in ('xmin', 'xmax', 'ymin', 'ymax')
                )
                yolo_box = convert(img_dims, corners)
                out_file.write(f"{categories[cls]} " + " ".join(map(str, yolo_box)) + '\n')


# Convert every XML annotation found in the Drive folder into a YOLO label file
input_dir = "/content/drive/MyDrive/Traffic_Sign_2/annotations"
output_dir = "/content/drive/MyDrive/Traffic_Sign_2/ann_YOLO"
os.makedirs(output_dir, exist_ok=True)

# Entries that do not end in ".xml" are ignored
for file in os.listdir(input_dir):
    if not file.endswith(".xml"):
        continue
    convert_annotation(os.path.join(input_dir, file), output_dir)


In [None]:
# Splitting the dataset to train and test

import os
import shutil
import random

# Set paths
data_dir = "/content/drive/MyDrive/Traffic_Sign_2"
img_dir = os.path.join(data_dir, "images")
ann_dir = os.path.join(data_dir, "ann_YOLO")

# Train/validation target directories in the YOLO folder layout
train_img_dir = os.path.join(data_dir, "YOLO/images/train")
train_ann_dir = os.path.join(data_dir, "YOLO/labels/train")
test_img_dir = os.path.join(data_dir, "YOLO/images/val")
test_ann_dir = os.path.join(data_dir, "YOLO/labels/val")

# Create the directories if they don't exist
for split_dir in (train_img_dir, train_ann_dir, test_img_dir, test_ann_dir):
    os.makedirs(split_dir, exist_ok=True)

# Get sorted lists of images and annotations
images = sorted(os.listdir(img_dir))
annotations = sorted(os.listdir(ann_dir))

# Check that each image has a corresponding annotation
assert len(images) == len(annotations), "Mismatch between image and annotation counts"

# Combine images and annotations into pairs
data_pairs = list(zip(images, annotations))

# Equal counts alone do not guarantee that position i in both listings is the
# same sample (a stray file shifts everything), so compare the file stems too.
mismatched = [
    (img, ann) for img, ann in data_pairs
    if os.path.splitext(img)[0] != os.path.splitext(ann)[0]
]
assert not mismatched, f"Image/annotation names do not line up, e.g. {mismatched[:3]}"

# Shuffle and split the data
random.seed(42)  # For identical randomness for every run
random.shuffle(data_pairs)
split_idx = int(0.8 * len(data_pairs))

train_pairs = data_pairs[:split_idx]
test_pairs = data_pairs[split_idx:]

# Copy (not move) the files into the corresponding directories; the originals stay in place
for pairs, img_dst, ann_dst in (
    (train_pairs, train_img_dir, train_ann_dir),
    (test_pairs, test_img_dir, test_ann_dir),
):
    for img_name, ann_name in pairs:
        shutil.copy(os.path.join(img_dir, img_name), os.path.join(img_dst, img_name))
        shutil.copy(os.path.join(ann_dir, ann_name), os.path.join(ann_dst, ann_name))

print("Dataset split complete!")


Dataset split complete!


# **Data Loader**

In [None]:
# Defining the custom Dataset class that the data loaders below are built on


import os
import xml.etree.ElementTree as ET
from PIL import Image
import torch
from torch.utils.data import Dataset


class Traffic_sign_dataset_2_custom_function(Dataset):
  """Detection dataset pairing each image with an XML annotation file.

  Images and annotations are matched by position in the sorted directory
  listings, so both folders must hold the same samples in the same order.

  Each item is ``(image, target)`` where ``target`` is a dict with ``boxes``
  (float32, ``[N, 4]`` as x1, y1, x2, y2), ``labels`` (int64, ``[N]``),
  ``image_id`` (int), ``area`` (float32, ``[N]``) and ``iscrowd`` (int64, ``[N]``).

  NOTE(review): labels start at 0 here. torchvision's detection models document
  label 0 as background, so 'trafficlight' may be treated as background during
  training -- confirm, and if so shift the ids together with the visualisation
  and evaluation maps that decode them.
  """

  # Class name -> integer label; the inverse mapping is needed when decoding predictions.
  LABEL_TO_ID = {
      'trafficlight': 0,
      'stop': 1,
      'speedlimit': 2,
      'crosswalk': 3
  }

  def __init__(self, img_dir, ann_dir, transform = None):
    """Store the directories and the sorted file listings.

    Args:
      img_dir: Directory containing the images.
      ann_dir: Directory containing the XML annotations.
      transform: Optional callable ``(image, target) -> (image, target)``.
    """
    self.img_dir = img_dir
    self.ann_dir = ann_dir
    self.transform = transform
    self.imgs = sorted(os.listdir(img_dir))
    self.anns = sorted(os.listdir(ann_dir))


  def __len__(self):
    """Number of samples (taken from the image listing)."""
    return len(self.imgs)


  def anno_parser(self, ann_path):
    """Parse one XML annotation into ``{"boxes": ..., "labels": ...}``.

    An annotation without any ``<object>`` yields empty tensors of shape
    ``[0, 4]`` and ``[0]`` instead of failing.

    Raises:
      KeyError: if an object name is not in ``LABEL_TO_ID``.
    """
    root = ET.parse(ann_path).getroot()
    boxes = []
    labels = []

    for obj in root.findall("object"):
      labels.append(self.LABEL_TO_ID[obj.find("name").text])
      bbox = obj.find("bndbox")
      xmin = int(bbox.find("xmin").text)
      ymin = int(bbox.find("ymin").text)
      xmax = int(bbox.find("xmax").text)
      ymax = int(bbox.find("ymax").text)
      # NOTE(review): the asymmetric offsets (-1 on xmin, -2 on ymin) are kept
      # as found; their intent is not visible from this file -- confirm.
      boxes.append([xmin-1, ymin-2, xmax, ymax])

    # Build the tensors once, after the loop, so that an empty annotation is
    # handled too; reshape keeps the [N, 4] layout when N == 0.
    return {
        "boxes": torch.tensor(boxes, dtype = torch.float32).reshape(-1, 4),
        "labels": torch.tensor(labels, dtype = torch.int64),
    }


  def __getitem__(self, idx):
    """Load sample ``idx`` and return ``(image, target)`` after the optional transform."""
    img_path = os.path.join(self.img_dir, self.imgs[idx])
    ann_path = os.path.join(self.ann_dir, self.anns[idx])

    image = Image.open(img_path).convert('RGB')
    target = self.anno_parser(ann_path)

    # Extra fields read by the COCO-style evaluator
    target['image_id'] = int(idx)
    boxes = target["boxes"]
    # Computed before the transform runs, i.e. in original-image pixels
    target['area'] = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    target['iscrowd'] = torch.zeros((len(boxes), ), dtype = torch.int64)

    if self.transform is not None:
        image, target = self.transform(image, target)
    return image, target


In [None]:
# Transformation functions


import torchvision.transforms as T
from torchvision import transforms
from PIL import Image, ImageEnhance
import numpy as np
import random
from torchvision.transforms import functional as F


# Changing the contrast of the images
class ContrastEnhanceTransform:
    """Callable that passes an image through PIL's contrast enhancer with a fixed factor."""

    def __init__(self, contrast_factor=1):
        # Factor handed to ImageEnhance.Contrast(...).enhance on every call
        self.contrast_factor = contrast_factor

    def __call__(self, img):
        """Return ``img`` enhanced with the stored contrast factor."""
        return ImageEnhance.Contrast(img).enhance(self.contrast_factor)


class CustomTransform:
    """Resize an image to a fixed size and rescale its target to match.

    Args:
        size: ``(height, width)`` of the output image. This is the order
            ``torchvision.transforms.Resize`` uses for a two-element size, and
            the box scale factors are derived with the same convention.
    """

    def __init__(self, size):
        self.size = size
        self.resize = T.Resize(size)

        # Colour augmentations; built here but only applied if the matching
        # line in __call__ is uncommented.
        self.image_only_transforms = T.Compose([
            T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ContrastEnhanceTransform(contrast_factor=1.2)
        ])

        self.to_tensor = T.ToTensor()
        # self.normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def __call__(self, image, target):
        """Resize ``image`` and rescale ``target["boxes"]`` (and ``"area"`` when present).

        Args:
            image: Image exposing ``.size`` as ``(width, height)`` (PIL convention).
            target: Dict with ``"boxes"`` as ``[N, 4]`` x1, y1, x2, y2 and
                optionally ``"area"`` as an ``[N]`` tensor.

        Returns:
            ``(image_tensor, target)``; ``target`` is the same dict, updated.
        """
        # # Random horizontal flip - Uncomment if you'd like to use it
        # if random.random() < 0.5:
        #     image = F.hflip(image)
        #     boxes = target["boxes"]
        #     w = image.width  # Original image width
        #     boxes[:, [0, 2]] = w - boxes[:, [2, 0]]  # Flip x-coordinates
        #     target["boxes"] = boxes

        # # Apply image-only transformations
        # image = self.image_only_transforms(image)

        # Resize image
        w_orig, h_orig = image.size
        image = self.resize(image)
        h_new, w_new = self.size  # (height, width), matching T.Resize

        # Calculate the scale factors
        scale_x = w_new / w_orig
        scale_y = h_new / h_orig

        # Resize bounding boxes on a copy; reshape keeps an empty target usable
        boxes = target["boxes"].reshape(-1, 4).clone()
        boxes[:, [0, 2]] *= scale_x
        boxes[:, [1, 3]] *= scale_y
        target["boxes"] = boxes

        # Keep the per-box area in the same pixel space as the rescaled boxes
        area = target.get("area")
        if isinstance(area, torch.Tensor):
            target["area"] = area * (scale_x * scale_y)

        # Convert to tensor
        image = self.to_tensor(image)
        # image = self.normalize(image)

        return image, target


In [None]:
# Setting data loaders

from torch.utils.data import DataLoader

# One shared resize transform for both splits
transform = CustomTransform(size = (640, 640))

# Datasets read the pre-split image / annotation folders on Drive
train_dataset = Traffic_sign_dataset_2_custom_function(
    '/content/drive/MyDrive/Traffic_Sign_2/img_train',
    '/content/drive/MyDrive/Traffic_Sign_2/ann_train',
    transform,
)
test_dataset = Traffic_sign_dataset_2_custom_function(
    '/content/drive/MyDrive/Traffic_Sign_2/img_test',
    '/content/drive/MyDrive/Traffic_Sign_2/ann_test',
    transform,
)

# The collate function turns a list of (image, target) samples into
# (tuple_of_images, tuple_of_targets) instead of stacking them.
train_loader = DataLoader(
    train_dataset,
    batch_size = 4,
    shuffle = True,
    collate_fn = lambda batch: tuple(zip(*batch)),
)
test_loader = DataLoader(
    test_dataset,
    batch_size = 4,
    shuffle = False,
    collate_fn = lambda batch: tuple(zip(*batch)),
)


# **Visualization**

In [None]:
# Visualizing

import matplotlib.patches as patches
import matplotlib.pyplot as plt


def visualizing(img, annotation, ax):
    """Draw an image tensor with its boxes and class names on a matplotlib axis.

    Args:
        img: Image tensor; it is permuted with ``(1, 2, 0)`` before display.
        annotation: Dict with ``'boxes'`` (rows of x1, y1, x2, y2) and ``'labels'``.
        ax: Matplotlib axis to draw on.
    """
    # Numerical label -> class name (inverse of the mapping used by the dataset)
    id_to_name = {
        0: 'trafficlight',
        1: 'stop',
        2: 'speedlimit',
        3: 'crosswalk'
    }

    ax.imshow(img.permute(1, 2, 0).numpy())
    ax.set_aspect('equal')

    for box, label in zip(annotation['boxes'], annotation['labels']):
        # Plain Python ints (truncated) for matplotlib
        x1, y1, x2, y2 = (int(v) for v in box)

        ax.add_patch(patches.Rectangle(
            (x1, y1),
            x2 - x1,
            y2 - y1,
            linewidth=1,
            edgecolor='r',
            facecolor='none'
        ))

        # Fall back to the raw id rather than failing on an unexpected label
        label_id = int(label)
        label_string = id_to_name.get(label_id, str(label_id))

        ax.text(
            x1,
            y1 - 10,
            label_string,
            color='red',
            fontsize=8,
            weight='bold'
        )


In [None]:
# Preview the first four samples of one training batch on a 2x2 grid
images, targets = next(iter(train_loader))
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
for i, ax in enumerate(axs.flatten()):
    visualizing(images[i], targets[i], ax)


# **Model Configuration (Faster R-CNN & SSD)**

In [None]:
# Adding the FPN network

import warnings
from typing import Callable, Dict, List, Optional, Union
from torch import nn, Tensor
from torchvision.ops import misc as misc_nn_ops
from torchvision.ops.feature_pyramid_network import ExtraFPNBlock, FeaturePyramidNetwork, LastLevelMaxPool
from torchvision import models
from torchvision.models import mobilenet, resnet
from torchvision.models._api import _get_enum_from_fn, WeightsEnum
from torchvision.models._utils import handle_legacy_interface, IntermediateLayerGetter


class BackboneWithFPN(nn.Module):
    """Feature Pyramid Network stacked on selected intermediate layers of a backbone.

    Every intermediate feature map is permuted with ``(0, 3, 1, 2)`` before it
    reaches the FPN (presumably NHWC -> NCHW for the Swin stages used in this
    notebook -- confirm for other backbones).

    Args:
        backbone: Module whose intermediate outputs are tapped.
        return_layers: Maps backbone child names to output names.
        in_channels_list: Channel count of each tapped feature map.
        out_channels: Channel count of every FPN output.
        extra_blocks: Extra FPN block; ``LastLevelMaxPool()`` when ``None``.
        norm_layer: Optional normalisation layer factory passed to the FPN.
    """

    def __init__(
        self,
        backbone: nn.Module,
        return_layers: Dict[str, str],
        in_channels_list: List[int],
        out_channels: int,
        extra_blocks: Optional[ExtraFPNBlock] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        top_block = LastLevelMaxPool() if extra_blocks is None else extra_blocks

        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.fpn = FeaturePyramidNetwork(
            in_channels_list=in_channels_list,
            out_channels=out_channels,
            extra_blocks=top_block,
            norm_layer=norm_layer,
        )
        self.out_channels = out_channels

    def forward(self, x: Tensor) -> Dict[str, Tensor]:
        """Run the backbone, permute each tapped map and feed them to the FPN."""
        stage_outputs = self.body(x)
        # Re-key the maps as "0", "1", ... in the order the backbone returned them
        reordered = {}
        for position, key in enumerate(stage_outputs):
            reordered[str(position)] = torch.permute(stage_outputs[key], (0, 3, 1, 2))
        return self.fpn(reordered)


In [None]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import torch.nn as nn
import torch
import torch.nn.functional as F


# ******************************************************************************
# resnet50 - a full object detection *******************************************
# ******************************************************************************

def get_model_FRCNNresnet50(num_classes):
    """Pretrained Faster R-CNN (ResNet-50 FPN) with its box predictor replaced.

    Args:
        num_classes: Number of outputs of the new ``FastRCNNPredictor``.

    Returns:
        The detector with a freshly initialised box predictor.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
    head_width = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_width, num_classes)
    return detector


# ******************************************************************************
# resnet101 *********************************************************************
# ******************************************************************************

def get_model_resnet101(num_classes):
    """Faster R-CNN on a single-scale, ImageNet-pretrained ResNet-101 feature extractor.

    Args:
        num_classes: Number of outputs of the box predictor.

    Returns:
        A ``FasterRCNN`` model whose backbone emits one 2048-channel feature map.
    """
    backbone = torchvision.models.resnet101(weights="DEFAULT")
    # Drop the last two children (global average pool and fc) so the backbone
    # returns a spatial feature map rather than a 1x1 pooled vector.
    backbone_features = nn.Sequential(*list(backbone.children())[:-2])
    backbone_features.out_channels = 2048

    # The backbone yields a single feature map, so the anchor generator and the
    # RoI pooler must be declared for one level (FasterRCNN's defaults target
    # a multi-level FPN backbone).
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256, 512),),
        aspect_ratios=((0.5, 1.0, 2.0),))
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],
        output_size=7,
        sampling_ratio=2)

    model = FasterRCNN(
        backbone=backbone_features,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler,
    )
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    # The former `cls_loss_func` lambda was removed: it referenced names that are
    # not defined in this notebook (custom_loss_function, class_weights).
    return model


# ******************************************************************************
# ResNet50 + fpn ***************************************************************
# ******************************************************************************

from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
def get_model_custom_resnet50fpn(num_classes):
    """Faster R-CNN reusing the FPN backbone of the pretrained ResNet-50 detector.

    Args:
        num_classes: Number of outputs of the box predictor.

    Returns:
        A ``FasterRCNN`` model with custom anchors and RoI pooling.
    """
    # Only the backbone of the pretrained detector is kept
    fpn_backbone = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights='DEFAULT').backbone
    # fpn_backbone = resnet_fpn_backbone(backbone_name="resnet50", pretrained=True)
    fpn_backbone.out_channels = 256

    # Same five anchor sizes on each of the five feature levels
    per_level_sizes = ((32, 64, 128, 256, 512),) * 5
    rpn_anchors = AnchorGenerator(
        sizes = per_level_sizes,
        aspect_ratios=(0.5, 1.0, 2.0),
    )
    # RoI pooling over the named FPN outputs
    pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0','1','2','3','pool'],
        output_size=7,
        sampling_ratio=2
    )
    model = FasterRCNN(
        fpn_backbone,
        num_classes=num_classes,
        rpn_anchor_generator=rpn_anchors,
        box_roi_pool=pooler
    )
    # Swap in a fresh classifier head of the same input width
    head_width = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(head_width, num_classes)
    return model


# ******************************************************************************
# Swin-T + fpn (with 3 channel input)*******************************************
# ******************************************************************************

import numpy as np
class SwinBackboneWithFPN3channel(torch.nn.Module):
    """Pretrained Swin-T feature extractor wrapped in ``BackboneWithFPN``.

    Children '1', '3', '5' and '7' of the Swin feature stack are tapped (with
    96, 192, 384 and 768 channels as declared below) and mapped to 256-channel
    FPN outputs.
    """

    def __init__(self):
        super().__init__()
        self.backbone = torchvision.models.swin_t(weights='DEFAULT').features
        # Swin child name -> FPN input name
        self.return_layers = dict(zip(('1', '3', '5', '7'), ('0', '1', '2', '3')))
        self.in_channels_list = [96, 192, 384, 768]
        self.out_channels = 256
        self.fpn = BackboneWithFPN(
            self.backbone,
            self.return_layers,
            self.in_channels_list,
            self.out_channels,
        )

    def forward(self, x):
        """Return the FPN feature maps for ``x``."""
        pyramid = self.fpn(x)
        return pyramid

def get_model_swin_t_fpn(num_classes):
    """Faster R-CNN on the Swin-T + FPN backbone (3-channel input).

    Reads the module-level ``device`` for the class-weight tensor, so ``device``
    must be defined before this function is called.

    Args:
        num_classes: Number of outputs of the box predictor.
    """
    swin_fpn = SwinBackboneWithFPN3channel()
    n_levels = 5
    rpn_anchors = AnchorGenerator(
        sizes = ((32, 64, 128, 256, 512),) * n_levels,
        aspect_ratios = ((0.5, 1.0, 2.0),) * n_levels)
    pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names = ['0', '1', '2', '3', '4'],
        output_size = 7,
        sampling_ratio = 2)
    model = FasterRCNN(
        swin_fpn,
        num_classes = num_classes,
        rpn_anchor_generator = rpn_anchors,
        box_roi_pool = pooler)

    # Replace the classifier head
    head_width = model.roi_heads.box_predictor.cls_score.in_features
    new_head = FastRCNNPredictor(head_width, num_classes)
    # Class-imbalance weights (dataset specific; recompute for another dataset).
    # NOTE(review): `cls_loss_func` is a plain attribute set on the predictor;
    # torchvision's RoI heads do not appear to read it when computing the
    # classification loss, so these weights may have no effect -- confirm.
    class_weights = torch.tensor([7.1871, 14.2714,  1.6061,  5.9464], device=device)
    new_head.cls_loss_func = torch.nn.CrossEntropyLoss(weight = class_weights)
    model.roi_heads.box_predictor = new_head

    return model


# ******************************************************************************
# swin-t 512 1 channel (without FPN) *******************************************
# ******************************************************************************

class SwinBackboneWith1channel(torch.nn.Module):
    """Pretrained Swin-T feature extractor adapted to single-channel input.

    The first convolution is replaced by a one-input-channel copy whose weights
    are the mean of the pretrained weights over the input-channel axis.
    """

    def __init__(self):
        super().__init__()
        self.backbone = torchvision.models.swin_t(weights="DEFAULT").features

        # Clone the geometry of the pretrained stem convolution, but with one input channel
        rgb_conv = self.backbone[0][0]
        has_bias = rgb_conv.bias is not None
        gray_conv = nn.Conv2d(
            in_channels = 1,
            out_channels = rgb_conv.out_channels,
            kernel_size = rgb_conv.kernel_size,
            stride = rgb_conv.stride,
            padding = rgb_conv.padding,
            bias = has_bias)

        # Average the pretrained kernels over the input channels; reuse the bias as is
        with torch.no_grad():
            gray_conv.weight = nn.Parameter(rgb_conv.weight.mean(dim = 1, keepdim = True))
            if has_bias:
                gray_conv.bias = nn.Parameter(rgb_conv.bias)

        self.backbone[0][0] = gray_conv
        self.out_channels = 768

    def forward(self, x):
        """Run the backbone and permute its output with ``(0, 3, 1, 2)``."""
        features = self.backbone(x)
        return torch.permute(features, (0, 3, 1, 2))

def get_model_swin_t_1channel(num_classes):
    """Faster R-CNN on the single-channel Swin-T backbone (no FPN).

    Args:
        num_classes: Number of outputs of the box predictor.

    Returns:
        A ``FasterRCNN`` model normalising its one-channel input with mean 0.5 / std 0.5.
    """
    # One feature map -> one tuple of anchor sizes / aspect ratios
    anchor_generator = AnchorGenerator(
        sizes = ((32, 64, 128, 256, 512),),
        aspect_ratios = ((0.5, 1.0, 2.0),))
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names = ['0'],
        output_size = 7,
        sampling_ratio = 2)
    # The backbone class loads its own pretrained Swin-T; the second, unused
    # instance that used to be created here has been removed.
    swin_backbone = SwinBackboneWith1channel()
    model = FasterRCNN(
        swin_backbone,
        num_classes = num_classes,  # was a hard-coded 4 that ignored the argument
        rpn_anchor_generator = anchor_generator,
        box_roi_pool = roi_pooler,
        min_size = 224,
        image_mean = [0.5],
        image_std = [0.5]
        )
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model


# ******************************************************************************
# Swin Transformer *************************************************************
# ******************************************************************************

class SwinTransformer(nn.Module):
    """Pretrained Swin-T feature extractor used as a single-scale detection backbone."""

    def __init__(self):
        super().__init__()
        self.backbone = torchvision.models.swin_t(weights="DEFAULT").features
        # Channel count advertised to FasterRCNN
        self.out_channels = 768

    def forward(self, x):
        """Run the backbone and permute its output with ``(0, 3, 1, 2)``."""
        features = self.backbone(x)
        return torch.permute(features, (0, 3, 1, 2))

def get_model_swin_t(num_classes):
    """Faster R-CNN on the plain Swin-T backbone (no FPN).

    Reads the module-level ``device`` for the class-weight tensor, so ``device``
    must be defined before this function is called.

    Args:
        num_classes: Number of outputs of the final box predictor.
    """
    rpn_anchors = AnchorGenerator(
        sizes = ((32, 64, 128, 256, 512),),
        aspect_ratios = ((0.5, 1.0, 2.0),))
    pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names = ['0'],
        output_size = 7,
        sampling_ratio = 2)
    model = FasterRCNN(
        SwinTransformer(),
        num_classes = 2,  # placeholder only: the predictor is replaced right below
        rpn_anchor_generator = rpn_anchors,
        box_roi_pool = pooler,
        min_size = 300,
        )

    head_width = model.roi_heads.box_predictor.cls_score.in_features
    new_head = FastRCNNPredictor(head_width, num_classes)
    # Class-imbalance weights (dataset specific; recompute for another dataset).
    # NOTE(review): `cls_loss_func` is a plain attribute set on the predictor;
    # torchvision's RoI heads do not appear to read it when computing the
    # classification loss, so these weights may have no effect -- confirm.
    class_weights = torch.tensor([7.1871, 14.2714,  1.6061,  5.9464], device=device)
    new_head.cls_loss_func = torch.nn.CrossEntropyLoss(weight = class_weights)
    model.roi_heads.box_predictor = new_head

    return model



# ******************************************************************************
# ResNet 50 + imbalance data ***************************************************
# ******************************************************************************


import torch
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Function to compute class weights
def compute_class_weights(class_counts):
    """Inverse-frequency class weights.

    Args:
        class_counts: Sequence with the number of samples of each class.

    Returns:
        float32 tensor holding ``total / (num_classes * count)`` per class.
    """
    grand_total = sum(class_counts)
    n_classes = len(class_counts)
    weights = []
    for count in class_counts:
        weights.append(grand_total / (n_classes * count))
    return torch.tensor(weights, dtype=torch.float32)

def get_model_FRCNNresnet50_with_weights(num_classes):
    """Pretrained Faster R-CNN (ResNet-50 FPN) with a new head and class weights attached.

    Reads the module-level ``device`` for the class-weight tensor, so ``device``
    must be defined before this function is called.

    Args:
        num_classes: Number of outputs of the new box predictor.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

    # Fresh classifier head with the same input width as the pretrained one
    head_width = detector.roi_heads.box_predictor.cls_score.in_features
    new_head = FastRCNNPredictor(head_width, num_classes)

    # NOTE(review): `cls_loss_func` is a plain attribute set on the predictor;
    # torchvision's RoI heads do not appear to read it when computing the
    # classification loss, so these weights may have no effect -- confirm.
    class_weights = torch.tensor([7.1871, 14.2714,  1.6061,  5.9464], device=device)
    new_head.cls_loss_func = torch.nn.CrossEntropyLoss(weight = class_weights)
    detector.roi_heads.box_predictor = new_head

    return detector


# ******************************************************************************
# SSD-VGG16 ********************************************************************
# ******************************************************************************

import torch
from torchvision.models.detection.ssd import ssd300_vgg16
from torch.utils.data import DataLoader
from torch.optim import SGD
from torchvision.models.detection.transform import GeneralizedRCNNTransform
from torch.nn import CrossEntropyLoss


def get_model_SSD_VGG16(num_classes, imbalance=True):
    """Pretrained SSD300-VGG16 with a classification head rebuilt for ``num_classes``.

    Assigning ``classification_head.num_classes`` does not resize the head's
    convolution layers, so the head is replaced by a new ``SSDClassificationHead``
    built from the backbone's output channels and the anchors per location.
    The backbone and the box-regression head keep their pretrained weights.

    Args:
        num_classes: Number of classes predicted by the new head
            (the caller in this notebook passes "classes + 1").
        imbalance: Accepted for backward compatibility; not used.

    Returns:
        The SSD model with a freshly initialised classification head.
    """
    from torchvision.models.detection import _utils as det_utils
    from torchvision.models.detection.ssd import SSDClassificationHead

    # weights="DEFAULT" is the non-deprecated spelling of pretrained=True
    model = ssd300_vgg16(weights="DEFAULT")

    # Channel count of every backbone feature map for a 300x300 input
    in_channels = det_utils.retrieve_out_channels(model.backbone, (300, 300))
    num_anchors = model.anchor_generator.num_anchors_per_location()
    model.head.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes)
    return model


# **Training and Testing**


In [None]:
# COCO Evaluation function


from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import numpy as np
import torch


def xyxy_to_xywh(box):
    """Convert ``[x1, y1, x2, y2]`` to the COCO layout ``[x, y, width, height]`` as floats."""
    x1, y1, x2, y2 = (float(v) for v in box)
    return [x1, y1, x2 - x1, y2 - y1]


def _mean_valid(values):
    """Mean of the entries greater than -1 (pycocotools' "not computed" marker), or -1.0 if none."""
    valid = values[values > -1]
    return float(valid.mean()) if valid.size > 0 else -1.0


def get_coco_api_from_dataset2(dataset):
    """Build a pycocotools ``COCO`` ground-truth object from a detection dataset.

    Args:
        dataset: Indexable dataset returning ``(image, target)`` where ``target``
            holds ``boxes`` (x1, y1, x2, y2), ``labels``, ``image_id``, ``area``
            and ``iscrowd``.

    Returns:
        An indexed ``COCO`` object; ``categories`` is left empty for the caller.
    """
    coco = COCO()
    coco.dataset = {
        'images': [],
        'annotations': [],
        'categories': []
    }

    ann_id = 1  # annotation ids must be unique across the whole dataset

    for i in range(len(dataset)):
        _, target = dataset[i]
        coco.dataset['images'].append({'id': i})
        image_id = int(target['image_id'])

        for box, label, area, crowd in zip(target["boxes"], target["labels"],
                                           target['area'], target['iscrowd']):
            coco.dataset['annotations'].append({
                'id': ann_id,
                'image_id': image_id,
                # COCO stores boxes as [x, y, width, height], not corner pairs
                'bbox': xyxy_to_xywh(box),
                'category_id': int(label),
                'area': float(area),
                'iscrowd': int(crowd)
            })
            ann_id += 1

    coco.createIndex()
    return coco


def evaluate_model2(model, data_loader, device, small, medium, large):
    """Evaluate a detector with COCO bbox metrics over custom object-area buckets.

    Args:
        model: Detection model returning dicts with ``boxes``, ``scores``, ``labels``.
        data_loader: Loader yielding ``(images, targets)`` batches.
        device: Device the images are moved to.
        small, medium, large: Upper area bounds of the small / medium / large
            buckets; ``[0, large]`` is used as the "all" range.

    Returns:
        ``coco_eval.stats`` (12 summary values). If the model produced no
        detection at all, a zero array of the same length is returned.
    """
    model.eval()
    coco = get_coco_api_from_dataset2(data_loader.dataset)
    coco.dataset['categories'] = [
        {"id": 0, "name": "trafficlight"},
        {"id": 1, "name": "stop"},
        {"id": 2, "name": "speedlimit"},
        {"id": 3, "name": "crosswalk"}
    ]
    # Re-index so the categories added above are part of the lookup tables
    coco.createIndex()

    coco_results = []
    with torch.no_grad():
        for images, targets in data_loader:
            images = [img.to(device) for img in images]
            outputs = model(images)
            for target, output in zip(targets, outputs):
                image_id = int(target["image_id"])
                boxes = output["boxes"].cpu()
                scores = output["scores"].cpu()
                labels = output["labels"].cpu()
                for box, score, label in zip(boxes, scores, labels):
                    coco_results.append({
                        'image_id': image_id,
                        'category_id': int(label),
                        # same [x, y, width, height] layout as the ground truth
                        'bbox': xyxy_to_xywh(box),
                        'score': float(score)
                    })

    # loadRes cannot handle an empty result list
    if not coco_results:
        print("No detections were produced; skipping COCO evaluation.")
        return np.zeros(12)

    coco_dt = coco.loadRes(coco_results)
    coco_eval = COCOeval(coco, coco_dt, 'bbox')

    # One list drives both the evaluation and the printed labels; the summary
    # table of pycocotools looks up maxDets=100 for its headline AP.
    max_dets = [1, 10, 100]
    coco_eval.params.maxDets = max_dets
    coco_eval.params.areaRng = [
        [0, large],
        [0, small],
        [small + 0.1, medium],
        [medium + 0.1, large]
    ]

    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    precisions = coco_eval.eval['precision']  # [iou, recall, category, area, maxDets]
    recalls = coco_eval.eval['recall']        # [iou, category, area, maxDets]
    iou_thresh = [0.5, 0.75, 0.95]
    # Map each reported IoU to its row in the evaluator's threshold grid
    # (the grid has more entries than the three values reported here).
    iou_grid = np.asarray(coco_eval.params.iouThrs)

    area_indices = {
        'all': 0,
        'small': 1,
        'medium': 2,
        'large': 3
    }

    for area_name, area_idx in area_indices.items():
        print(f"\nPrecision and Recall for area: {area_name.capitalize()}")
        for iou in iou_thresh:
            iou_idx = int(np.argmin(np.abs(iou_grid - iou)))
            for det_idx, det in enumerate(max_dets):
                ap = _mean_valid(precisions[iou_idx, :, :, area_idx, det_idx])
                ar = _mean_valid(recalls[iou_idx, :, area_idx, det_idx])
                print(f'@[ IoU={iou:.2f} | maxDets={det} ] -> AP: {ap:.3f}, AR: {ar:.3f}')

    # AP and AR averaged over the whole IoU grid
    print("\nDetailed results over all IoU thresholds [0.5:0.95] for each condition:")
    for area_name, area_idx in area_indices.items():
        print(f"\nArea: {area_name.capitalize()}")
        for det_idx, det in enumerate(max_dets):
            ap = _mean_valid(precisions[:, :, :, area_idx, det_idx])
            print(f'Precision @[ IoU=0.5:0.95 | maxDets={det} ] -> AP: {ap:.3f}')

            ar = _mean_valid(recalls[:, :, area_idx, det_idx])
            print(f'Recall @[ IoU=0.5:0.95 | maxDets={det} ] -> AR: {ar:.3f}')

    return coco_eval.stats


# Object-area thresholds passed to evaluate_model2 as `small`, `medium` and `large`.
# They become the upper bounds of the small / medium / large area ranges, and
# `percentile_large` is also the upper bound of the "all" range.
# The values are dataset specific: recompute them from the object areas of your own data.
percentile_small = 2080.00
percentile_medium = 5939.66
percentile_large = 110407.27


In [None]:
# Train_one_Epoch

import math
import sys
import time
import torch
import torchvision.models.detection.mask_rcnn
import utils
from coco_eval import CocoEvaluator
from coco_utils import get_coco_api_from_dataset

def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler=None):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Detection model returning a dict of losses in training mode.
        optimizer: Optimizer stepped once per batch.
        data_loader: Loader yielding ``(images, targets)`` batches.
        device: Device the images and tensor-valued target fields are moved to.
        epoch: Epoch index; a linear learning-rate warm-up runs only when it is 0.
        print_freq: Logging interval handed to the metric logger.
        scaler: Optional gradient scaler; mixed precision is enabled when given.

    Returns:
        The ``utils.MetricLogger`` holding the running loss and lr statistics.

    Raises:
        SystemExit: if the summed loss becomes non-finite.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = f"Epoch: [{epoch}]"

    # Per-iteration warm-up, first epoch only
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=warmup_factor, total_iters=warmup_iters
        )

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        # Non-tensor target fields (e.g. an int image_id) are passed through unchanged
        targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets]
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            # Report what went wrong before stopping instead of exiting silently
            print(f"Loss is {loss_value}, stopping training")
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        # Ensure there is no conflict with 'loss' key
        metric_logger.update(**{k: v for k, v in loss_dict_reduced.items() if k != 'loss'})
        metric_logger.update(loss=losses_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
    return metric_logger



In [None]:
# Training the model
# The model will be evaluated after each epoch with the abovementioned COCO evaluator

from engine import _get_iou_types, evaluate
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# get the model - Class+1
model = get_model_SSD_VGG16(5)
model.to(device)

# construct an optimizer over the trainable parameters only
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.0008,
    momentum=0.9,
    weight_decay=0.0005)

# Learning rate scheduler: multiply the lr by 0.1 every 8 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=8,
    gamma=0.1)

num_epochs = 20

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, train_loader, device, epoch, print_freq = 15)
    lr_scheduler.step()
    # evaluate_model2 switches the model to eval mode and runs its own no-grad
    # inference pass over the loader, so no separate pass is needed here.
    evaluate_model2(model, test_loader, device, percentile_small, percentile_medium, percentile_large)



In [None]:
# Saving the trained model in a specified directory
import torchvision
import torch

# Save the model weights (state_dict only) to Drive
# NOTE(review): the file name mentions "swin-t+FPN ... 12epoch", while the training cell
# in this notebook builds get_model_SSD_VGG16 and sets num_epochs = 20 -- confirm the
# name matches the model that was actually trained before relying on this checkpoint.
torch.save(model.state_dict(), '/content/drive/MyDrive/traffic_sign_swin-t+FPN_3Channel_300_12epoch.pth')
