### Install & Import Dependencies
- fasterrcnn_resnet50_fpn → ResNet-50 backbone
- Dataset → custom COCO dataset loader
- PIL.Image → image loading
- torchvision.transforms.functional → image preprocessing

In [1]:
# Core
import os
import json
import torch
import numpy as np

# PyTorch vision
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights

# Utilities
from PIL import Image
from torch.utils.data import Dataset, DataLoader

### Verify GPU (Highly Recommended)
- Faster R-CNN is very slow on CPU
- CUDA is strongly recommended

In [2]:
device = torch.device("cuda")
# print(torch.version.cuda)
device

device(type='cuda')

### COCO Dataset Loader (Custom)
- Reads your COCO JSON
- Converts bbox from [x, y, w, h] → [x1, y1, x2, y2]
- Converts category IDs → contiguous labels
- Returns data in exact format Faster R-CNN expects

In [9]:
class COCODetectionDataset(Dataset):
    def __init__(self, images_dir, annotation_file, transforms=None):
        self.images_dir = images_dir
        self.transforms = transforms

        with open(annotation_file, 'r') as f:
            coco = json.load(f)

        self.images = coco["images"]
        self.annotations = coco["annotations"]
        self.categories = coco["categories"]

        # Map image_id → annotations
        self.img_to_anns = {}
        for ann in self.annotations:
            self.img_to_anns.setdefault(ann["image_id"], []).append(ann)

        # Category ID mapping (COCO expects labels ≥ 1)
        self.cat_id_to_label = {
            cat["id"]: i + 1 for i, cat in enumerate(self.categories)
        }

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_info = self.images[idx]
        img_path = os.path.join(self.images_dir, img_info["file_name"])

        image = Image.open(img_path).convert("RGB")

        anns = self.img_to_anns.get(img_info["id"], [])

        boxes = []
        labels = []

        for ann in anns:
            x, y, w, h = ann["bbox"]
            boxes.append([x, y, x + w, y + h])
            labels.append(self.cat_id_to_label[ann["category_id"]])

        if len(boxes) == 0:
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
        else:
            boxes = torch.tensor(boxes, dtype=torch.float32)
            labels = torch.tensor(labels, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([img_info["id"]])
        }

        if self.transforms:
            image = self.transforms(image)

        return image, target


### Image Transformations
- Faster R-CNN expects tensor images
- Normalization is handled internally

In [10]:
def get_transform():
    def transform(image):
        image = F.to_tensor(image)
        return image
    return transform


### Dataset & DataLoader
- collate_fn is required because images have different numbers of boxes
- Batch size depends on GPU memory

In [11]:
# weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT

dataset = COCODetectionDataset(
    images_dir="../dashcam 2.v1i.coco/train",
    annotation_file="../dashcam 2.v1i.coco/train/_annotations.coco.json",
    transforms=get_transform()
)

def collate_fn(batch):
    return tuple(zip(*batch))

data_loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn
)


### Load ResNet-based Model
- Uses ResNet-50 + FPN
- pretrained=True → transfer learning
- COCO requires background class

In [12]:
num_classes = len(dataset.categories) + 1  # + background

model = fasterrcnn_resnet50_fpn(
    weights=None,
    num_classes=num_classes
)

model.to(device)


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

### Optimizer & Learning Rate Scheduler
- Standard Faster R-CNN training setup
- LR drops every 3 epochs

In [13]:
params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)


### Training Loop
- Faster R-CNN returns losses automatically
- Includes:
    - Classification loss
    - Box regression loss
    - RPN losses

In [15]:
num_epochs = 10

from tqdm import tqdm

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    loop = tqdm(data_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for images, targets in loop:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_loss += losses.item()

        loop.set_postfix(loss=losses.item())

    lr_scheduler.step()


Epoch 1/10:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 1/10: 100%|██████████| 621/621 [24:44<00:00,  2.39s/it, loss=0.197] 
Epoch 2/10: 100%|██████████| 621/621 [23:04<00:00,  2.23s/it, loss=0.359] 
Epoch 3/10: 100%|██████████| 621/621 [22:12<00:00,  2.15s/it, loss=0.19]  
Epoch 4/10: 100%|██████████| 621/621 [23:19<00:00,  2.25s/it, loss=0.264] 
Epoch 5/10: 100%|██████████| 621/621 [22:43<00:00,  2.20s/it, loss=0.103] 
Epoch 6/10: 100%|██████████| 621/621 [22:56<00:00,  2.22s/it, loss=0.162] 
Epoch 7/10: 100%|██████████| 621/621 [21:47<00:00,  2.11s/it, loss=0.377] 
Epoch 8/10: 100%|██████████| 621/621 [22:28<00:00,  2.17s/it, loss=0.0801]
Epoch 9/10: 100%|██████████| 621/621 [25:07<00:00,  2.43s/it, loss=0.0802]
Epoch 10/10: 100%|██████████| 621/621 [24:33<00:00,  2.37s/it, loss=0.178] 


### Save Model
- Saves trained weights
- Can be reloaded for inference

In [16]:
torch.save(model.state_dict(), "resnet50_fasterrcnn_coco.pth")


### Inference Example
Returns:
```
{
  'boxes': Tensor[N, 4],
  'labels': Tensor[N],
  'scores': Tensor[N]
}
```

In [21]:
model.eval()

image = Image.open("test.jpg").convert("RGB")
image_tensor = F.to_tensor(image).to(device)

with torch.no_grad():
    predictions = model([image_tensor])

predictions


[{'boxes': tensor([[ 90.1564, 146.2932, 175.0182, 254.1304],
          [ 39.3504, 166.3938,  58.2443, 186.6581],
          [252.1730, 132.3670, 268.6006, 207.6369],
          [  0.0000, 165.3796,  18.7130, 195.4951],
          [ 17.0134, 165.2456,  40.8713, 189.3596],
          [242.1162, 136.1477, 260.4969, 210.8177],
          [ 16.0962, 165.3455,  65.4950, 187.6825],
          [ 20.8265, 164.2278,  32.8797, 189.8170],
          [  0.0000, 165.2081,  49.8502, 190.9638],
          [ 48.9257, 167.0788,  58.3166, 184.3145],
          [ 28.0014, 166.7988,  40.3196, 188.4020],
          [244.8121, 141.3969, 273.7494, 184.9867],
          [ 33.7867, 167.9418,  42.7584, 187.8347],
          [242.9547, 131.6259, 273.9514, 217.3199]], device='cuda:0'),
  'labels': tensor([2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4], device='cuda:0'),
  'scores': tensor([0.9992, 0.9969, 0.9731, 0.9533, 0.8274, 0.7553, 0.7487, 0.2934, 0.2405,
          0.1655, 0.1097, 0.1036, 0.0717, 0.0679], device='cuda:0')}]

In [17]:
def evaluate_coco(model, data_loader, annotation_file):
    model.eval()

    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    coco_gt = COCO(annotation_file)
    coco_results = []

    with torch.no_grad():
        for images, targets in data_loader:
            images = [img.to(device) for img in images]
            outputs = model(images)

            for output, target in zip(outputs, targets):
                image_id = int(target["image_id"].item())

                boxes = output["boxes"].cpu().numpy()
                scores = output["scores"].cpu().numpy()
                labels = output["labels"].cpu().numpy()

                for box, score, label in zip(boxes, scores, labels):
                    x1, y1, x2, y2 = box
                    coco_results.append({
                        "image_id": image_id,
                        "category_id": int(label),
                        "bbox": [x1, y1, x2 - x1, y2 - y1],
                        "score": float(score)
                    })

    coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = COCOeval(coco_gt, coco_dt, iouType="bbox")
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    stats = coco_eval.stats
    return {
        "mAP": stats[0],   # AP@[0.5:0.95]
        "AP50": stats[1],  # AP@0.5
        "AP75": stats[2]   # AP@0.75
    }


In [None]:
val_dataset = COCODetectionDataset(
    images_dir="../dashcam 2.v1i.coco/valid",
    annotation_file="../dashcam 2.v1i.coco/valid/_annotations.coco.json",
    transforms=get_transform()
)

val_loader = DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,      # ❗ important
    collate_fn=collate_fn
)

stats = evaluate_coco(
    model,
    val_loader,
    "../dashcam 2.v1i.coco/valid/_annotations.coco.json"
)

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.27s).
Accumulating evaluation results...
DONE (t=0.08s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.001
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.004
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.009
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets