In [1]:
import os
import json

import pandas as pd

import torch
import torchvision  
from torchvision.io import read_image
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F
from torchvision.transforms import v2 as T
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

from loguru import logger


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

def get_fasterrcnn_model(num_classes):
    # load a model pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        weights="DEFAULT"
    )

    # replace the classifier with a new one, that has
    # num_classes which is user-defined
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(
        in_features, num_classes
    )

    return model


### Build dataset

In [15]:
# Define the label mapping
label_mapping = {
    "person": 1,
    "car": 2,
    "bicycle": 3,
    "motorcycle": 4,
    "bus": 5,
    "truck": 6
}

# Function to convert string labels to numerical labels
def convert_labels_to_numbers(labels):
    return [label_mapping[label] for label in labels]

In [13]:
# Define the dataset class
class MyDetectionDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, transforms=None, tag="train"):
        self.transforms = transforms
        self.tag = tag
        df = pd.read_csv(csv_file)
        df_selected = df[df["tag"] == tag]
        self.img_files = df_selected["imagepath"].values
        self.annot_files = df_selected["labelpath"].values

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        img_path = self.img_files[idx]
        mask_path = self.annot_files[idx]
        img = read_image(img_path)
        with open(mask_path, "rt") as f:
            d_json = json.load(f)
        # example:
        # [{"label": "person", "x": 167, "y": 162, "width": 310, "height": 465}]
        num_objs = len(d_json)

        labels = convert_labels_to_numbers(
            [obj["label"] for obj in d_json]
        )
        boxes_xywh = [
            (obj["x"], obj["y"], obj["width"], obj["height"])
            for obj in d_json
            if (obj["width"] > 0) and (obj["height"] > 0)
        ]
        boxes_xywh = torch.tensor(boxes_xywh, dtype=torch.float32)
        boxes = torchvision.ops.box_convert(
            boxes_xywh, in_fmt="xywh", out_fmt="xyxy"
        )
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        image_id = idx

        # Wrap sample and targets into torchvision tv_tensors:
        img = tv_tensors.Image(img)
        target = {
            "boxes": tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img)),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        if self.tag == "test":
            return img_path, img, target

        return img, target


def get_transform(train):
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

In [5]:
import platform
os_type = platform.system()

os.makedirs("code/", exist_ok=True)

if os_type == "Linux":
    os.system("wget https://raw.githubusercontent.com/pytorch/vision/master/references/detection/engine.py -P code/")
    os.system("wget https://raw.githubusercontent.com/pytorch/vision/master/references/detection/utils.py -P code/")
    os.system("wget https://raw.githubusercontent.com/pytorch/vision/master/references/detection/transforms.py -P code/")
elif os_type == "Darwin":
    os.system("curl https://raw.githubusercontent.com/pytorch/vision/master/references/detection/engine.py -o code/engine.py")
    os.system("curl https://raw.githubusercontent.com/pytorch/vision/master/references/detection/utils.py -o code/utils.py")
    os.system("curl https://raw.githubusercontent.com/pytorch/vision/master/references/detection/transforms.py -o code/transforms.py")
else:
    logger.error("OS not supported")


--2024-10-01 22:01:14--  https://raw.githubusercontent.com/pytorch/vision/master/references/detection/engine.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8000::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4063 (4.0K) [text/plain]
Saving to: ‘code/engine.py.4’

     0K ...                                                   100% 1017K=0.004s

2024-10-01 22:01:14 (1017 KB/s) - ‘code/engine.py.4’ saved [4063/4063]

--2024-10-01 22:01:14--  https://raw.githubusercontent.com/pytorch/vision/master/references/detection/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8003::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting res

In [6]:
import sys
import pprint
sys.path.append("code/")
import utils


dataset = MyDetectionDataset(
    "data/data.csv",
    transforms=get_transform(train=True),
    tag="train"
)
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    num_workers=4,
    collate_fn=utils.collate_fn
)

num_classes = len(label_mapping) + 1  # 0: background, 1...N: classes
model = get_fasterrcnn_model(num_classes=num_classes)


# For Training
images, targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images, targets)  # Returns losses and detections
print(pprint.pformat(output))


# For inference
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)  # Returns predictions

print(pprint.pformat(predictions[0]))

{'loss_box_reg': tensor(0.5500, grad_fn=<DivBackward0>),
 'loss_classifier': tensor(2.1534, grad_fn=<NllLossBackward0>),
 'loss_objectness': tensor(0.0490, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0141, grad_fn=<DivBackward0>)}
{'boxes': tensor([[  0.0000,  41.0168, 303.5200, 279.0636],
        [195.5486,  17.0852, 392.7163,  80.6631],
        [105.8574,   0.0000, 324.4152, 237.7843],
        [ 12.5271,   0.0000, 228.5193, 228.2327],
        [ 70.7924,  71.8132,  74.6952,  75.8360],
        [101.4217,  15.9934, 383.3571, 151.5665],
        [193.9711,  36.6921, 393.4330, 118.6016],
        [136.2682,  71.7459, 342.9651, 212.3316],
        [ 62.2193,  33.5139, 283.3537, 171.8978],
        [ 40.3806,  90.6082, 266.2570, 233.5205],
        [163.4564,  89.9357, 168.0202,  93.0731],
        [ 13.0182,   0.0000, 383.7056,  99.9642],
        [161.4269,  90.5458, 166.7726,  93.7259],
        [229.1256, 183.8250, 232.5662, 188.5283],
        [177.2903,  46.

###  Train the model

In [7]:
from detection_tools.engine import train_one_epoch, evaluate

os.makedirs("checkpoints", exist_ok=True)

# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

dataset_train = MyDetectionDataset(
    "data/data.csv",
    transforms=get_transform(train=True),
    tag="train"
)
dataset_val = MyDetectionDataset(
    "data/data.csv",
    transforms=get_transform(train=False),
    tag="val"
)

# define training and validation data loaders
train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=4,
    shuffle=True,
    num_workers=4,
    collate_fn=utils.collate_fn
)

val_loader = torch.utils.data.DataLoader(
    dataset_val,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    collate_fn=utils.collate_fn
)

# build the model
num_classes = len(label_mapping) + 1  # 0: background, 1...N: classes
model = get_fasterrcnn_model(num_classes)
# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

# let's train it just for 2 epochs
num_epochs = 2

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(
        model, optimizer, data_loader,
        device, epoch, print_freq=100
    )
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, val_loader, device=device)

# Save the model
torch.save(
    model.state_dict(),
    os.path.join("checkpoints", "model.pth")
)

Epoch: [0]  [   0/4000]  eta: 1:18:21  lr: 0.000010  loss: 2.6749 (2.6749)  loss_classifier: 1.9354 (1.9354)  loss_box_reg: 0.6395 (0.6395)  loss_objectness: 0.0761 (0.0761)  loss_rpn_box_reg: 0.0239 (0.0239)  time: 1.1754  data: 0.2604  max mem: 2352
Epoch: [0]  [ 100/4000]  eta: 0:09:25  lr: 0.000509  loss: 0.8437 (1.2017)  loss_classifier: 0.2518 (0.6709)  loss_box_reg: 0.4521 (0.4715)  loss_objectness: 0.0164 (0.0285)  loss_rpn_box_reg: 0.0236 (0.0307)  time: 0.1388  data: 0.0024  max mem: 3931
Epoch: [0]  [ 200/4000]  eta: 0:08:48  lr: 0.001009  loss: 0.4420 (0.8670)  loss_classifier: 0.1530 (0.4214)  loss_box_reg: 0.2583 (0.3863)  loss_objectness: 0.0106 (0.0262)  loss_rpn_box_reg: 0.0104 (0.0331)  time: 0.1301  data: 0.0023  max mem: 3931
Epoch: [0]  [ 300/4000]  eta: 0:08:24  lr: 0.001508  loss: 0.5643 (0.7414)  loss_classifier: 0.1546 (0.3332)  loss_box_reg: 0.3036 (0.3455)  loss_objectness: 0.0133 (0.0270)  loss_rpn_box_reg: 0.0178 (0.0357)  time: 0.1357  data: 0.0023  max me

### Evaluation on test-set

In [11]:
# Load the model
model = get_fasterrcnn_model(num_classes)
model.load_state_dict(
    torch.load(os.path.join("checkpoints", "model.pth"))
)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [14]:
import json
from tqdm import tqdm

dataset_test = MyDetectionDataset(
    "data/data.csv",
    transforms=get_transform(train=False),
    tag="test"
)


res_dir = "results/faster_rcnn"
os.makedirs(res_dir, exist_ok=True)

for i, (img_path, img, target) in tqdm(enumerate(dataset_test)):
    model.eval()
    with torch.no_grad():
        prediction = model([img.to(device)])
    # Apply NMS:
    # note that predicted boxes are in xyxy format
    # (as expected for NMS)
    keep = torchvision.ops.nms(
        prediction[0]["boxes"],
        prediction[0]["scores"],
        iou_threshold=0.5
    )
    prediction[0]["boxes"] = prediction[0]["boxes"][keep]
    prediction[0]["labels"] = prediction[0]["labels"][keep]
    prediction[0]["scores"] = prediction[0]["scores"][keep]

    # save prediction as json
    filename = os.path.basename(img_path)
    with open(os.path.join(res_dir, filename.replace(".jpg", ".json")), "wt") as f:
        res = {key: val.cpu().numpy().tolist() for key, val in prediction[0].items()}
        res["image_path"] = img_path
        json.dump(res, f)


2968it [01:22, 36.06it/s]


### Measure performance

In [16]:
import fiftyone as fo
from PIL import Image

ds_name = "faster_rcnn_predictions"
if ds_name in fo.list_datasets():
    fo.delete_dataset(ds_name)

dataset = fo.Dataset(name=ds_name)

df = pd.read_csv("data/data.csv")
df_test = df[df["tag"] == "test"]

for _, row in df_test.iterrows():
    sample = fo.Sample(filepath=row["imagepath"])

    filename = os.path.basename(row["imagepath"])
    img = Image.open(row["imagepath"])
    img_width, img_height = img.size

    gt_file = row["labelpath"]
    with open(gt_file, "rt") as f:
        gt = json.load(f)

    # Add ground truth
    list_gt = []
    for obj in gt:
        label = obj["label"]
        box = [
            obj["x"] / img_width, obj["y"] / img_height,
            obj["width"] / img_width, obj["height"] / img_height
        ]
        detection = fo.Detection(
            label=label,
            bounding_box=box
        )
        list_gt.append(detection)
    sample["ground_truth"] = fo.Detections(detections=list_gt)

    # Add predictions
    pred_file = os.path.join(res_dir, filename.replace(".jpg", ".json"))
    pred = json.load(open(pred_file, "rt"))

    labels = pred["labels"]
    boxes_xyxy = pred["boxes"]
    scores = pred["scores"]
    # convert box format from xyxy to xywh
    boxes_xywh = torchvision.ops.box_convert(
        torch.tensor(boxes_xyxy),
        in_fmt="xyxy",
        out_fmt="xywh"
    )

    list_detections = []
    for label, box, score in zip(labels, boxes_xywh, scores):
        box = [
            box[0] / img_width, box[1] / img_height,
            box[2] / img_width, box[3] / img_height
        ]
        label = list(label_mapping.keys())[list(label_mapping.values()).index(label)]
        detection = fo.Detection(label=label, bounding_box=box, confidence=score)
        list_detections.append(detection)

    sample["faster_rcnn"] = fo.Detections(detections=list_detections)
    dataset.add_sample(sample)

dataset.stats()

{'samples_count': 2968,
 'samples_bytes': 19199380,
 'samples_size': '18.3MB',
 'total_bytes': 19199380,
 'total_size': '18.3MB'}

In [17]:
#import fiftyone as fo
#ds_name = "faster_rcnn_predictions"
#dataset = fo.load_dataset(ds_name)


from fiftyone import ViewField as F

# Only contains detections with confidence >= 0.75
high_conf_view = dataset.filter_labels(
    "faster_rcnn", F("confidence") > 0.75, only_matches=False
)

session = fo.launch_app(high_conf_view, auto=False)
session

Session launched. Run `session.show()` to open the App in a cell output.


Dataset:          faster_rcnn_predictions
Media type:       image
Num samples:      2968
Selected samples: 0
Selected labels:  0
Session URL:      http://localhost:5151/
View stages:
    1. FilterLabels(field='faster_rcnn', filter={'$gt': ['$$this.confidence', 0.75]}, only_matches=False, trajectories=False)

In [23]:


results = high_conf_view.evaluate_detections(
    "faster_rcnn",
    gt_field="ground_truth",
    eval_key="eval",
    compute_mAP=True,
)

Evaluating detections...
 100% |███████████████| 2968/2968 [16.5s elapsed, 0s remaining, 206.8 samples/s]      
Performing IoU sweep...
 100% |███████████████| 2968/2968 [10.0s elapsed, 0s remaining, 331.2 samples/s]      


In [26]:
# Get the 10 most common classes in the dataset
if len(label_mapping) > 10:
    counts = dataset.count_values("ground_truth.detections.label")
    classes_top10 = sorted(counts, key=counts.get, reverse=True)[:10]
else:
    classes_top10 = list(label_mapping.keys())
    
# Print a classification report for the top-10 classes
results.print_report(classes=classes_top10)

              precision    recall  f1-score   support

      person       0.68      0.67      0.67     11004
         car       0.75      0.40      0.52      1932
     bicycle       0.73      0.19      0.31       316
  motorcycle       0.95      0.19      0.31       371
         bus       0.75      0.55      0.63       285
       truck       0.69      0.12      0.21       415

   micro avg       0.68      0.60      0.64     14323
   macro avg       0.76      0.35      0.44     14323
weighted avg       0.70      0.60      0.62     14323

