# Faster-RCNN

In [1]:
import torch
import torchvision
from torchvision.datasets import CocoDetection
from torchvision.transforms import v2



In [2]:
def get_weight_filename(epoch: int):
    return f"faster_rcnn_epoch{str(epoch).zfill(4)}.pth"

def build_model(num_classes: int, resume: str, device: str = "cpu"):
    # Instantiate model with default weights
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
    # Replace the prdiction head
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(model.roi_heads.box_predictor.cls_score.in_features, num_classes+1)
    # Load custom weights
    if resume:
        model.load_state_dict(torch.load(resume))
    return model.to(device)

def build_coco_dataset(root: str, annFile: str, transform):
    return CocoDetection(root=root, annFile=annFile, transform=transform)

def build_dataloader(dataset, batch_size: int, collate_fn=None):
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [3]:
n_epochs = 300
start_epoch = 260
num_classes = 1
lr = 1e-4
batch_size = 32
save_every = 10
if start_epoch and start_epoch > 0:
    resume = get_weight_filename(start_epoch)
else:
    resume = False
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Device", device)

Device cuda


```python
# Generate random data, 4 images and 11 bboxes per image with 4 coordinates
images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)

# Add x0 to x1 and y0 to y1, so that x1 cannot be smaller than x0 and same for y1 and y0
boxes[:, :, 2:4] = boxes[:, :, 0:2] + boxes[:, :, 2:4]

# Generate random labels (COCO has 91 classes)
labels = torch.randint(1, 91, (4, 11))

images = list(image for image in images)
targets = []

for i in range(len(images)):
    d = {}
    d['boxes'] = boxes[i]
    d['labels'] = labels[i]
    targets.append(d)
output = model(images, targets)
```

# Training

In [4]:
# Instantiate model
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT)

# Dataset
root_dir = "C:\\Users\\tilof\\PycharmProjects\\DeepLearningProjects\\DETR\\data\\spine"
train_data_dir = f"{root_dir}\\train2017"
train_annotation_file = f"{root_dir}\\annotations\\instances_train2017.json"

transforms = v2.Compose([
    v2.ToTensor(),
    v2.ToDtype(torch.float32),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def collate_fn(data):
    return data

train_dataset = CocoDetection(root=train_data_dir, annFile=train_annotation_file, transform=transforms)
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Training parameters
optim = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=3, gamma=0.1)
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(model.roi_heads.box_predictor.cls_score.in_features, num_classes+1)
if resume:
    model.load_state_dict(torch.load(resume))
model.to(device)
''



loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


''

In [5]:
model.train()
print("Start training ...")

for epoch in range(start_epoch+1, n_epochs+1):
    epoch_loss = 0
    epoch_loss_classifier = 0
    epoch_loss_objectness = 0
    epoch_loss_rpn_box_reg = 0
    for data in train_data_loader:
        images = []
        targets = []
        for image, annotations in data:
            images.append(image.to(device))
            bboxes = list(map(lambda x: [x['bbox'][0], x['bbox'][1], x['bbox'][0]+x['bbox'][2], x['bbox'][1]+x['bbox'][3]], annotations))
            labels = list(map(lambda x: int(x['category_id'])+1, annotations))
            target = {}
            target["boxes"] = torch.tensor(bboxes).to(device)
            target["labels"] = torch.tensor(labels).to(device)
            targets.append(target)
        loss_dict = model(images, targets)
        loss_classifier = loss_dict['loss_classifier'].detach().cpu().numpy()
        loss_box_reg = loss_dict['loss_box_reg'].detach().cpu().numpy()
        loss_objectness = loss_dict['loss_objectness'].detach().cpu().numpy()
        loss_rpn_box_reg = loss_dict['loss_rpn_box_reg'].detach().cpu().numpy()
        loss = sum(v for v in loss_dict.values())
        loss_total = loss.detach().cpu().numpy()
        epoch_loss += loss_total
        epoch_loss_classifier += loss_classifier
        epoch_loss_objectness += loss_objectness
        epoch_loss_rpn_box_reg += loss_rpn_box_reg
        optim.zero_grad()
        loss.backward()
        optim.step()
    print(f"Epoch {epoch}/{n_epochs}: loss={epoch_loss}, loss_classifier={epoch_loss_classifier}, loss_objectness={epoch_loss_objectness}, loss_rpn_box_reg={epoch_loss_rpn_box_reg}")
    if epoch > 0 and epoch % save_every == 0:
        torch.save(model.state_dict(), get_weight_filename(epoch))

Start training ...
Epoch 261/300: loss=2.5225654058158398, loss_classifier=0.9256681092083454, loss_objectness=0.04960546304937452, loss_rpn_box_reg=0.19005899969488382
Epoch 262/300: loss=2.3891897723078728, loss_classifier=0.9113056752830744, loss_objectness=0.05529198460862972, loss_rpn_box_reg=0.1851542186923325
Epoch 263/300: loss=2.3634716384112835, loss_classifier=0.9206626173108816, loss_objectness=0.0515775510284584, loss_rpn_box_reg=0.1854521429631859
Epoch 264/300: loss=2.313635878264904, loss_classifier=0.8984736483544111, loss_objectness=0.05289038963383064, loss_rpn_box_reg=0.1828618124127388
Epoch 265/300: loss=2.3127665668725967, loss_classifier=0.9095406178385019, loss_objectness=0.049054142844397575, loss_rpn_box_reg=0.18446182762272656
Epoch 266/300: loss=2.2980101443827152, loss_classifier=0.9081164114177227, loss_objectness=0.05372771347174421, loss_rpn_box_reg=0.1831319483462721
Epoch 267/300: loss=2.2739234678447247, loss_classifier=0.8927338849753141, loss_objec

In [6]:
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [7]:
for i in range(10):
    images, annotations = train_dataset[i]
    print(model([images.to(device)])[0]['labels'], len(annotations))
    print("Pred:", model([images.to(device)])[0]['boxes'])
    print("True:", list(map(lambda x: x['bbox'], annotations)))

tensor([1, 1, 1], device='cuda:0') 1
Pred: tensor([[ 18.7619, 128.9899,  40.0910, 145.1640],
        [ 33.0134, 195.1446,  49.3381, 210.1378],
        [511.8869, 215.2784, 511.9984, 227.7759]], device='cuda:0',
       grad_fn=<StackBackward0>)
True: [[19, 129, 21, 16]]
tensor([1, 1, 1], device='cuda:0') 1
Pred: tensor([[ 16.1982, 126.9820,  45.7223, 149.0213],
        [303.9715,   4.0852, 338.2688,  27.5187],
        [253.2170,   6.8253, 289.8279,  31.9003]], device='cuda:0',
       grad_fn=<StackBackward0>)
True: [[16, 127, 30, 22]]
tensor([1, 1, 1], device='cuda:0') 3
Pred: tensor([[301.9377,   3.0549, 338.2368,  28.9639],
        [254.8243,   5.8796, 292.1436,  34.9454],
        [279.0653,  52.9723, 293.8815,  74.1070]], device='cuda:0',
       grad_fn=<StackBackward0>)
True: [[255, 6, 37, 29], [279, 53, 15, 21], [302, 3, 36, 26]]
tensor([1, 1, 1, 1, 1, 1], device='cuda:0') 6
Pred: tensor([[301.9292,   3.2488, 338.2710,  28.7269],
        [254.9377,   6.0796, 292.0716,  35.0114],
  