In [1]:
#Import necessary libraries
import numpy as np 
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Tuple, List, Dict, Optional
from collections import OrderedDict
from os import listdir
from os.path import isfile, join

from torchvision import models
import torchvision.ops.boxes as bops
import torchvision.transforms as T
from torchvision.utils import draw_bounding_boxes
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.roi_heads import fastrcnn_loss
from torchvision.models.detection.rpn import concat_box_prediction_layers

import torch
import torch.nn as nn
from torch import Tensor
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

#Function that saves a plot of the validation and training data
def save_metric_plot(train_metric, val_metric):
    """Plot training and validation loss per epoch on one chart and save it.

    Args:
        train_metric: sequence of training-loss values; its length defines
            the x axis (0 .. len - 1), labelled "Epoch".
        val_metric: sequence of validation-loss values plotted against the
            same x axis, so it must have the same length.

    The curves are drawn on matplotlib's current figure, which is left
    uncleared afterwards, and the image is written under /kaggle/working.
    """
    epoch_axis = list(range(len(train_metric)))

    # Both curves share the same epoch axis.
    curves = (
        (train_metric, "Train Loss"),
        (val_metric, "Val Loss"),
    )
    for series, curve_label in curves:
        plt.plot(epoch_axis, series, label=curve_label)

    plt.title("Train/Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    plt.savefig(join("/kaggle/working", "Train_Val Loss"))
    print("Train/Val loss plot saved.")

#Function that defines the transformations of the data 
def get_transform(train):
    """Build the preprocessing pipeline applied to every image.

    Args:
        train: flag supplied by the call sites for the train and evaluation
            datasets. It is currently unused, so every dataset gets the
            same pipeline.

    Returns:
        A ``T.Compose`` that applies ``T.ToTensor()`` followed by
        ``T.ConvertImageDtype(torch.float)``.
    """
    pipeline = [
        T.ToTensor(),
        T.ConvertImageDtype(torch.float),
    ]
    return T.Compose(pipeline)
    
#Create a class to convert image dataset into PyTorch Dataset class
#Need to define the three functions listed when creating a pytorch dataset: __init__, __getitem__, and __len__

#Defining a dataset class for your data is actually super efficient with memory as you can create it in a way that
#you are only reading the current batch images into memory as opposed to the entire dataset. This works best when combined with
#a pytorch dataloader

#This Dataset class is slightly different from others because it is for object detection: PyTorch expects a "target" dictionary
#that is specific to detection models. For other tasks the __getitem__ method would usually be a bit simpler.
class ObjectDetectionDataset(Dataset):
    """Single-object detection dataset that reads images lazily from disk.

    Each sample pairs one image file with exactly one bounding box and is
    returned as ``(image, target)``, where ``target`` is the dictionary of
    tensors built by ``_build_target``.
    """

    def __init__(self, image_files, boxes, transform=None):
        """
        Args:
            image_files: sequence of image paths, indexed by sample position.
            boxes: one box per image, converted to a float32 tensor. The area
                computation treats each row as [x_min, y_min, x_max, y_max].
            transform: optional callable applied to the RGB image array
                before it is returned.
        """
        self.image_files = image_files
        boxes = np.array(boxes)
        self.boxes = torch.as_tensor(boxes, dtype=torch.float32)
        self.transform = transform

    def _build_target(self, idx):
        """Return the target dictionary for sample ``idx`` (no image I/O)."""
        box = self.boxes[idx].unsqueeze(0)  # (1, 4): a single box per image
        return {
            "boxes": box,
            # Every box belongs to the one foreground class (label 1).
            "labels": torch.ones((1,), dtype=torch.int64),
            "image_id": torch.tensor([idx]),
            # Area of THIS sample's box only, so it has one entry per box in
            # "boxes" (previously it was computed over the whole dataset).
            "area": (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1]),
            "iscrowd": torch.zeros((1,), dtype=torch.int64),
        }

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB, apply the transform and attach its target.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        img_path = self.image_files[idx]
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image file: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform:
            img = self.transform(img)

        return img, self._build_target(idx)

    def __len__(self):
        """Number of samples, i.e. the number of image files."""
        return len(self.image_files)
    
#The pytorch object detection models are a little quirky and don't provide any loss information when models are in eval mode 
#so this is a pretty simple utility that some kind person on the internet created, thanks jhso from stackoverflow.
#https://stackoverflow.com/questions/71288513/how-can-i-determine-validation-loss-for-faster-rcnn-pytorch

#The next function is also from jhso; combined, the two let you obtain the loss values from your object detection model when
#evaluating on validation/test data
def eval_forward(model, images, targets):
    """
    Run one forward pass with the model in eval mode and return both the
    losses and the post-processed detections.

    Instead of calling ``model.rpn(...)`` and ``model.roi_heads(...)`` as single
    units (see the two ``#####`` markers below), their individual steps are
    invoked one by one so that the loss terms and the detections can both be
    produced in the same pass.

    Args:
        model: detection model exposing ``transform``, ``backbone``, ``rpn`` and
            ``roi_heads`` (presumably a torchvision Faster R-CNN - confirm
            against the caller).
        images (list[Tensor]): images to be processed
        targets (list[Dict[str, Tensor]]): ground-truth boxes present in the
            image. Effectively required: the function asserts that it is not
            None before computing the RPN loss.
    Returns:
        (losses, detections) where
        losses (dict[str, Tensor]): ``loss_classifier``, ``loss_box_reg``,
            ``loss_objectness`` and ``loss_rpn_box_reg``.
        detections (list[dict[str, Tensor]]): one dict per image with
            ``boxes``, ``labels`` and ``scores``, after
            ``model.transform.postprocess``.
    Raises:
        ValueError: if a target contains a box whose width or height is not
            strictly positive.
        AssertionError: if ``targets`` is None.
    Side effects:
        Calls ``model.eval()``, sets ``model.rpn.training`` to True while the
        pass runs, and leaves both ``model.rpn.training`` and
        ``model.roi_heads.training`` set to False on normal return.
    """
    model.eval()

    # Remember each image's (height, width) before model.transform runs, so
    # the detections can be post-processed against the original sizes.
    original_image_sizes: List[Tuple[int, int]] = []
    for img in images:
        val = img.shape[-2:]
        assert len(val) == 2
        original_image_sizes.append((val[0], val[1]))

    images, targets = model.transform(images, targets)

    # Check for degenerate boxes
    # TODO: Move this to a function
    if targets is not None:
        for target_idx, target in enumerate(targets):
            boxes = target["boxes"]
            # A box is degenerate when columns 2/3 are not strictly greater
            # than columns 0/1.
            degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
            if degenerate_boxes.any():
                # report the first degenerate box
                bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                degen_bb: List[float] = boxes[bb_idx].tolist()
                raise ValueError(
                    "All bounding boxes should have positive height and width."
                    f" Found invalid box {degen_bb} for target at index {target_idx}."
                )

    features = model.backbone(images.tensors)
    # Normalise a single-tensor backbone output to the mapping form used below.
    if isinstance(features, torch.Tensor):
        features = OrderedDict([("0", features)])
    # NOTE(review): presumably flips the RPN helpers called below to their
    # training-time settings even though the model is in eval mode - confirm
    # against torchvision's RegionProposalNetwork.
    model.rpn.training=True
    # roi_heads is intentionally left in eval mode here:
    #model.roi_heads.training=True


    # ---- Expanded form of:
    #####proposals, proposal_losses = model.rpn(images, features, targets)
    features_rpn = list(features.values())
    objectness, pred_bbox_deltas = model.rpn.head(features_rpn)
    anchors = model.rpn.anchor_generator(images, features_rpn)

    num_images = len(anchors)
    # Per feature level: product of the three dimensions of the first image's
    # objectness map (o[0]).
    num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
    num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
    objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness, pred_bbox_deltas)
    # apply pred_bbox_deltas to anchors to obtain the decoded proposals
    # note that we detach the deltas because Faster R-CNN do not backprop through
    # the proposals
    proposals = model.rpn.box_coder.decode(pred_bbox_deltas.detach(), anchors)
    proposals = proposals.view(num_images, -1, 4)
    proposals, scores = model.rpn.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)

    # RPN losses: match anchors to the ground truth, then score the
    # objectness and box-delta predictions against those matches.
    proposal_losses = {}
    assert targets is not None
    labels, matched_gt_boxes = model.rpn.assign_targets_to_anchors(anchors, targets)
    regression_targets = model.rpn.box_coder.encode(matched_gt_boxes, anchors)
    loss_objectness, loss_rpn_box_reg = model.rpn.compute_loss(
        objectness, pred_bbox_deltas, labels, regression_targets
    )
    proposal_losses = {
        "loss_objectness": loss_objectness,
        "loss_rpn_box_reg": loss_rpn_box_reg,
    }

    # ---- Expanded form of:
    #####detections, detector_losses = model.roi_heads(features, proposals, images.image_sizes, targets)
    image_shapes = images.image_sizes
    # `labels` and `regression_targets` are rebound here: from this point on
    # they refer to the sampled proposals, not to the RPN anchors.
    proposals, matched_idxs, labels, regression_targets = model.roi_heads.select_training_samples(proposals, targets)
    box_features = model.roi_heads.box_roi_pool(features, proposals, image_shapes)
    box_features = model.roi_heads.box_head(box_features)
    class_logits, box_regression = model.roi_heads.box_predictor(box_features)

    result: List[Dict[str, torch.Tensor]] = []
    detector_losses = {}
    loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
    detector_losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
    # The same head outputs are also decoded into detections; `scores` and
    # `labels` are rebound to the per-image detection results.
    boxes, scores, labels = model.roi_heads.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
    num_images = len(boxes)
    for i in range(num_images):
        result.append(
            {
                "boxes": boxes[i],
                "labels": labels[i],
                "scores": scores[i],
            }
        )
    detections = result
    # Post-process the detections using the transformed and the original image sizes.
    detections = model.transform.postprocess(detections, images.image_sizes, original_image_sizes)  # type: ignore[operator]
    # Put both sub-modules back into eval mode before returning.
    model.rpn.training=False
    model.roi_heads.training=False
    # Merge detector and RPN losses into one dictionary.
    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses, detections

def evaluate_loss(model, data_loader, device):
    """Average the summed detection losses over every batch of ``data_loader``.

    Args:
        model: detection model, forwarded through ``eval_forward``.
        data_loader: iterable of ``(images, targets)`` batches that also
            supports ``len()``.
        device: device the images and the target tensors are moved to.

    Returns:
        The sum of the per-batch total losses divided by ``len(data_loader)``.
    """
    batch_totals = []
    with torch.no_grad():
        for batch_images, batch_targets in data_loader:
            batch_images = [img.to(device) for img in batch_images]
            batch_targets = [
                {key: value.to(device) for key, value in tgt.items()}
                for tgt in batch_targets
            ]
            # Detections are returned as well, but only the losses are used.
            loss_dict, _ = eval_forward(model, batch_images, batch_targets)
            batch_totals.append(sum(loss_dict.values()))

    return sum(batch_totals) / len(data_loader)

#Regroups a batch, given as a list of (image, target) samples, into one tuple of images and one tuple of targets.
#The detection model is called with a list of images and a list of target dicts, so samples are regrouped rather than stacked into a single tensor.
def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    Args:
        batch: list of samples, each a sequence such as ``(image, target)``.

    Returns:
        A tuple with one entry per sample field, e.g.
        ``((img0, img1, ...), (target0, target1, ...))``. An empty batch
        yields an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

### Preparing input data ###

#Number of trailing labelled rows that are held out as the test set
TEST_HOLDOUT = 59

#Read bounding box labels in pandas dataframe
labels_df = pd.read_csv("../input/car-object-detection/data/train_solution_bounding_boxes (1).csv")

#Column 0 holds the image file name; the remaining columns hold the bounding box coordinates.
#All rows are converted in one step instead of looping over the dataframe row by row.
labels_lst = labels_df.iloc[:, 1:].values.tolist()
labels = np.array(labels_lst, dtype=np.float32)

#Not every image has an accompanying label, so the file list is built from the label file:
#only images that appear there are used
path = "../input/car-object-detection/data/training_images"
all_files = [join(path, f) for f in labels_df.iloc[:, 0]]

#Split everything except the held-out tail into train and validation sets
train_files, val_files, train_labels, val_labels = train_test_split(
    all_files[:-TEST_HOLDOUT], labels[:-TEST_HOLDOUT], random_state=1, train_size=.80
)

test_files = all_files[-TEST_HOLDOUT:]
test_labels = labels[-TEST_HOLDOUT:]

#Instantiate train, val and test datasets
train_dataset = ObjectDetectionDataset(train_files, train_labels, transform=get_transform(train=True))
val_dataset = ObjectDetectionDataset(val_files, val_labels, transform=get_transform(train=False))
test_dataset = ObjectDetectionDataset(test_files, test_labels, transform=get_transform(train=False))

#Training batches are reshuffled every epoch so SGD does not see the samples in the same order each time;
#validation keeps a fixed order so its loss is comparable between epochs
train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=5, shuffle=False, collate_fn=collate_fn)
    
#Determine if running on GPU or CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#Load a Faster R-CNN detector (ResNet-50 FPN variant) with pretrained weights
model_conv = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

#One class for the object (car) +  one class for the background
num_classes = 2

#Transfer learning: keep the pretrained network but swap its box predictor for a fresh FastRCNNPredictor
#that takes the same number of input features and outputs num_classes classes
in_features = model_conv.roi_heads.box_predictor.cls_score.in_features
model_conv.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

#Move the model to the device before collecting its parameters for the optimizer
model_conv = model_conv.to(device)

#Only parameters that require gradients are optimised
params = [p for p in model_conv.parameters() if p.requires_grad]

optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

#Multiply the learning rate by 0.1 every 5 scheduler steps.
#StepLR is reached through `optim` because the assignment below rebinds the imported module name
#`lr_scheduler` to the scheduler instance (the name is kept since the training cell passes it on);
#this way the statement still works when it is re-run on its own.
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

def train_model(model, opt, scheduler, train_loader, val_loader, epochs=100,
                patience=5, checkpoint_path="/kaggle/working/best_model"):
    """Train a detection model with per-epoch validation and early stopping.

    Each epoch trains on every batch of ``train_loader``, steps the scheduler,
    prints the mean training losses, and computes the validation loss with
    ``evaluate_loss``. Whenever the validation loss improves, the model's
    ``state_dict`` is saved. When training ends (normally or through early
    stopping) the train/validation curves are saved with ``save_metric_plot``.

    Args:
        model: detection model that returns a dict of losses when called with
            ``(images, targets)`` in train mode.
        opt: optimizer that updates the model's parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        train_loader: iterable of ``(images, targets)`` training batches;
            every value in a target dict must already be a tensor.
        val_loader: validation batches, passed to ``evaluate_loss``.
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without an
            improvement in validation loss.
        checkpoint_path: where the best model's ``state_dict`` is written.

    Uses the module-level ``device`` for tensor placement. Returns None.
    """
    print("Beginning Training")
    best_loss = np.inf
    all_train_losses = []
    all_val_losses = []

    epochs_without_improve = 0

    for epoch in range(epochs):
        train_losses = []
        train_losses_dict = []
        model.train()
        train_progress = tqdm(train_loader, desc="Epoch [%s/%s]:" % (epoch, epochs))

        for imgs, targets in train_progress:
            imgs = [image.to(device) for image in imgs]
            # Target values are already tensors: move them directly rather
            # than re-wrapping with torch.tensor(), which copies and warns.
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            opt.zero_grad()
            # In train mode the detection model returns a dict of loss terms.
            loss_dict = model(imgs, targets)
            losses = sum(loss_dict.values())
            loss_value = losses.item()

            train_losses.append(loss_value)
            train_losses_dict.append({k: v.item() for k, v in loss_dict.items()})
            # Do not remove: without backward() no gradients are computed and
            # the model silently stops learning.
            losses.backward()
            opt.step()
            train_progress.set_description("Epoch [%s/%s] Loss: [%.4f]" % (epoch, epochs, loss_value))

        scheduler.step()
        train_losses_dict = pd.DataFrame(train_losses_dict)
        # Report the learning rate of the optimizer that was passed in (not a
        # module-level one); it is read after the scheduler step.
        print("Epoch {}, lr: {:.6f}, loss: {:.6f}, loss_classifier: {:.6f}, loss_box: {:.6f}, loss_rpn_box: {:.6f}, loss_object: {:.6f}".format(
            epoch, opt.param_groups[0]['lr'], np.mean(train_losses),
            train_losses_dict['loss_classifier'].mean(),
            train_losses_dict['loss_box_reg'].mean(),
            train_losses_dict['loss_rpn_box_reg'].mean(),
            train_losses_dict['loss_objectness'].mean()
        ))

        all_train_losses.append(np.mean(train_losses))

        #Validate
        # Convert to a plain float so no device tensor is kept around in
        # best_loss or in the plotted history.
        validation_loss = float(evaluate_loss(model, val_loader, device=device))
        all_val_losses.append(validation_loss)

        if validation_loss < best_loss:
            print("Loss [%.4f] improved from [%.4f]." % (validation_loss, best_loss))
            print("Saving model to current directory.")
            torch.save(model.state_dict(), checkpoint_path)
            best_loss = validation_loss
            epochs_without_improve = 0
        else:
            epochs_without_improve += 1
            print("Loss [%.4f] did not improve from [%.4f]." % (validation_loss, best_loss))
        print('-'*20)

        #Check for early stopping
        if epochs_without_improve >= patience:
            print("Early Stopping because there was no improvement in " + str(patience) + " straight epochs.")
            break

    # Reached both after early stopping and after the final epoch.
    save_metric_plot(all_train_losses, all_val_losses)

In [2]:
#Run training with the default number of epochs
train_model(
    model=model_conv,
    opt=optimizer,
    scheduler=lr_scheduler,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
)

In [40]:
#Display test images with predictions
classes = ["ignore", "car"]
#Minimum confidence score for a prediction to be drawn
SCORE_THRESHOLD = 0.800

model_conv.eval()
torch.cuda.empty_cache()

trans = T.Compose([T.ToTensor()])
toPIL = T.ToPILImage()

tst_img, _ = test_dataset[0]
#uint8 copy of the image for drawing. It is built here so that this cell does not depend on
#`img_int` from the ground-truth cell further down (which fails on a fresh kernel run top to bottom).
img_int = (tst_img * 255).to(torch.uint8)

with torch.no_grad():
    output = model_conv([tst_img.to(device)])
    pred = output[0]

#Only display predictions with scores over a certain threshold
keep = pred['scores'] > SCORE_THRESHOLD
fig = plt.figure(figsize=(14, 10))
plt.imshow(draw_bounding_boxes(
    img_int,
    pred['boxes'][keep],
    [classes[i] for i in pred['labels'][keep].tolist()], width=4
).permute(1, 2, 0))

In [39]:
#Display the first test-set image together with its ground-truth box
sample = test_dataset[0]
#sample[0] is already a tensor, so scale and cast it directly; re-wrapping a tensor with
#torch.tensor() makes a copy and emits a warning
img_int = (sample[0] * 255).to(torch.uint8)

plt.imshow(draw_bounding_boxes(
    img_int, sample[1]['boxes'], [classes[i] for i in sample[1]['labels'].tolist()], width=4
).permute(1, 2, 0))

In [48]:
#Evaluate test dataset
def eval_model(model, test_dataset, score_threshold=0.800):
    """Report the mean best-IoU of confident predictions over a dataset.

    For every sample, predictions scoring above ``score_threshold`` are
    compared with the sample's first ground-truth box (each target in this
    notebook holds exactly one) and the highest IoU is recorded. A sample
    with no prediction above the threshold counts as IoU 0 instead of
    aborting the evaluation.

    Args:
        model: detection model returning, per image, a dict with ``boxes``
            and ``scores``.
        test_dataset: iterable of ``(image, target)`` pairs where
            ``target['boxes']`` holds the ground-truth box.
        score_threshold: minimum score for a prediction to be considered.

    Returns:
        The mean of the per-sample best IoU values (also printed).

    Uses the module-level ``device`` for tensor placement.
    """
    model.eval()
    torch.cuda.empty_cache()
    results = []

    for image, target in test_dataset:
        ground_truth = target['boxes'].to(device)

        with torch.no_grad():
            pred = model([image.to(device)])[0]

        valid_boxes = pred['boxes'][pred['scores'] > score_threshold]

        if valid_boxes.shape[0] == 0:
            # Nothing confident enough was detected: score the sample as a miss.
            results.append(0.0)
            continue

        # One call gives the IoU of every kept prediction against every
        # ground-truth box; column 0 is the first ground-truth box.
        iou = bops.box_iou(valid_boxes, ground_truth)
        results.append(iou[:, 0].max().item())

    mean_iou = np.mean(results)
    print("Average IoU: ", mean_iou)
    return mean_iou