## Dataset Directory Structure

    Parent_Directory (root)
          |
          |-----------Images (img_dir)
          |              |
          |              |------------------img1.jpg
          |              |------------------img2.jpg
          |              |------------------.........(and so on)
          |
          |
          |-----------train_labels.csv (anno_file)
          
          
## Annotation file format

           | ID         | Label                                  |
           | img1.jpg   | x1 y1 x2 y2 label1 x1 y1 x2 y2 label2  |
           
- Label: space-separated groups of `xmin ymin xmax ymax label`, one group per object in the image
- xmin, ymin - top left corner of bounding box
- xmax, ymax - bottom right corner of bounding box

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import cv2

In [9]:
from engine import train_one_epoch, evaluate
import utils
import os
import numpy as np
import torch
import pandas as pd
from PIL import Image
from PIL import ImageFile
# Let Pillow decode image files that are truncated instead of raising.
# The flag lives on the ImageFile module; setting an attribute of the same
# name on PIL.Image has no effect, so only ImageFile is configured here.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [3]:
# Step 1 - Data pre-prep

In [4]:
# Dataset root directory (contains the image folder and the annotation CSV).
root = "multi_object/kangaroo/kangaroo-master/"
# Image folder, relative to `root`.
img_dir = "Images/"
# Annotation CSV, relative to `root`.
anno_file = "train_labels.csv"

In [5]:
# Read the annotation table. os.path.join inserts the separator only when
# needed, so `root` works with or without a trailing slash.
train_list = pd.read_csv(os.path.join(root, anno_file))

In [6]:
# Collect every distinct class name that appears in the "Label" column,
# in first-seen order. Each annotation is a whitespace-separated sequence
# of 5-token groups: xmin ymin xmax ymax class_name.
label_list = []
for annotation in train_list["Label"]:
    # split() with no argument tolerates repeated/leading/trailing whitespace
    # (split(" ") would emit empty tokens and shift the 5-token groups).
    # Rows without any annotation are read by pandas as NaN, not str.
    tokens = annotation.split() if isinstance(annotation, str) else []
    # The class name is the 5th token of every complete group.
    for class_name in tokens[4::5]:
        if class_name not in label_list:
            label_list.append(class_name)

sorted(label_list)

['kangaroo']

In [None]:
# Step 3 - Data Loading

In [7]:
class CustomDatasetMultiObject(object):
    """Object-detection dataset backed by an image folder and a CSV file.

    The CSV must provide an ``ID`` column (image file name) and a ``Label``
    column holding whitespace-separated groups of
    ``xmin ymin xmax ymax class_name``, one group per object.

    Args:
        root: Dataset root directory.
        transforms: Callable ``(img, target) -> (img, target)`` applied to
            every sample, or ``None`` to return the PIL image unchanged.
        img_dir: Image folder, relative to ``root``.
        anno_file: Annotation CSV, relative to ``root``.

    Attributes:
        train_list: The annotation table as a pandas DataFrame.
        label_list: Sorted list of the distinct class names.
        num_classes: ``len(label_list) + 1``; class ids handed out by
            ``__getitem__`` start at 1, leaving id 0 free (torchvision
            detection models use it for the background class).
    """

    def __init__(self, root, transforms, img_dir="Images",
                 anno_file="train_labels.csv"):
        self.root = root
        self.transforms = transforms
        self.img_dir = img_dir
        # os.path.join copes with `root` given with or without a trailing slash.
        self.train_list = pd.read_csv(os.path.join(root, anno_file))
        self.label_list = self.get_labels()
        self.num_classes = len(self.label_list) + 1

    @staticmethod
    def _tokenize(label):
        """Split one ``Label`` cell into tokens; non-string cells give []."""
        # pandas reads an empty cell as NaN (a float), which has no split().
        if not isinstance(label, str):
            return []
        # split() with no argument ignores repeated/trailing whitespace, which
        # would otherwise shift the 5-token groups.
        return label.split()

    def get_labels(self):
        """Return the sorted list of distinct class names in the CSV."""
        names = set()
        for label in self.train_list["Label"]:
            # The class name is the 5th token of every complete group.
            names.update(self._tokenize(label)[4::5])
        return sorted(names)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(img, target)`` where ``img`` is the RGB image (after
            ``transforms`` if set) and ``target`` is a dict with keys
            ``boxes`` (float32, shape ``(N, 4)`` as x1, y1, x2, y2),
            ``labels`` (int64, shape ``(N,)``), ``image_id``, ``area`` and
            ``iscrowd``.

        Raises:
            ValueError: If a coordinate token is not an integer, or a class
                name is not present in ``label_list``.
        """
        img_name = self.train_list["ID"][idx]
        tokens = self._tokenize(self.train_list["Label"][idx])

        img_path = os.path.join(self.root, self.img_dir, img_name)
        img = Image.open(img_path).convert("RGB")

        boxes = []
        obj_ids = []
        # Walk the complete 5-token groups; a trailing partial group is ignored.
        for start in range(0, len(tokens) - 4, 5):
            x1, y1, x2, y2 = (int(tok) for tok in tokens[start:start + 4])
            boxes.append([x1, y1, x2, y2])
            # Class ids are 1-based positions in the sorted label list.
            obj_ids.append(self.label_list.index(tokens[start + 4]) + 1)

        # reshape keeps the (N, 4) layout even when the image has no objects;
        # without it an empty list becomes a 1-D tensor and the column
        # indexing below fails.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(obj_ids, dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # Every instance is treated as a non-crowd object.
        iscrowd = torch.zeros((len(obj_ids),), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of rows (images) in the annotation table."""
        return len(self.train_list)

In [10]:
import transforms as T

def get_transform(train):
    """Build the image/target transform pipeline.

    Args:
        train: When truthy, a random horizontal flip (probability 0.5) is
            appended after the tensor conversion.

    Returns:
        A ``T.Compose`` wrapping the selected transforms.
    """
    pipeline = [T.ToTensor()]
    if train:
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)

# Prefer the GPU; fall back to the CPU when CUDA is not available.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Two views over the same files: the training view adds augmentation,
# the evaluation view does not.
dataset = CustomDatasetMultiObject(root, get_transform(train=True))
dataset_test = CustomDatasetMultiObject(root, get_transform(train=False))
num_classes = dataset.num_classes

In [11]:
# Split the samples into a training subset and a held-out test subset.
# NOTE: this cell rebinds `dataset` / `dataset_test` to Subset objects, so
# re-run the dataset-creation cell before running this one again.
NUM_TEST_SAMPLES = 50   # number of samples held out for evaluation
SPLIT_SEED = 0          # fixed seed so the split is the same on every run

# Slicing with [:-N] would silently leave an empty training set otherwise.
assert len(dataset) > NUM_TEST_SAMPLES, (
    "dataset must contain more than %d samples to hold out a test split"
    % NUM_TEST_SAMPLES)

# A dedicated generator keeps the split reproducible without touching the
# global RNG state used elsewhere (e.g. weight init, augmentation).
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
indices = torch.randperm(len(dataset), generator=split_rng).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-NUM_TEST_SAMPLES])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-NUM_TEST_SAMPLES:])

In [12]:
# Build the training and evaluation loaders. Both share every option except
# shuffling: training batches are reshuffled, evaluation order is fixed.
loader_options = dict(batch_size=1, num_workers=4, collate_fn=utils.collate_fn)

data_loader = torch.utils.data.DataLoader(
    dataset, shuffle=True, **loader_options)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, shuffle=False, **loader_options)

In [13]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# Backbone: the convolutional feature extractor of an ImageNet-pretrained
# MobileNetV2 (the classification head is dropped by taking `.features`).
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
# FasterRCNN reads `out_channels` from the backbone; set here to 1280 for
# the MobileNetV2 feature extractor.
backbone.out_channels = 1280

# RPN anchors: 5 sizes x 3 aspect ratios = 15 anchors per spatial location.
# The outer tuple has one entry per feature map; this backbone yields one.
anchor_sizes = ((32, 64, 128, 256, 512),)
anchor_ratios = ((0.5, 1.0, 2.0),)
anchor_generator = AnchorGenerator(sizes=anchor_sizes,
                                   aspect_ratios=anchor_ratios)

# RoI pooling: which feature map(s) to crop from and the size of each crop
# after rescaling. A backbone returning a single Tensor is addressed through
# the feature-map name given below.
# NOTE(review): newer torchvision releases are understood to key that single
# map as the string '0' rather than the int 0 - confirm against the
# installed torchvision version before upgrading.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=[0],
    output_size=7,
    sampling_ratio=2,
)

# Assemble the detector from the pieces above.
model = FasterRCNN(
    backbone,
    num_classes=num_classes,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler,
)

In [17]:
# Place the model on the selected device before building the optimizer.
model.to(device)

# Optimise only the parameters that require gradients.
params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(
    params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Multiply the learning rate by 0.1 every 3 epochs.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=3, gamma=0.1)

# Number of passes over the training set.
num_epochs = 2

for epoch in range(num_epochs):
    # One training pass, logging every 10 iterations.
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # Advance the learning-rate schedule once per epoch.
    lr_scheduler.step()
    # Score the model on the held-out samples.
    evaluate(model, data_loader_test, device=device)

print("That's it!")

Epoch: [0]  [  0/114]  eta: 0:03:28  lr: 0.000049  loss: 1.4710 (1.4710)  loss_classifier: 0.7212 (0.7212)  loss_box_reg: 0.0398 (0.0398)  loss_objectness: 0.6488 (0.6488)  loss_rpn_box_reg: 0.0611 (0.0611)  time: 1.8326  data: 0.4310  max mem: 1912
Epoch: [0]  [ 10/114]  eta: 0:00:38  lr: 0.000491  loss: 1.3886 (1.3089)  loss_classifier: 0.5579 (0.5122)  loss_box_reg: 0.0513 (0.0528)  loss_objectness: 0.6559 (0.6544)  loss_rpn_box_reg: 0.0694 (0.0895)  time: 0.3742  data: 0.0419  max mem: 2566
Epoch: [0]  [ 20/114]  eta: 0:00:28  lr: 0.000933  loss: 1.0678 (1.0848)  loss_classifier: 0.2570 (0.3554)  loss_box_reg: 0.0614 (0.0649)  loss_objectness: 0.6074 (0.5920)  loss_rpn_box_reg: 0.0450 (0.0725)  time: 0.2287  data: 0.0031  max mem: 2566
Epoch: [0]  [ 30/114]  eta: 0:00:23  lr: 0.001375  loss: 0.8066 (0.9565)  loss_classifier: 0.1772 (0.3153)  loss_box_reg: 0.0641 (0.0673)  loss_objectness: 0.4139 (0.5066)  loss_rpn_box_reg: 0.0420 (0.0673)  time: 0.2291  data: 0.0031  max mem: 2566


Test:  [49/50]  eta: 0:00:00  model_time: 0.0521 (0.0522)  evaluator_time: 0.0016 (0.0016)  time: 0.0580  data: 0.0027  max mem: 2574
Test: Total time: 0:00:03 (0.0607 s / it)
Averaged stats: model_time: 0.0521 (0.0522)  evaluator_time: 0.0016 (0.0016)
Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.071
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.252
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.021
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.079
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.113
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.246
 Average Recall     (AR) @[ IoU=0.50:0