# Object Detection with Fast.ai, PyTorch and Icevision

Initial experiments training a Faster R-CNN object detector on Forensic Architecture's data.

In [None]:
from icevision.all import *

In [None]:
# Locations of the dataset folders (annotations, images and image-set lists),
# all resolved relative to the dataset root.
data_dir = Path("data/Canisters_2020")
annotations_dir = data_dir.joinpath("Annotations")
images_dir = data_dir.joinpath("JPEGImages")
imagesets_dir = data_dir.joinpath("ImageSets", "Main")

## Specify image sets for training, validation and test

In [None]:
def _read_imageset(name):
    """Return the image ids listed in the image-set file `imagesets_dir / name`.

    Each non-blank line is reduced to its file name with everything from the
    first "." onwards removed (e.g. "JPEGImages/img_01.jpg" -> "img_01").
    """
    # `with` guarantees the file handle is closed once the ids have been read.
    with open(imagesets_dir / name) as imageset_file:
        return [
            Path(line.rstrip('\n')).name.split(".", 1)[0]
            for line in imageset_file
            if line.strip()  # ignore blank lines, which would otherwise yield an empty id
        ]


synth = _read_imageset("synth")
train = _read_imageset("train")
val = _read_imageset("val")
test = _read_imageset("test")

In [None]:
# Class names passed to icevision's ClassMap, in this order.
class_map = ClassMap(['canister', 'cylinder', 'can', 'bottle', 'bin'])

Prepare data with icevision voc parser:

In [None]:
# VOC-format parser reading annotations and images from the dataset folders,
# using `class_map` for the class names.
parser = parsers.voc(
    annotations_dir=annotations_dir,
    images_dir=images_dir,
    class_map=class_map,
)

In [None]:
# Alternative split that uses the `synth` image set in place of `train`:
# presplits = [synth, val, test]
# Use the predefined train / val / test image-set lists as fixed splits.
presplits = [train, val, test]
data_splitter = FixedSplitter(presplits)

In [None]:
# Parse the annotations and split the resulting records with `data_splitter`;
# the three results are unpacked as the train, validation and test records.
train_records, valid_records, test_records = parser.parse(data_splitter)

Check our data has been prepared correctly with corresponding labels:

In [None]:
# Display the first two training records in two columns, labelled via `class_map`.
show_records(train_records[:2], ncols=2, class_map=class_map)

In [None]:
# OPTIONAL: we can remove any bboxes at the edge of the image that don't have enough of our object
# in view. We judge this based on the ratio of the bbox dimensions and on the bbox area relative to
# the objects fully in view in the same image. For instance, with a max ratio of 1.6, any edge bbox
# whose width or height is 1.6x the other (or more) will be removed. With a min_area of 0.7, any edge
# bbox covering less than 70% of the area of the average full bbox will be removed. We should be
# careful, though, as general advice suggests we should leave objects partially in view!

# def remove_partials(records, max_ratio, min_area):
#     for record in records:
#         partial_bboxes = []
#         full_bboxes = []
#         for bbox in record.bboxes:
#             # if bbox is on the edge of the image, store within partial_bboxes
#             if bbox.xmin == 0 or bbox.ymin == 0 or bbox.xmax >= record.width-1 or bbox.ymax >= record.height-1:
#                 partial_bboxes.append(bbox)
#             else: 
#                 bbox_area = (bbox.xmax-bbox.xmin)*(bbox.ymax-bbox.ymin)
#                 full_bboxes.append(bbox_area)
#             # find the average area of a full box
#         if full_bboxes: 
#             mean_bbox_area = sum(full_bboxes)/len(full_bboxes)
#         for bbox in partial_bboxes:
#             dims = (bbox.xmax-bbox.xmin, bbox.ymax-bbox.ymin)
#             if max(dims)/min(dims) < max_ratio: 
#                 if full_bboxes:
#                     bbox_area = (bbox.xmax-bbox.xmin)*(bbox.ymax-bbox.ymin)
#                     if bbox_area/mean_bbox_area < min_area: 
#                         record.labels.pop(record.bboxes.index(bbox))
#                         record.bboxes.remove(bbox)
#                         print("remove:" + str(bbox))
#             else: 
#                 record.labels.pop(record.bboxes.index(bbox))
#                 record.bboxes.remove(bbox)
#                 print("remove:" + str(bbox))

# remove_partials(train_records, 1.4, 0.35)

Apply transforms to our dataset: 

In [None]:
# Target image size handed to the transform helpers below.
size = 384

# Training pipeline: icevision's `aug_tfms` followed by normalisation and extra
# flip / blur / shift-scale-rotate transforms, applied in the order listed.
train_tfms = tfms.A.Adapter(
    [
        *tfms.A.aug_tfms(size=size, presize=None),
        tfms.A.Normalize(),
        tfms.A.HorizontalFlip(),
        tfms.A.Blur(blur_limit=(1, 10)),
        tfms.A.ShiftScaleRotate(),
    ]
)

# Validation and test pipelines: resize-and-pad to `size`, then normalise.
valid_tfms = tfms.A.Adapter(
    [
        *tfms.A.resize_and_pad(size=size),
        tfms.A.Normalize(),
    ]
)
test_tfms = tfms.A.Adapter(
    [
        *tfms.A.resize_and_pad(size=size),
        tfms.A.Normalize(),
    ]
)

In [None]:
# Pair each split's records with its transform pipeline.
train_ds = Dataset(train_records, train_tfms)
valid_ds = Dataset(valid_records, valid_tfms)
test_ds = Dataset(test_records, test_tfms)

In [None]:
# Fetch training item 11 ten times and display the results in a 5-column grid
# (presumably to preview how the training transforms vary between draws).
samples = [train_ds[11] for _ in range(10)]
show_samples(samples, denormalize_fn=denormalize_imagenet, ncols=5, display_label=False)

In [None]:
# Build the Faster R-CNN training and validation dataloaders.
train_dl = faster_rcnn.train_dl(train_ds, batch_size=32, num_workers=0, shuffle=True)
# Validate on the validation split; the test split (`test_ds`) stays held out so
# that it does not influence the metrics monitored during training.
valid_dl = faster_rcnn.valid_dl(valid_ds, batch_size=32, num_workers=0, shuffle=False)

# train_dl = retinanet.train_dl(train_ds, batch_size=32, num_workers=0, shuffle=True)
# valid_dl = retinanet.valid_dl(valid_ds, batch_size=32, num_workers=0, shuffle=False)

# train_dl = efficientdet.train_dl(train_ds, batch_size=32, num_workers=0, shuffle=True)
# valid_dl = efficientdet.valid_dl(valid_ds, batch_size=32, num_workers=0, shuffle=False)

# batch, samples = first(train_dl)
# show_samples(
#     samples[0:6], ncols=2, denormalize_fn=denormalize_imagenet, display_label=False
# )

In [None]:
# model = efficientdet.model(
#     model_name="tf_efficientdet_lite0", num_classes=len(class_map), img_size=size
# )

# model = retinanet.model(num_classes=len(class_map))

# Build the Faster R-CNN model, passing the number of entries in `class_map`.
model = faster_rcnn.model(len(class_map))

In [None]:
# Evaluate with the COCO metric, configured for bounding boxes.
metrics = [COCOMetric(metric_type=COCOMetricType.bbox)]

In [None]:
# learn = efficientdet.fastai.learner(
#     dls=[train_dl, valid_dl], model=model, metrics=metrics, detection_threshold = 0.90)

# learn = retinanet.fastai.learner(
#     dls=[train_dl, valid_dl], model=model, metrics=metrics)

# Wrap the Faster R-CNN model, both dataloaders and the metrics in a fastai learner.
learn = faster_rcnn.fastai.learner(
    dls=[train_dl, valid_dl],
    model=model,
    metrics=metrics,
    detection_threshold=0.90,
)

In [None]:
# Freeze the learner, then run the learning-rate finder to help choose a learning rate.
learn.freeze()
learn.lr_find()

In [None]:
# Fine-tune for 5 epochs with a base learning rate of 3e-3.
learn.fine_tune(5, 3e-3)

In [None]:
# Longer run: 50 epochs at a base learning rate of 1e-4, with freeze_epochs=10.
learn.fine_tune(50, 1e-4, freeze_epochs=10)

In [None]:
# Re-run the learning-rate finder. This time we're no longer looking for the point of
# maximum gradient but for somewhere well before the steep rise, i.e. around 8e-4.
learn.freeze()
learn.lr_find()

In [None]:
# Discriminative learning rates: unfreeze the learner and train for 10 epochs with
# fit_one_cycle, passing slice(1e-6, 1e-4) as the maximum learning rate.
learn.unfreeze()
learn.fit_one_cycle(10, lr_max=slice(1e-6, 1e-4))

In [None]:
# Save the model weights (state dict) to disk.
model_path = Path('models/fasterrcnn_3_0.386.pth')
# torch.save does not create missing folders, so make sure `models/` exists first.
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)

In [None]:
# Load the saved weights back into `model` to try inference; map_location maps the
# stored tensors onto the CPU.
state_dict = torch.load('models/fasterrcnn_3_0.386.pth', map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

In [None]:
# load unlabelled images for inference from filesystem... 
import PIL.Image  # import the submodule explicitly; `import PIL` alone does not guarantee it is loaded


def image_from_file(path):
    """Load the image at `path` and return it as an RGB numpy array.

    The image is converted to RGB before it is turned into an array, so the
    result always has three channels: an alpha channel is dropped, and
    grayscale or palette images are expanded instead of failing on a
    missing channel axis.

    Args:
        path: Location of the image file, as accepted by `PIL.Image.open`.

    Returns:
        Array of shape (height, width, 3).
    """
    # The context manager closes the underlying file as soon as the pixels are copied.
    with PIL.Image.open(path) as pil_img:
        return np.array(pil_img.convert("RGB"))

# Read every file found in `infer_dir` as an image and wrap the images in a
# dataset that uses the validation transforms.
# NOTE(review): `infer_dir` is not defined in any of the cells shown here — confirm
# it is set before this cell runs.
imgs = [image_from_file(str(infer_dir) + "/" + entry) for entry in os.listdir(infer_dir)]

infer_ds = Dataset.from_images(imgs, valid_tfms)

In [None]:
# Build an inference batch from the validation dataset; `samples` holds the
# per-image samples, whose "img" entries are plotted further down.
batch, samples = faster_rcnn.build_infer_batch(valid_ds)

In [None]:
# Run the model on the inference batch with a detection threshold of 0.3.
preds = faster_rcnn.predict(
    model=model,
    batch=batch,
    detection_threshold=0.3,
)

In [None]:
# prepare groundtruth folder:
# Writes one text file per validation image (named after the matching entry of `val`)
# with one "<class> <xmin> <ymin> <xmax> <ymax>" line per ground-truth box.
# NOTE(review): assumes `valid_ds` is in the same order as `val` — TODO confirm.
Path("groundtruths").mkdir(parents=True, exist_ok=True)  # make sure the output folder exists
for i in range(len(valid_ds)):
    # Fetch the sample once per image: indexing the dataset rebuilds the transformed
    # sample, so doing it for every field of every box repeats that work needlessly.
    sample = valid_ds[i]
    labels = []
    for j, label in enumerate(sample['labels']):
        name = class_map.get_id(label)
        bbox = sample['bboxes'][j]
        labels.append(" ".join(str(v) for v in (name, bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax)))
    file = "groundtruths/" + val[i] + '.txt'
    with open(file, 'w') as f:
        f.writelines(line + "\n" for line in labels)

In [None]:
# prepare detections folder:
# Writes one text file per prediction (named after the matching entry of `val`)
# with one "<class> <score> <xmin> <ymin> <xmax> <ymax>" line per detected box.
for i, pred in enumerate(preds):
    detections = []
    for j, label in enumerate(pred['labels']):
        name = class_map.get_id(label)
        confidence = pred['scores'][j]
        box = pred['bboxes'][j]
        fields = (name, confidence, box.xmin, box.ymin, box.xmax, box.ymax)
        detections.append(" ".join(str(field) for field in fields))
    out_path = "detections/" + val[i] + '.txt'
    with open(out_path, 'w') as out:
        out.writelines(entry + "\n" for entry in detections)

In [None]:
# fix filenames:
# Rename the ground-truth files: paths containing "not_can" lose their "_val" part;
# otherwise paths containing "val_" get it replaced by "test_".
import os

for entry in os.listdir('ml_evaluation/groundtruths'):
    old_path = 'ml_evaluation/groundtruths/' + entry
    if "not_can" in old_path:
        new_path = old_path.replace("_val", "")
    elif "val_" in old_path:
        new_path = old_path.replace("val_", "test_")
    else:
        # Nothing to rename for this file.
        continue
    os.rename(old_path, new_path)

In [None]:
# show predictions:
# Plot the first ten inference images with their predictions, in five columns.
images = [s["img"] for s in samples]
show_preds(
    samples=images[:10],
    preds=preds[:10],
    class_map=class_map,
    denormalize_fn=denormalize_imagenet,
    ncols=5,
)

In [None]:
# compare with groundtruths:
# Display the first ten validation records in five columns.
show_records(
    valid_ds.records[:10],
    ncols=5,
    class_map=class_map,
)