
# Assignment 3

This is a template notebook for Assignment 3.


## Install dependencies and initialization

In [None]:
# install dependencies: 
!pip install pyyaml==5.1 "pycocotools>=2.0.1"
# !pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.6/index.html
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu111/torch1.9/index.html

In [None]:
! module list

In [None]:
! pwd           # shows current directory
! ls            # shows all files in this directory
! nvidia-smi    # shows the specs and the current status of the allocated GPU

In [None]:
# import some common libraries
from sklearn.metrics import jaccard_score
from PIL import Image, ImageDraw
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import datetime
import random
import json
import cv2
import csv
import os
from math import floor

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import torch
print("Pytorch Version:", torch.__version__)

# import some common pytorch utilities
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import torch

# import some common detectron2 utilities
import detectron2
import detectron2.data.transforms
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.structures import BoxMode
from detectron2.engine import DefaultTrainer
from detectron2.engine import DefaultPredictor
from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import ColorMode
from detectron2.utils.visualizer import Visualizer
from detectron2.data import build_detection_train_loader, build_detection_test_loader
from detectron2.data import MetadataCatalog, DatasetCatalog, DatasetMapper
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
setup_logger()

# common visualization libraries
# from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt

plt.rcParams["figure.dpi"] = 300
plt.rcParams.update({"font.size": 16})

In [None]:
# Make sure that a GPU is available for your notebook.
# Otherwise, you need to update the settings in Runtime -> Change runtime type -> Hardware accelerator
print("Is Cuda Available?", "yes" if torch.cuda.is_available() else "no")

In [None]:
# Mount Google Drive to load the data when running on Colab. On any other runtime
# (local machine, cluster) the google.colab package is not installed, so the mount
# is skipped instead of failing the cell with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount("/content/drive")
# Put all the corresponding data files in a data folder and put the data folder in the same directory as this notebook.
# Also create an output directory for your files such as the trained models and the output images.

In [None]:
# Define the location of current directory, which should contain data/train, data/test, and data/train.json.
# TODO: approx 1 line
BASE_DIR = "."
# Generated artifacts go under <BASE_DIR>/output; the directory is created if it is missing.
OUTPUT_DIR = "{}/output".format(BASE_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Part 1: Object Detection

### Data Loader

In [None]:
"""
# This function should return a list of data samples in which each sample is a dictionary. 
# Make sure to select the correct bbox_mode for the data
# For the test data, you only have access to the images, therefore, the annotations should be empty.
# Other values could be obtained from the image files.
# TODO: approx 35 lines
"""
def get_detection_data(set_name):
    """Build the list of detectron2-style records for the full-resolution images.

    Args:
        set_name: "train" reads ``data/train.json`` (one entry per annotated object)
            and groups the objects by image file; "test" lists the files found in
            ``data/test`` and returns records with empty annotations. Any other
            value yields an empty list.

    Returns:
        A flat list of dicts with the keys ``file_name``, ``image_id``, ``height``,
        ``width`` and ``annotations``. Each annotation carries ``bbox`` (copied from
        the JSON and tagged ``BoxMode.XYWH_ABS``), ``segmentation`` and
        ``category_id`` 0.
    """
    data_dirs = "{}/data".format(BASE_DIR)
    dataset = []

    if set_name == "train":
        with open(os.path.join(data_dirs, "train.json"), "r") as jsonFile:
            rawdataset = json.load(jsonFile)

        # file name -> record already stored in `dataset`; a dict lookup replaces the
        # linear list.index() search that made grouping quadratic
        records_by_file = {}
        for idx, value in enumerate(rawdataset):
            filename = os.path.join(data_dirs, "train", value["file_name"])

            obj = {
                "bbox": value["bbox"],
                "bbox_mode": BoxMode.XYWH_ABS,
                "segmentation": value["segmentation"],
                "category_id": 0
            }

            record = records_by_file.get(filename)
            if record is None:
                # the image is opened only once per file, and closed again right away
                with Image.open(filename) as image:
                    width, height = image.size[:2]
                record = {
                    "file_name": filename,
                    "image_id": idx,  # index of the first annotation seen for this image
                    "height": height,
                    "width": width,
                    "annotations": [],
                }
                records_by_file[filename] = record
                dataset.append(record)
            record["annotations"].append(obj)

    elif set_name == "test":
        test_dir = os.path.join(data_dirs, "test")
        # sorted() makes the image ids reproducible; os.listdir order is arbitrary
        for idx, filename in enumerate(sorted(os.listdir(test_dir))):
            path = os.path.join(test_dir, filename)
            with Image.open(path) as image:
                width, height = image.size[:2]

            dataset.append({
                "file_name": path,
                "image_id": idx,
                "height": height,
                "width": width,
                "annotations": [],
            })
    return dataset

In [None]:
# just to update the block dataset (Don't run unless necessary)
# Unregister the full-image datasets so the registration cell can be run again.
# Only names that are currently registered are removed, so this cell is also safe on a fresh kernel.
for d in ["train", "test", "val"]:
    if "data_detection_" + d in DatasetCatalog.list():
        DatasetCatalog.remove("data_detection_" + d)

In [None]:
def _full_detection_split(split_name, train_fraction=0.8):
    """Return the records of one split ("train", "val" or "test") of the full-image data.

    get_detection_data returns one flat list of records, so the train/validation
    split is made here: the first `train_fraction` of the training records form
    "train" and the remainder forms "val". "test" is returned unsplit.
    """
    if split_name == "test":
        return get_detection_data("test")
    records = get_detection_data("train")
    train_len = int(train_fraction * len(records))
    return records[:train_len] if split_name == "train" else records[train_len:]


for d in ["train", "val", "test"]:
    # d=d binds the current split name; a plain closure would see only the last value
    DatasetCatalog.register("data_detection_" + d, lambda d=d: _full_detection_split(d))
    MetadataCatalog.get("data_detection_" + d).set(thing_classes=["plane"])

full_train_meta = MetadataCatalog.get("data_detection_train")
full_val_meta = MetadataCatalog.get("data_detection_val")
full_test_meta = MetadataCatalog.get("data_detection_test")

In [None]:
# Sanity check: size of what the catalog returns for the validation split.
print(len(DatasetCatalog.get("data_detection_val")))

### Making Blocked dataset

In [None]:
# Target edge lengths (in pixels) used to decide how many patches fit into an image.
cropped_width = 512
cropped_height = 512

def crop_image(image, width_blk, new_width, height_blk, new_height):
    """Return the patch at grid cell (height_blk, width_blk) of `image`.

    Grid cells are `new_height` rows by `new_width` columns; the patch is a slice
    over the first two axes of `image`.
    """
    row_start = height_blk * new_height
    col_start = width_blk * new_width
    return image[row_start:row_start + new_height, col_start:col_start + new_width]

def inside_bb(bbox, width_blk, new_width, height_blk, new_height):
    """Check whether an XYWH box lies completely inside one grid block.

    Args:
        bbox: [x, y, width, height] in full-image pixel coordinates.
        width_blk, height_blk: column / row index of the block in the grid.
        new_width, new_height: size of one block in pixels.

    Returns:
        (inside, cropped_bbox): `inside` is True when both the top-left and the
        bottom-right corner of the box fall within the block; `cropped_bbox` is the
        box translated into block coordinates, or [] when the box is not inside.
    """
    bbx, bby, bbw, bbh = bbox

    left = width_blk * new_width
    top = height_blk * new_height
    right = left + new_width
    bottom = top + new_height

    # Both corners have to be inside the block. The top-left tests used to be joined
    # with `or`, which accepted boxes crossing the left or the top edge and produced
    # negative coordinates in the cropped annotation.
    first_point_inside = (bbx >= left) and (bby >= top)
    second_point_inside = (left <= bbx + bbw <= right) and (top <= bby + bbh <= bottom)
    inside = first_point_inside and second_point_inside

    if inside:
        cropped_bbox = [bbx - left, bby - top, bbw, bbh]
    else:
        cropped_bbox = []
    return inside, cropped_bbox

In [None]:
"""
This Function creates the new images for the dataset in a new directory
as well as their respective json files (for test as well)
So the other function just has to read the json file and do nothing else
"""
def _crop_annotations(annotations, width_blk, new_width, height_blk, new_height):
    """Return the annotations inside_bb accepts for one block, shifted into block coordinates."""
    x_offset = width_blk * new_width
    y_offset = height_blk * new_height

    cropped_annotations = []
    for annotation in annotations:
        is_inside, cropped_bbox = inside_bb(annotation["bbox"], width_blk, new_width, height_blk, new_height)
        if not is_inside:
            continue

        # Polygons are flat [x0, y0, x1, y1, ...] lists: even positions are x, odd are y.
        # Every polygon of the annotation is shifted, not only the first one.
        cropped_polygons = [
            [coord - (x_offset if pos % 2 == 0 else y_offset) for pos, coord in enumerate(polygon)]
            for polygon in annotation["segmentation"]
        ]
        cropped_annotations.append({
            "bbox": cropped_bbox,
            "bbox_mode": BoxMode.XYWH_ABS,
            "segmentation": cropped_polygons,
            "category_id": 0
        })
    return cropped_annotations


def make_block_detection_dataset(data, set_name):
    """Cut every image of `data` into a grid of patches and write them to disk.

    Patches are saved as ``<BASE_DIR>/data_blocks/<set_name>/<image>_<col>_<row>.png``
    and the resulting records are dumped next to them as ``train.json`` / ``test.json``.

    Args:
        data: list of records as returned by get_detection_data.
        set_name: "train" keeps only the patches that contain at least one annotation
            accepted by inside_bb; "test" keeps every patch with empty annotations.
            Any other value only creates the output directory and returns [].

    Returns:
        The list of patch records (same keys as the input records).

    Raises:
        FileNotFoundError: if an image of `data` cannot be read.
    """
    new_dir = "{}/data_blocks".format(BASE_DIR)
    out_dir = os.path.join(new_dir, set_name)
    os.makedirs(out_dir, exist_ok=True)

    dataset = []
    if set_name not in ("train", "test"):
        return dataset

    image_id = 0
    for value in data:
        image = cv2.imread(value["file_name"])
        if image is None:
            raise FileNotFoundError("Could not read image: {}".format(value["file_name"]))
        height, width = image.shape[:2]

        # At least one block per axis, so an image smaller than the crop size becomes a
        # single patch instead of dividing by zero.
        n_height_blks = max(1, height // cropped_height)
        n_width_blks = max(1, width // cropped_width)
        # Floor division for both sets: rounding up could report a patch size larger
        # than the pixels actually available for the last row / column.
        new_height = height // n_height_blks
        new_width = width // n_width_blks

        # splitext keeps dots that are part of the name ("a.1.png" -> "a.1")
        stem = os.path.splitext(os.path.basename(value["file_name"]))[0]

        for width_blk in range(n_width_blks):
            for height_blk in range(n_height_blks):
                if set_name == "train":
                    annotations = _crop_annotations(
                        value["annotations"], width_blk, new_width, height_blk, new_height)
                    if not annotations:
                        continue  # patches without any object are not part of the training set
                else:
                    annotations = []

                block_path = os.path.join(out_dir, "{}_{}_{}.png".format(stem, width_blk, height_blk))
                cv2.imwrite(block_path, crop_image(image, width_blk, new_width, height_blk, new_height))

                dataset.append({
                    "file_name": block_path,
                    "image_id": image_id,
                    "height": new_height,
                    "width": new_width,
                    "annotations": annotations,
                })
                image_id += 1

    # the context manager guarantees the JSON is flushed and the handle is closed
    with open(os.path.join(out_dir, "{}.json".format(set_name)), "w") as json_file:
        json.dump(dataset, json_file)
    return dataset

#### Visualizing the blocked dataset

In [None]:
# Load the full-resolution records once; later cells index into these lists and tile them.
train_detection_full_set = get_detection_data("train")
test_detection_full_set = get_detection_data("test")

In [None]:
# Tile a single training image (index 90) as a quick check before processing the whole set.
sample = make_block_detection_dataset([train_detection_full_set[90]], set_name="train")
len(sample)

In [None]:
# Show every patch of the sample image on a grid with 6 columns.
n_cols = 6
n_rows = max(1, -(-len(sample) // n_cols))  # ceil division, at least one row
# squeeze=False keeps `axs` two-dimensional even when there is only one row
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 15), squeeze=False)
print(len(sample))

for num, ax in enumerate(axs.flat):
    ax.set_xticks([])
    ax.set_yticks([])
    if num >= len(sample):
        ax.axis("off")  # unused cell of the grid
        continue

    image = cv2.imread(sample[num]["file_name"])
    # cv2 loads BGR; the channels are reversed to hand the Visualizer an RGB image
    visualizer = Visualizer(image[:, :, ::-1], metadata=full_train_meta, scale=0.5)
    visualizer._default_font_size = 4
    out = visualizer.draw_dataset_dict(sample[num])
    # get_image() already returns RGB, which is what imshow expects
    ax.imshow(out.get_image())

In [None]:
# Draw the annotations of the same image (index 90) at full resolution for comparison.
train_meta_full = MetadataCatalog.get("data_detection_train")
image = cv2.imread(train_detection_full_set[90]["file_name"])
# cv2 loads BGR; the channels are reversed to hand the Visualizer an RGB image
visualizer = Visualizer(image[:, :, ::-1], metadata=train_meta_full, scale=0.5)
visualizer._default_font_size = 4
out = visualizer.draw_dataset_dict(train_detection_full_set[90])
# get_image() already returns RGB; reversing it again would swap red and blue in imshow
plt.imshow(out.get_image())
plt.xticks([])
plt.yticks([]);

#### Generating the datasets

In [None]:
# Keep the result in a variable and show only its size; echoing the whole record list floods the output.
block_train_records = make_block_detection_dataset(train_detection_full_set, set_name="train")
len(block_train_records)

In [None]:
# Keep the result in a variable and show only its size; echoing the whole record list floods the output.
block_test_records = make_block_detection_dataset(test_detection_full_set, set_name="test")
len(block_test_records)

In [None]:
# just to update the block dataset (Don't run unless necessary)
# Unregister the block datasets so the registration cell can be run again.
# Only names that are currently registered are removed, so this cell is also safe on a fresh kernel.
for d in ["train", "test", "val"]:
    if "data_block_detection_" + d in DatasetCatalog.list():
        DatasetCatalog.remove("data_block_detection_" + d)

In [None]:
def get_block_detection_data(set_name="train"):
    """Load the patch records stored as JSON under ``<BASE_DIR>/data_blocks/<set_name>``.

    Returns:
        A pair ``(records, val_records)``. For "train" the JSON content is split
        80%-20% in file order; for "test" the first element is the whole test list
        and the second is empty; for any other name both lists are empty.
    """
    split_dir = os.path.join("{}/data_blocks".format(BASE_DIR), set_name)

    if set_name == "train":
        with open(os.path.join(split_dir, "train.json"), "r") as json_file:
            records = json.load(json_file)
        # train/validation split: the first 80% of the records train, the rest validate
        cut = int(0.8 * len(records))
        return records[:cut], records[cut:]

    if set_name == "test":
        with open(os.path.join(split_dir, "test.json"), "r") as json_file:
            return json.load(json_file), []

    return [], []

In [None]:
"""
# Remember to add your dataset to DatasetCatalog and MetadataCatalog
# Consdier "data_detection_train" and "data_detection_test" for registration
# You can also add an optional "data_detection_val" for your validation by spliting the training data
# TODO: approx 5 lines
"""
# (catalog suffix, set name passed to get_block_detection_data, index into its returned pair)
_BLOCK_SPLITS = (
    ("train", "train", 0),
    ("val", "train", 1),
    ("test", "test", 0),
)

for suffix, source_set, part in _BLOCK_SPLITS:
    catalog_name = "data_block_detection_" + suffix
    # default arguments freeze the current values for the lazily-called loader
    DatasetCatalog.register(
        catalog_name,
        lambda source_set=source_set, part=part: get_block_detection_data(source_set)[part],
    )
    MetadataCatalog.get(catalog_name).set(thing_classes=["plane"])

block_train_meta = MetadataCatalog.get("data_block_detection_train")
block_val_meta = MetadataCatalog.get("data_block_detection_val")
block_test_meta = MetadataCatalog.get("data_block_detection_test")

In [None]:
"""
# Visualize some samples using Visualizer to make sure that the function works correctly
# TODO: approx 5 lines
"""
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 15))
fig.suptitle("Visualizing 6 random images (Val Set - Patches)", fontsize=22)

val_records = DatasetCatalog.get("data_block_detection_val")
# never ask for more samples than the split holds (random.sample raises otherwise)
random_val = random.sample(val_records, min(axs.size, len(val_records)))

for ax in axs.flat:
    ax.set_xticks([])
    ax.set_yticks([])

for ax, record in zip(axs.flat, random_val):
    image = cv2.imread(record["file_name"])
    # cv2 loads BGR; the channels are reversed to hand the Visualizer an RGB image
    visualizer = Visualizer(image[:, :, ::-1], metadata=block_val_meta, scale=0.5)
    visualizer._default_font_size = 4
    out = visualizer.draw_dataset_dict(record)
    # get_image() already returns RGB, which is what imshow expects
    ax.imshow(out.get_image())

### Set Configs

In [None]:
"""
# Set the configs for the detection part in here.
# TODO: approx 15 lines
"""
cfg = get_cfg()
# Output directory handed to detectron2 (same location as OUTPUT_DIR, with a trailing slash).
cfg.OUTPUT_DIR = "{}/output/".format(BASE_DIR)

# Model: COCO Faster R-CNN with the X-101 32x8d FPN backbone; the R-101 lines are kept commented out for reference.
# cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml"))
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))    # Improved
# Train on the patch dataset; the patch validation split is configured as the test set.
cfg.DATASETS.TRAIN = ("data_block_detection_train",)
cfg.DATASETS.TEST = ("data_block_detection_val",)
cfg.DATALOADER.NUM_WORKERS = 2
# cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml")      # Let training initialize from model zoo
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")  # Improved
cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR = 0.00025
# NOTE(review): the results section reports runs of 1500-5000 iterations and the prediction cell
# loads "detection3000_model_final.pth" — confirm that 300 iterations is the intended value here.
cfg.SOLVER.MAX_ITER = 300                                                                               # updated from 500
# Empty STEPS list: no learning-rate decay milestones are configured.
cfg.SOLVER.STEPS = []

cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1                                                                     # plane is the only class

### Training

In [None]:
"""
# Create a DefaultTrainer using the above config and train the model
# TODO: approx 5 lines
"""
class DetectionTrainer(DefaultTrainer):
    """DefaultTrainer whose training data loader applies the custom augmentation pipeline."""

    @classmethod
    def build_train_loader(cls, cfg):
        """Create the training loader with the augmentations below.

        DefaultTrainer builds its loader through the `build_train_loader` classmethod,
        so the override has to carry exactly this name; a method with another name is
        never called and training silently runs without the augmentations.
        """
        T = detectron2.data.transforms
        # No Normalize step: detectron2.data offers no such augmentation, and detectron2
        # models normalize their input themselves with cfg.MODEL.PIXEL_MEAN / PIXEL_STD.
        mapper = DatasetMapper(cfg, is_train=True, augmentations=[
            T.Resize((400, 400)),
            T.RandomCrop("relative", (0.7, 0.7)),
            T.RandomBrightness(0.3, 1.5),
            T.RandomRotation([0, 360]),
            T.RandomFlip(prob=0.33, horizontal=False, vertical=True),
            T.RandomFlip(prob=0.33, horizontal=True, vertical=False),
        ])
        return build_detection_train_loader(cfg, mapper=mapper)

    @staticmethod
    def train_loader(cfg):
        """Former name of the loader factory, kept as an alias for existing callers."""
        return DetectionTrainer.build_train_loader(cfg)

In [None]:
# Build the trainer from cfg and run training.
trainer = DetectionTrainer(cfg)
# resume=False: do not continue from a previous run's last checkpoint
trainer.resume_or_load(resume=False)
trainer.train()

### Evaluation and Visualization

In [None]:
%load_ext tensorboard
%tensorboard --logdir=output

In [None]:
"""
# After training the model, you need to update cfg.MODEL.WEIGHTS
# Define a DefaultPredictor
"""
# NOTE(review): this expects a checkpoint named "detection3000_model_final.pth" in cfg.OUTPUT_DIR;
# presumably a renamed copy from an earlier 3000-iteration run — confirm the file exists.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "detection3000_model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6                             # test-time score threshold (previously 0.7)

# Predictor built from the updated cfg.
predictor = DefaultPredictor(cfg)

In [None]:
def divide_into_blocks(data, image):
    """Split `image` into a grid of patches.

    Args:
        data: record providing the image's "width" and "height".
        image: the image array to cut (first two axes are rows and columns).

    Returns:
        A list of rows; ``datasamples[height_blk][width_blk]`` is one patch.
    """
    width = data["width"]
    height = data["height"]

    # At least one block per axis, so an image smaller than the crop size is returned
    # as a single patch instead of dividing by zero.
    n_height_blks = max(1, height // cropped_height)
    n_width_blks = max(1, width // cropped_width)
    new_height = height // n_height_blks
    new_width = width // n_width_blks

    datasamples = [
        [
            crop_image(image, width_blk, new_width, height_blk, new_height)
            for width_blk in range(n_width_blks)
        ]
        for height_blk in range(n_height_blks)
    ]
    return datasamples

In [None]:
"""
# Visualize the output for 3 random test samples
# TODO: approx 10 lines
"""
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 15))
fig.suptitle("Visualizing 6 random prediction results", fontsize=22)

val_records = DatasetCatalog.get("data_detection_val")
# never ask for more samples than the split holds (random.sample raises otherwise)
random_test = random.sample(val_records, min(axs.size, len(val_records)))

for ax in axs.flat:
    ax.set_xticks([])
    ax.set_yticks([])

for ax, record in zip(axs.flat, random_test):
    image = cv2.imread(record["file_name"])
    # uint8 canvas: imshow displays it unchanged, so no rescaling is needed afterwards;
    # pixels that are not covered by the block grid stay black
    full_image = np.zeros(image.shape, dtype=np.uint8)
    divided_images = divide_into_blocks(record, image)

    # predict patch by patch and paste the drawn patches back at their grid position
    for height_blk, row_blocks in enumerate(divided_images):
        for width_blk, img in enumerate(row_blocks):
            outputs = predictor(img)
            # the Visualizer gets an RGB view of the BGR patch; scale is left at its default
            # so the drawn patch keeps the size of its slot in the canvas
            visualizer = Visualizer(img[:, :, ::-1], metadata=full_val_meta)
            visualizer._default_font_size = 4
            out = visualizer.draw_instance_predictions(outputs["instances"].to("cpu"))
            # get_image() already returns RGB, which is what imshow expects
            out_img = out.get_image()
            block_height, block_width, _ = out_img.shape

            full_image[height_blk*block_height:(height_blk+1)*block_height,
                       width_blk*block_width:(width_blk+1)*block_width] = out_img

    ax.imshow(full_image)

In [None]:
"""
# Use COCOEvaluator and build_detection_train_loader
# You can save the output predictions using inference_on_dataset
# TODO: approx 5 lines
"""

# COCO-style evaluation on the validation patches; cfg.OUTPUT_DIR is passed as the evaluator's output_dir.
evaluator = COCOEvaluator("data_block_detection_val", output_dir=cfg.OUTPUT_DIR)
test_loader = build_detection_test_loader(cfg, "data_block_detection_val")
# NOTE(review): this evaluates trainer.model (the in-memory model from the training cell), not the
# checkpoint loaded into `predictor` — confirm that this is the model meant to be reported.
print(inference_on_dataset(trainer.model, test_loader, evaluator))

### Improvements

For this part, you can bring any improvement which you have by adding new input parameters to the previous functions or defining new functions and variables.


# Original Model
## Specifications

- Pretrained Model used: “faster_rcnn_R_101_FPN_3x.yaml”
- MAX_ITER = 500
- BATCH_SIZE_PER_IMAGE = 512
- IMS_PER_BATCH = 2
- BASE_LR = 0.00025
- SCORE_THRESH_TEST = 0.7

## Training Losses
iter: 499  total_loss: 0.9014  loss_cls: 0.2035  loss_box_reg: 0.2971  loss_rpn_cls: 0.1164  loss_rpn_loc: 0.2317  time: 0.5699  data_time: 0.4110  lr: 0.0002495  max_mem: 7138M

## Evaluation (Validation Set)
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 25.532 | 50.884 | 23.175 | 22.917 | 40.961 | 50.750 |

# Improved Model
## Specifications

- Pretrained Model used: “faster_rcnn_X_101_32x8d_FPN_3x.yaml”
- MAX_ITER = 1500
- BASE_LR = 0.00025
- SCORE_THRESH_TEST = 0.6

- Augmentations:
    - Resize
    - RandomCrop
    - RandomBrightness
    - RandomFlip (Horizontal)
    - RandomFlip (Vertical)
    - RandomRotation

## Training Losses
iter: 1499  total_loss: 0.4923  loss_cls: 0.09955  loss_box_reg: 0.2605  loss_rpn_cls: 0.01884  loss_rpn_loc: 0.09092  time: 0.9274  data_time: 0.0741  lr: 0.00025  max_mem: 9701M

## Evaluation (Validation Set)
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 39.029 | 66.136 | 42.902 | 34.872 | 57.326 | 67.355 |

# Images in Patches
## Training Loss
iter: 1999  total_loss: 0.3173  loss_cls: 0.09003  loss_box_reg: 0.1403  loss_rpn_cls: 0.01165  loss_rpn_loc: 0.03911  time: 0.7538  data_time: 0.0028  lr: 0.00025  max_mem: 7480M
## Evaluation (Validation Set)
### 2000 iterations
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 58.188 | 86.462 | 69.895 | 63.066 | 56.444 | 35.380 |

### 3000 iterations (4 batch size)
#### This Model
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 61.261 | 86.990 | 72.992 | 65.188 | 58.283 | 40.558 |
##### 2nd run
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 60.855 | 86.469 | 71.730 | 65.125 | 60.491 | 40.641 |
#### Mask RCNN (4 batch size)
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 60.377 | 86.210 | 72.843 | 65.647 | 57.667 | 37.212 |

### 5000 iterations (4 batch size) - Overfit
#### This Model
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 61.268 | 86.323 | 72.166 | 64.300 | 60.850 | 35.902 |
#### Mask RCNN (2 batch size)
|   AP   |  AP50  |  AP75  |  APs   |  APm   |  APl   |
|:------:|:------:|:------:|:------:|:------:|:------:|
| 61.517 | 85.989 | 73.176 | 66.696 | 59.688 | 31.893 |

## Part 2: Semantic Segmentation

### Data Loader

In [None]:
"""
# Write a function that returns the cropped image and corresponding mask regarding the target bounding box
# idx is the index of the target bbox in the data
# high-resolution image could be passed or could be load from data["file_name"]
# You can use the mask attribute of detectron2.utils.visualizer.GenericMask 
# to convert the segmentation annotations to binary masks
# TODO: approx 10 lines
"""
from detectron2.utils.visualizer import GenericMask

def get_instance_sample(data, idx, img=None):
    """Crop one annotated object and its binary mask out of a full image.

    Args:
        data: dataset record with "width", "height", "file_name" and "annotations".
        idx: index of the target annotation inside ``data["annotations"]``.
        img: optional already-loaded image; when None it is read from ``data["file_name"]``.

    Returns:
        (obj_img, obj_mask): the image region and the mask region covered by the
        annotation's bounding box (box coordinates rounded to whole pixels).
    """
    width = data["width"]
    height = data["height"]

    annotation = data["annotations"][idx]
    bbx, bby, bbw, bbh = annotation["bbox"]

    if img is None:
        img = cv2.imread(data["file_name"])

    # Clamp the rounded box at 0: numpy reads a negative slice bound as "count from the
    # end", which would return the wrong (usually empty) region for a box that starts
    # slightly outside the image. Bounds beyond the image are already cut by slicing.
    top, bottom = max(0, round(bby)), max(0, round(bby + bbh))
    left, right = max(0, round(bbx)), max(0, round(bbx + bbw))

    obj_img = img[top:bottom, left:right]

    # rasterize the annotation at full image size, then cut out the same window
    full_mask = np.array(GenericMask(annotation["segmentation"], height, width).mask)
    obj_mask = full_mask[top:bottom, left:right]

    return obj_img, obj_mask

In [None]:
"""
# We have provided a template data loader for your segmentation training
# You need to complete the __getitem__() function before running the code
# You may also need to add data augmentation or normalization in here
"""

class PlaneDataset(Dataset):
    """Per-instance dataset: every item is one annotated object cropped from its image.

    Items are ``(img, mask)`` tensor pairs; both crops are resized to 128x128 and the
    mask gets a leading channel axis.
    """

    def __init__(self, set_name, data_list):
        # ToTensor converts the image to a tensor and changes the image format
        # (Channels-Last => Channels-First); Normalize then standardizes each channel.
        to_tensor = transforms.ToTensor()
        normalize = transforms.Normalize(mean=[0.50867474, 0.5278123, 0.5297691],
                                         std=[0.22613746, 0.22742623, 0.2254186])
        self.transforms = transforms.Compose([to_tensor, normalize])
        self.set_name = set_name
        self.data = data_list
        # one [record index, annotation index] pair per annotated object
        self.instance_map = [
            [record_idx, ann_idx]
            for record_idx, record in enumerate(self.data)
            for ann_idx in range(len(record["annotations"]))
        ]

    # You can change the value of length to a small number like 10 for debugging of your
    # training procedure and overfitting; make sure to use the correct length for the
    # final training.
    def __len__(self):
        return len(self.instance_map)

    def numpy_to_tensor(self, img, mask):
        """Turn an image / mask pair into float tensors; the mask gains a channel axis."""
        if self.transforms is not None:
            img = self.transforms(img)

        if not torch.is_tensor(img):
            img = torch.tensor(img, dtype=torch.float)

        mask_tensor = torch.tensor(mask, dtype=torch.float).unsqueeze(0)
        return img, mask_tensor

    def __getitem__(self, idx):
        """Return the resized (img, mask) tensors of the idx-th object instance."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        record_idx, ann_idx = self.instance_map[idx]

        img, mask = get_instance_sample(self.data[record_idx], ann_idx)

        # fixed network input size for image and mask alike
        target_size = (128, 128)
        img = cv2.resize(img, dsize=target_size, interpolation=cv2.INTER_NEAREST)
        mask = cv2.resize(mask, dsize=target_size, interpolation=cv2.INTER_NEAREST)

        return self.numpy_to_tensor(img, mask)

def get_plane_dataset(set_name="train", batch_size=2):
    """Build the PlaneDataset for the registered ``data_detection_<set_name>`` split.

    Returns:
        (loader, dataset): a shuffling DataLoader with 4 workers over the dataset,
        and the dataset itself.
    """
    catalog_name = "data_detection_{}".format(set_name)
    plane_dataset = PlaneDataset(set_name, DatasetCatalog.get(catalog_name))
    plane_loader = DataLoader(
        plane_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=False,
    )
    return plane_loader, plane_dataset

In [None]:
_, data_train = get_plane_dataset(set_name="train")
_, data_val = get_plane_dataset(set_name="val")
# loader_test, _ = get_plane_dataset(set_name="test")

whole_dataset = torch.utils.data.ConcatDataset([data_train, data_val])

# The datasets normalize their images with the very statistics computed here, so the
# statistics must be taken from un-normalized tensors: both datasets use a plain
# ToTensor for the duration of this pass and get their transforms back afterwards.
stat_sources = (data_train, data_val)
saved_transforms = [ds.transforms for ds in stat_sources]
for ds in stat_sources:
    ds.transforms = transforms.ToTensor()
try:
    # item[0] and item[1] are image and its mask
    imgs = torch.stack([item[0] for item in whole_dataset], dim=0).numpy()
finally:
    for ds, saved in zip(stat_sources, saved_transforms):
        ds.transforms = saved

# per-channel statistics over all images and pixels (axis 1 is the channel axis)
# NOTE(review): the crops are read with cv2.imread, so the channel order is presumably
# BGR rather than the RGB named in the labels below — confirm before reusing the values.
channel_mean = imgs.mean(axis=(0, 2, 3))
channel_std = imgs.std(axis=(0, 2, 3))

print("Mean for RGB images:\t\t\t", channel_mean[0], channel_mean[1], channel_mean[2])
print("Standard Deviation for RGB images:\t", channel_std[0], channel_std[1], channel_std[2])


"""
Mean for RGB images:			     0.50867474 0.5278123 0.5297691
Standard Deviation for RGB images:	 0.22613746 0.22742623 0.2254186
"""

#### Testing the loaded data

In [None]:
loader, dataset = get_plane_dataset(set_name="train", batch_size=2)

# fetch the first sample once and reuse it for the shape report and both plots
first_img, first_mask = dataset[0]
print("Shape of image and its mask (for an image):", first_img.shape, first_mask.shape)

# channels-first tensor -> channels-last array for imshow; the mask loses its channel axis
vis_img = first_img.numpy().transpose(1, 2, 0)
vis_mask = first_mask.squeeze(0).numpy()

fig, axs = plt.subplots(nrows=1, ncols=2)
for ax, picture in zip(axs, (vis_img, vis_mask)):
    ax.imshow(picture)
    ax.set_xticks([])
    ax.set_yticks([])

plt.show()

### Network

In [None]:
"""
# convolution module as a template layer consists of conv2d layer, batch normalization, and relu activation
"""
class conv(nn.Module):
    """Two stacked 3x3 convolutions (padding 1, so the spatial size is kept).

    The first convolution is always followed by batch normalization and ReLU. The
    second one is followed by them only when `activation` is true; otherwise its raw
    output is returned. `int_ch` is the channel count between the two convolutions
    and defaults to `out_ch`.
    """

    def __init__(self, in_ch, out_ch, int_ch=None, activation=True):
        super().__init__()
        mid_ch = int_ch if int_ch else out_ch

        stages = [
            nn.Conv2d(in_ch, mid_ch, 3, padding=1),
            nn.BatchNorm2d(mid_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_ch, out_ch, 3, padding=1),
        ]
        if activation:
            stages += [nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True)]
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        return self.layer(x)

"""
# downsampling module equal to a conv module followed by a max-pool layer
"""
class down(nn.Module):
    """Encoder step: 2x2 max-pooling followed by a `conv` module."""

    def __init__(self, in_ch, out_ch):
        super().__init__()
        pool = nn.MaxPool2d(2)
        double_conv = conv(in_ch, out_ch)
        self.layer = nn.Sequential(pool, double_conv)

    def forward(self, x):
        return self.layer(x)

"""
# upsampling module equal to a upsample function followed by a conv module
"""
class up(nn.Module):
    """Decoder step: upsample `x1`, pad it to the size of `x2`, concatenate and convolve.

    With `bilinear` the upsampling is a fixed bilinear interpolation and the `conv`
    module uses ``in_ch // 2`` intermediate channels; otherwise a transposed
    convolution halves the channel count while doubling the resolution.
    """

    def __init__(self, in_ch, out_ch, bilinear=False):
        super().__init__()
        if not bilinear:
            self.up = nn.ConvTranspose2d(in_ch, in_ch//2, 2, stride=2)
            self.conv = conv(in_ch, out_ch)
        else:
            self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
            self.conv = conv(in_ch, out_ch, in_ch//2)

    def forward(self, x1, x2):
        upsampled = self.up(x1)

        # size difference to the skip connection, split between the two sides
        pad_h = x2.size(2) - upsampled.size(2)
        pad_w = x2.size(3) - upsampled.size(3)
        left, top = pad_w // 2, pad_h // 2
        upsampled = F.pad(upsampled, [left, pad_w - left, top, pad_h - top])

        # skip connection first, upsampled features second, along the channel axis
        return self.conv(torch.cat([x2, upsampled], dim=1))

"""
# the main model which you need to complete by using above modules.
# you can also modify the above modules in order to improve your results.
"""
class MyModel(nn.Module):
    """U-Net style encoder/decoder producing a single-channel logit map.

    Args:
        bilinear: forwarded to every ``up`` stage; when true the channel counts
            around the bottleneck are halved.
    """

    def __init__(self, bilinear=False):
        super().__init__()
        factor = 2 if bilinear else 1

        # Encoder (attribute names and registration order define the state_dict layout)
        self.input = conv(3, 64)
        self.down1 = down(64, 128)
        self.down2 = down(128, 256)
        self.down3 = down(256, 512)
        self.down4 = down(512, 1024 // factor)

        # Decoder
        self.up1 = up(1024, 512 // factor, bilinear)
        self.up2 = up(512, 256 // factor, bilinear)
        self.up3 = up(256, 128 // factor, bilinear)
        self.up4 = up(128, 64, bilinear)
        self.output = nn.Conv2d(64, 1, kernel_size=1)

    def forward(self, input):
        """Return raw logits (no sigmoid applied) for the given image batch."""
        skip1 = self.input(input)
        skip2 = self.down1(skip1)
        skip3 = self.down2(skip2)
        skip4 = self.down3(skip3)
        decoded = self.down4(skip4)

        # Walk back up, pairing each decoder stage with the matching encoder output.
        stages = (self.up1, self.up2, self.up3, self.up4)
        skips = (skip4, skip3, skip2, skip1)
        for stage, skip in zip(stages, skips):
            decoded = stage(decoded, skip)

        return self.output(decoded)

### Loss Functions

In [None]:
e = 1                                   # smoothing constant added to numerator and denominator to avoid 0/0 (also read by error_iou)

def iou_loss(pred, mask):
    """Differentiable (soft) IoU / Jaccard loss.

    The previous implementation thresholded the probabilities and cast them to
    ``int``, which detaches the result from the autograd graph, so the term
    contributed no gradient when added to the BCE loss. Here the sigmoid
    probabilities are used directly; for exactly binary predictions the value
    is identical to the thresholded IoU.

    Args:
        pred: raw logits, Batch * 1 * H * W.
        mask: ground-truth mask with values in {0, 1}, Batch * 1 * H * W.

    Returns:
        Scalar tensor ``1 - mean(IoU)`` over the batch.
    """
    mask = mask.squeeze(1).float()                  # Batch * 1 * H * W -> Batch * H * W
    prob = torch.sigmoid(pred.squeeze(1))           # logits -> probabilities

    overlap = prob * mask
    intersection = overlap.sum((1, 2))
    union = (prob + mask - overlap).sum((1, 2))     # |A| + |B| - |A and B|

    iou = (intersection + e) / (union + e)
    return 1 - iou.mean()

### Training

In [None]:
"""
# The following is a basic training procedure to train the network
# You need to update the code to get the best performance
# TODO: approx ? lines
"""

# Optimisation hyperparameters
num_epochs, batch_size = 50, 64
learning_rate, weight_decay = 0.001, 1e-4

# Network (moved to the GPU) and the training data loader
model = MyModel()
model = model.cuda()
loader, _ = get_plane_dataset("train", batch_size)

# The two criteria that the training loop sums: BCE on the logits and the IoU loss
crit1, crit2 = nn.BCEWithLogitsLoss(), iou_loss

In [None]:
optim = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)    # Initialize the optimizer as SGD
total_losses, bce_losses, iou_losses = [], [], []

model.train()                                   # make sure batch-norm layers are in training mode
num_batches = max(len(loader), 1)               # guard the per-epoch averages against an empty loader

# start the training procedure
for epoch in range(num_epochs):
    total_loss, bce_total, iou_total = 0.0, 0.0, 0.0
    for (img, mask) in tqdm(loader):
        img, mask = img.cuda(), mask.cuda()
        pred = model(img)

        bce = crit1(pred, mask)
        iou = crit2(pred, mask)
        loss = bce + iou

        optim.zero_grad()
        loss.backward()
        optim.step()

        # .item() yields plain Python floats, so the running sums do not hold GPU tensors
        bce_total += bce.item()
        iou_total += iou.item()
        total_loss += loss.item()

    epoch_total = total_loss / num_batches
    epoch_bce = bce_total / num_batches
    epoch_iou = iou_total / num_batches
    print("Epoch: {}, Loss: {} ({}, {})".format(epoch, epoch_total, epoch_bce, epoch_iou))

    total_losses.append(epoch_total)
    bce_losses.append(epoch_bce)
    iou_losses.append(epoch_iou)

    # per-epoch checkpoint
    torch.save(model.state_dict(), "{}/output/{}_segmentation_model.pth".format(BASE_DIR, epoch))

# Saving the final model. The epoch count is part of the file name because the
# evaluation and instance-segmentation cells load "final50_segmentation_model.pth";
# the previous name ("final_segmentation_model.pth") was never read by those cells.
torch.save(model.state_dict(), "{}/output/final{}_segmentation_model.pth".format(BASE_DIR, num_epochs))

In [None]:
# Plot each recorded loss curve against the epoch number. The x-axis is derived
# from the number of recorded epochs rather than a hard-coded 50, so the cell
# keeps working when num_epochs is changed.
epochs = range(1, len(total_losses) + 1)

fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(20, 14))
fig.suptitle("The two losses over the epochs and their summation", fontsize=16)

curves = [
    ("Total Loss", total_losses),
    ("BCE Loss", bce_losses),
    ("IoU Loss", iou_losses),
]
for ax, (label, values) in zip(axs, curves):
    ax.plot(epochs, values)
    ax.set_ylabel(label)
    ax.set_xlabel("Epoch")

### Evaluation and Visualization

#### Error Metric

In [None]:
def error_iou(pred, mask):
    """Mean thresholded IoU of a batch, computed on the CPU with NumPy.

    Args:
        pred: raw logits, Batch * 1 * H * W.
        mask: ground-truth mask, Batch * 1 * H * W.

    Returns:
        Mean over the batch of (intersection + e) / (union + e).
    """
    target = mask.squeeze(1).int().cpu().numpy()            # Batch * H * W

    # sigmoid is not needed if the model already returns probabilities
    prob = torch.sigmoid(pred.squeeze(1)).cpu().numpy()     # Batch * H * W
    hard = prob >= 0.5                                      # True where the pixel is predicted as foreground

    overlap = (hard & target).sum((1, 2))
    total = (hard | target).sum((1, 2))

    return ((overlap + e) / (total + e)).mean()

In [None]:
"""
# Before starting the evaluation, you need to set the model mode to eval
# You may load the trained model again, in case if you want to continue your code later
# TODO: approx 15 lines
"""
batch_size = 8
model = MyModel().cuda()
model.load_state_dict(torch.load("{}/output/final50_segmentation_model.pth".format(BASE_DIR)))
model = model.eval()                # changing the model to evaluation mode fixes the batch-norm layers
loader, dataset = get_plane_dataset("val", batch_size)

# error_iou returns the mean IoU of one batch. Weight it by the batch size so a
# smaller final batch does not bias the result, then divide by the number of
# images actually seen (the old code divided the sum of batch means by len(loader)).
total_iou = 0.0
num_images = 0
with torch.no_grad():
    for (img, mask) in tqdm(loader):
        img = img.cuda()
        mask = mask.cuda()
        pred = model(img)

        images_in_batch = img.size(0)
        total_iou += float(error_iou(pred, mask)) * images_in_batch
        num_images += images_in_batch

mean_iou = total_iou / max(num_images, 1)
print("\n #images: {}, Mean IoU: {}".format(len(dataset), mean_iou))

In [None]:
"""
# Visualize 3 sample outputs
# TODO: approx 5 lines
"""
vis_num = 3
kernel = np.ones((3, 3), np.uint8)      # structuring element for dilation (also read by get_prediction_mask)

# np.random.randint excludes `high`, so pass len(dataset) to make the last sample reachable
random_idxs = list(np.random.randint(low=0, high=len(dataset), size=vis_num))
print(random_idxs)


def _plot_segmentation_samples(sample_idxs, dilate=False):
    """Plot image / ground-truth mask / predicted mask for each dataset index.

    Args:
        sample_idxs: indices into the global ``dataset``.
        dilate: when true the thresholded prediction is dilated with ``kernel``.
    """
    fig, axs = plt.subplots(nrows=len(sample_idxs), ncols=3, figsize=(20, 14), squeeze=False)
    fig.suptitle("Visualizing 3 instances with their predicted and actual segmentation masks", fontsize=16)

    for ax, title in zip(axs[0], ("Image", "Segmentation Mask", "Predicted Mask")):
        ax.set_title(title)

    for row, sample_idx in zip(axs, sample_idxs):
        # Fetch the sample once so image, mask and prediction come from the same item
        sample = dataset[sample_idx]
        img_tensor, mask_tensor = sample[0], sample[1]

        img = np.transpose(img_tensor.numpy(), [1, 2, 0])
        img = (img - np.min(img)) / (np.max(img) - np.min(img))         # Normalizing the image (for visualization)
        mask = mask_tensor.squeeze(0).numpy()

        with torch.no_grad():                                           # inference only, no autograd graph needed
            logits = model(img_tensor.unsqueeze(0).cuda()).squeeze(0).squeeze(0).cpu()
        pred_mask = (torch.sigmoid(logits) >= 0.5).int().numpy()
        if dilate:
            pred_mask = cv2.dilate(pred_mask.astype("uint8"), kernel)

        for ax, panel in zip(row, (img, mask, pred_mask)):
            ax.imshow(panel)
            ax.set_xticks([])
            ax.set_yticks([])


# First figure: raw thresholded predictions; second figure: the same samples after dilation
_plot_segmentation_samples(random_idxs, dilate=False)
_plot_segmentation_samples(random_idxs, dilate=True)

## Part 3: Instance Segmentation

In this part, you need to obtain the instance segmentation results for the test data by using the trained segmentation model in the previous part and the detection model in Part 1.

### Get Prediction

In [None]:
"""
# Define a new function to obtain the prediction mask by passing a sample data
# For this part, you need to use all the previous parts (predictor, get_instance_sample, data preprocessings, etc)
# It is better to keep everything (as well as the output of this function) on the GPU as tensors to speed up the operations.
# pred_mask is the instance segmentation result and should have different values for different planes.
# TODO: approx 35 lines
"""

# Rebuild the network on the GPU and restore the trained weights from disk
model = MyModel().cuda()
model.load_state_dict(
    torch.load("{}/output/final50_segmentation_model.pth".format(BASE_DIR))
)
# Evaluation mode fixes the batch-norm layers; eval() returns the module itself
model.eval()

def _segment_crop(crop_bgr, out_width, out_height, seg_transforms):
    """Segment one cropped plane and return a binary uint8 mask of size (out_height, out_width)."""
    # The segmentation network is fed 128 * 128 * 3 crops
    crop = np.array(cv2.resize(crop_bgr, (128, 128), interpolation=cv2.INTER_NEAREST))
    crop = seg_transforms(crop).cuda()

    with torch.no_grad():                                               # inference only
        prob = torch.sigmoid(model(crop.unsqueeze(0))).squeeze(1).squeeze(0).cpu().numpy()

    prob = cv2.resize(prob, (out_width, out_height), interpolation=cv2.INTER_NEAREST)
    binary = np.array(prob >= 0.5, dtype=np.int64)
    return cv2.dilate(binary.astype("uint8"), kernel)


def get_prediction_mask(data, detector=None):
    """Build the instance-segmentation mask for one dataset record.

    When ``data["annotations"]`` is non-empty (training / validation) the
    annotated boxes are used as crops; otherwise (testing) the image is split
    with ``divide_into_blocks`` and boxes are predicted per block.

    Fixes over the previous version:
      * instance ids are unique across the whole image (they used to restart
        at 1 in every block, so planes in different blocks shared an id);
      * only foreground pixels are written, so an overlapping box no longer
        erases a previously written instance with zeros;
      * boxes are clipped to the image/block and empty boxes are skipped;
      * the default detector is ``predictor`` instead of ``mask_predictor``,
        which is only created in a later cell (Part 4).
        NOTE(review): ``predictor`` is assumed to be the Part 1 detector, as
        it is used that way in the final comparison cell -- confirm.

    Args:
        data: dataset dict with "width", "height", "file_name", "annotations".
        detector: optional callable returning detectron2-style outputs with
            ``["instances"].pred_boxes``; defaults to the global ``predictor``.

    Returns:
        (img, gt_mask, pred_mask): the BGR image read by cv2, the ground-truth
        id mask (all zeros when no annotations exist) and the predicted id
        mask as a CUDA tensor. Background is 0, instances are 1, 2, ...
    """
    width, height = data["width"], data["height"]

    pred_mask = np.zeros((height, width))
    gt_mask = np.zeros((height, width))                                 # Need to initialize this so there is no error for the testing phase

    img = cv2.imread(data["file_name"])

    seg_transforms = transforms.Compose([
        transforms.ToTensor(),                                          # Converting the image to tensor and change the image format (Channels-Last => Channels-First)
        transforms.Normalize(mean = [0.50867474, 0.5278123, 0.5297691],
                             std = [0.22613746, 0.22742623, 0.2254186])
    ])

    # Training and Validation
    if data["annotations"]:
        for idx, annotation in enumerate(data["annotations"]):
            bbx, bby, bbw, bbh = [round(item) for item in annotation["bbox"]]

            # Clip the box to the image so slices and resized masks always agree in shape
            x1, y1 = max(bbx, 0), max(bby, 0)
            x2, y2 = min(bbx + bbw, width), min(bby + bbh, height)
            if x2 <= x1 or y2 <= y1:
                continue

            mask = GenericMask(annotation["segmentation"], data["height"], data["width"]).mask
            mask = mask[y1:y2, x1:x2]

            gt_region = gt_mask[y1:y2, x1:x2]                           # view into gt_mask
            gt_region[mask > 0] = idx + 1

            crop_mask = _segment_crop(img[y1:y2, x1:x2, :], x2 - x1, y2 - y1, seg_transforms)
            pred_region = pred_mask[y1:y2, x1:x2]                       # view into pred_mask
            pred_region[crop_mask > 0] = idx + 1
    # Testing
    else:
        if detector is None:
            detector = predictor

        divided_images = divide_into_blocks(data, img)
        next_id = 1                                                     # running id shared by all blocks

        for height_blk in range(len(divided_images)):
            for width_blk in range(len(divided_images[0])):
                div_img = divided_images[height_blk][width_blk]
                block_height, block_width, _ = div_img.shape

                div_pred_mask = np.zeros((block_height, block_width))
                pred_bboxes = detector(div_img)["instances"].pred_boxes

                for bbox in pred_bboxes:
                    x1, y1, x2, y2 = [round(item) for item in bbox.cpu().numpy().tolist()]
                    x1, y1 = max(x1, 0), max(y1, 0)
                    x2, y2 = min(x2, block_width), min(y2, block_height)
                    if x2 <= x1 or y2 <= y1:
                        continue

                    crop_mask = _segment_crop(div_img[y1:y2, x1:x2, :], x2 - x1, y2 - y1, seg_transforms)
                    region = div_pred_mask[y1:y2, x1:x2]                # view into div_pred_mask
                    region[crop_mask > 0] = next_id
                    next_id += 1

                # NOTE(review): the offsets assume every block has this block's height/width -- confirm against divide_into_blocks
                pred_mask[height_blk*block_height:(height_blk+1)*block_height, width_blk*block_width:(width_blk+1)*block_width] = div_pred_mask
    return img, gt_mask, torch.tensor(pred_mask, device="cuda")         # gt_mask could be all zero when the ground truth is not given.

### Visualization and Submission

In [None]:
"""
# Visualise the output prediction as well as the GT Mask and Input image for a sample input
# TODO: approx 10 lines
"""
fig, axs = plt.subplots(nrows=vis_num, ncols=3, figsize=(20, 15))
fig.suptitle("Visualizing 3 random images from test set", fontsize=16)

for ax, title in zip(axs[0], ("Image", "Segmentation Mask", "Predicted Mask")):
    ax.set_title(title)

random_test = random.sample(DatasetCatalog.get("data_detection_test"), vis_num)

for row, sample in zip(axs, random_test):
    img, gt_mask, pred_mask = get_prediction_mask(sample)

    # get_prediction_mask returns the image as read by cv2 (BGR); matplotlib expects RGB,
    # so reverse the channel axis to avoid swapped red/blue in the first panel.
    panels = (img[:, :, ::-1], gt_mask, pred_mask.cpu())
    for ax, panel in zip(row, panels):
        ax.imshow(panel)
        ax.set_xticks([])
        ax.set_yticks([])

In [None]:
"""
# ref: https://www.kaggle.com/rakhlin/fast-run-length-encoding-python
# https://www.kaggle.com/c/airbus-ship-detection/overview/evaluation
"""
def rle_encoding(x):
    """Run-length encode a binary mask.

    The mask is flattened in row-major order; each run of consecutive pixels
    equal to 1 is emitted as "<start index> <run length>" with 0-based starts.
    Helper tensors are created on the input's own device, so the function works
    for CPU tensors as well as CUDA tensors (it previously hard-coded "cuda").

    Args:
        x: pytorch tensor, 1 - mask, 0 - background.

    Returns:
        A space-separated string of start/length pairs, or an empty list when
        the mask contains no foreground pixel (kept for backward compatibility).
    """
    dots = torch.where(torch.flatten(x.long()) == 1)[0]
    if len(dots) == 0:
        return []

    device = dots.device

    # Positions inside `dots` where a new run begins (index differs from previous + 1)
    breaks = torch.where(dots[1:] != dots[:-1] + 1)[0] + 1
    run_heads = torch.cat((torch.zeros(1, dtype=torch.long, device=device), breaks))
    run_starts = dots[run_heads]

    # Run length = distance between consecutive run heads (last run ends at len(dots))
    bounds = torch.cat((run_heads, torch.tensor([len(dots)], dtype=torch.long, device=device)))
    run_lengths = bounds[1:] - bounds[:-1]

    runs = torch.stack((run_starts, run_lengths), dim=1).flatten().cpu().numpy()
    return " ".join(str(i) for i in runs)

In [None]:
"""
# You need to upload the csv file on kaggle
# The speed of your code in the previous parts highly affects the running time of this part
"""

preddic = {"ImageId": [], "EncodedPixels": []}


def _append_predictions(data_list):
    """Predict every sample in ``data_list`` and append one row per instance to ``preddic``.

    An image without any predicted instance gets a single row with an empty
    encoding. Background (id 0) is filtered out explicitly, so a mask that
    contains a single non-zero id and no background is still encoded.
    """
    for sample in tqdm(data_list, position=0, leave=True):
        # File name without directory and extension
        image_id = os.path.splitext(os.path.basename(sample["file_name"]))[0]
        sample["image_id"] = image_id

        _, _, pred_mask = get_prediction_mask(sample)

        instance_ids = torch.unique(pred_mask)
        instance_ids = instance_ids[instance_ids != 0]

        if len(instance_ids) == 0:
            preddic["ImageId"].append(image_id)
            preddic["EncodedPixels"].append([])
            continue

        for instance_id in instance_ids:
            preddic["ImageId"].append(image_id)
            preddic["EncodedPixels"].append(rle_encoding(pred_mask == instance_id))


# Writing the predictions of the training set (train + val)
_append_predictions(DatasetCatalog.get("data_detection_{}".format("train")) + DatasetCatalog.get("data_detection_{}".format("val")))

# Writing the predictions of the test set
_append_predictions(DatasetCatalog.get("data_detection_{}".format("test")))

# Let pandas open and close the file itself so the handle is released even on failure
pd.DataFrame(preddic).to_csv("{}/pred.csv".format(BASE_DIR), index=False)

## Part 4: Mask R-CNN

For this part, you need to follow the same procedure as in Part 2, but with the Mask R-CNN configs; the other parts are generally the same as in Part 2.

### Data Loader

### Network

In [None]:
# Model-zoo entry shared by the config file and the pretrained checkpoint
mask_rcnn_config = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

cfg = get_cfg()
cfg.OUTPUT_DIR = "{}/output/".format(BASE_DIR)
cfg.merge_from_file(model_zoo.get_config_file(mask_rcnn_config))

# Model: start from the model-zoo weights, single foreground class
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(mask_rcnn_config)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1                                                                     # plane is the only class

# Data
cfg.DATASETS.TRAIN = ("data_block_detection_train",)
cfg.DATASETS.TEST = ("data_block_detection_val",)
cfg.DATALOADER.NUM_WORKERS = 2

# Solver
cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 5000                                                                              # updated from 500
cfg.SOLVER.STEPS = []

### Training

In [None]:
# Build a trainer from the Mask R-CNN config, request a fresh run (resume=False)
# and start training. `trainer` is read again by the COCO evaluation cell below.
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

### Evaluation and Visualization

In [None]:
# Load the TensorBoard notebook extension and open it on the relative "output" directory.
# NOTE(review): cfg.OUTPUT_DIR is "{BASE_DIR}/output/"; the relative path presumably only
# matches it when the notebook's working directory is BASE_DIR -- confirm.
%load_ext tensorboard
%tensorboard --logdir=output

In [None]:
# Point the config at "model_final.pth" inside cfg.OUTPUT_DIR and build the
# predictor used by the Mask R-CNN visualization cells below.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6                             # test-time score threshold; the trailing 0.7 was presumably an earlier value

mask_predictor = DefaultPredictor(cfg)

In [None]:
"""
# Visualize the output for 3 random test samples
# TODO: approx 10 lines
"""
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 15))
fig.suptitle("Visualizing 6 random prediction results", fontsize=22)

# One random validation sample per subplot
random_test = random.sample(DatasetCatalog.get("data_detection_val"), axs.size)

for ax, sample in zip(axs.flat, random_test):
    image = cv2.imread(sample["file_name"])
    full_image = np.zeros(image.shape)
    divided_images = divide_into_blocks(sample, image)

    for height_blk in range(len(divided_images)):
        for width_blk in range(len(divided_images[0])):
            img = divided_images[height_blk][width_blk]
            outputs = mask_predictor(img)
            visualizer = Visualizer(img[:, :, ::-1],                    # cv2 image is BGR, Visualizer takes RGB
                                    metadata=full_val_meta,
                                    instance_mode=ColorMode.IMAGE_BW)   # scale divides the image (very troublesome)
            visualizer._default_font_size = 4
            out = visualizer.draw_instance_predictions(outputs["instances"].to("cpu"))
            # get_image() is already RGB, which is what plt.imshow expects; the former
            # extra channel flip (meant for cv2 display) swapped red and blue.
            out_img = out.get_image()
            block_height, block_width, _ = out_img.shape

            full_image[height_blk*block_height:(height_blk+1)*block_height, width_blk*block_width:(width_blk+1)*block_width] = out_img

    # normalizing the image so it does not get clipped by matplotlib
    full_image = (full_image - np.min(full_image)) / (np.max(full_image) - np.min(full_image))

    ax.imshow(full_image)
    ax.set_xticks([])
    ax.set_yticks([])

In [None]:
# Run COCO-style evaluation of the trained Mask R-CNN (trainer.model) on the
# "data_block_detection_val" dataset and print the resulting metrics.
evaluator = COCOEvaluator("data_block_detection_val", output_dir=cfg.OUTPUT_DIR)
test_loader = build_detection_test_loader(cfg, "data_block_detection_val")
print(inference_on_dataset(trainer.model, test_loader, evaluator))

In [None]:
def _render_block(block_img, block_predictor, **visualizer_kwargs):
    """Run ``block_predictor`` on one BGR block and return the annotated block as an RGB array."""
    outputs = block_predictor(block_img)
    visualizer = Visualizer(block_img[:, :, ::-1],                      # cv2 image is BGR, Visualizer takes RGB
                            metadata=full_test_meta,
                            **visualizer_kwargs)                        # scale divides the image (very troublesome)
    visualizer._default_font_size = 4
    out = visualizer.draw_instance_predictions(outputs["instances"].to("cpu"))
    # get_image() is RGB, which plt.imshow expects; flipping it back to BGR
    # (as done before) displayed the detection panel with red and blue swapped.
    return out.get_image()


def _rescale(image):
    """Min-max normalize so the float image does not get clipped by matplotlib."""
    return (image - np.min(image)) / (np.max(image) - np.min(image))


fig, axs = plt.subplots(nrows=vis_num, ncols=3, figsize=(20, 15))
fig.suptitle("Visualizing 3 random images from test set", fontsize=16)

for ax, title in zip(axs[0], ("Detection (Part 1)", "Instance Segmentation Mask (Part 3)", "Mask RCNN IS Mask")):
    ax.set_title(title)

random_test = random.sample(DatasetCatalog.get("data_detection_test"), vis_num)

for row, sample in zip(axs, random_test):
    img, gt_mask, pred_mask = get_prediction_mask(sample)

    det_full_image = np.zeros(img.shape)
    mask_full_image = np.zeros(img.shape)

    divided_images = divide_into_blocks(sample, img)
    for height_blk in range(len(divided_images)):
        for width_blk in range(len(divided_images[0])):
            block_img = divided_images[height_blk][width_blk]

            # Mask RCNN Image
            out_img = _render_block(block_img, mask_predictor, instance_mode=ColorMode.IMAGE_BW)
            block_height, block_width, _ = out_img.shape
            mask_full_image[height_blk*block_height:(height_blk+1)*block_height, width_blk*block_width:(width_blk+1)*block_width] = out_img

            # Detection Image
            out_img = _render_block(block_img, predictor)
            block_height, block_width, _ = out_img.shape
            det_full_image[height_blk*block_height:(height_blk+1)*block_height, width_blk*block_width:(width_blk+1)*block_width] = out_img

    panels = (_rescale(det_full_image), pred_mask.cpu(), _rescale(mask_full_image))
    for ax, panel in zip(row, panels):
        ax.imshow(panel)
        ax.set_xticks([])
        ax.set_yticks([])