In [1]:
%%capture
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
!pip install ensemble-boxes
!pip install livelossplot
!pip install timm
!pip install mapcalc
import livelossplot
import copy
import random

import math
from mapcalc import calculate_map
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random

import tqdm.notebook as tqdm

import PIL
import os
import collections
import itertools

import torchvision
import torchvision.transforms as transforms


import albumentations as al
from albumentations.pytorch import ToTensorV2
import cv2


import torch
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim


import timm

from sklearn.model_selection import train_test_split
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
import os
import random

# One seed drives every random number generator the notebook touches.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)

# Python, NumPy and PyTorch (CPU, current GPU, every GPU) are seeded in turn.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Give up cuDNN auto-tuning in exchange for deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
def seed_worker(worker_id):
    '''
    DataLoader `worker_init_fn`: re-seeds NumPy and Python's `random` inside a worker.

    worker_id: index of the worker, supplied by the DataLoader (not used here).

    The seed comes from torch.initial_seed() and is reduced modulo 2**32 because
    np.random.seed only accepts values that fit in 32 bits.
    '''
    worker_seed = torch.initial_seed() % 2**32
    # The notebook imports NumPy as `np`; the bare name `numpy` is never bound.
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Data 

MultiStage Training Strategy:
- 
Two problems inside the data:
- Many "No Finding" images. (Solution: train a two-part model, one part deciding between finding and no finding, the other predicting bounding boxes; the images with no findings are separated out from the rest of the dataset.)
- Multiple radiographer readings per image. (Solution: we use NMS to threshold the bounding boxes so that only a limited number of duplicates remain.)

No problems:
- If an image has no finding, all radiographers agree on "No Finding".



In [4]:
# Class id -> label; ids are assigned in list order, so 'No Finding' gets id 14.
idx_2_class = dict(enumerate([
    'Aortic enlargement',
    'Atelectasis',
    'Calcification',
    'Cardiomegaly',
    'Consolidation',
    'ILD',
    'Infiltration',
    'Lung Opacity',
    'Nodule/Mass',
    'Other lesion',
    'Pleural effusion',
    'Pleural thickening',
    'Pneumothorax',
    'Pulmonary fibrosis',
    'No Finding',
]))
NUM_OBJ_CLASSES = 14
# Reverse lookup: label -> class id.
class_2_idx = {}
for idx in range(len(idx_2_class)):
    class_2_idx.update({idx_2_class[idx]: idx})

In [5]:
# HYPER PARAMETERS DEFINED HERE
# Batch size of the training DataLoader.
BATCH_SIZE = 24
# Batch size of the validation DataLoader.
TEST_BATCH_SIZE = 24
# Presumably the optimizer learning rate; it is not consumed in this part of the notebook.
LR = 1e-4

In [6]:
# CONSTANTS DEFINED HERE
# Directory holding the training images; note the trailing slash.
TRAIN_PATH = "../input/vinbigdata-original-image-dataset/vinbigdata/train/"
# Annotation tables shipped with the dataset.
train_csv = '../input/vinbigdata-original-image-dataset/vinbigdata/train.csv'
test_csv = '../input/vinbigdata-original-image-dataset/vinbigdata/test.csv'
# Checkpoint path; presumably pre-trained backbone weights (not loaded in this part of the notebook).
train_state_dict_path = '../input/efficientnetb0/BestLoss.pth'
# Full annotation table, read once at notebook start.
train_pd = pd.read_csv(train_csv)

In [7]:
class TrainObjectDetectionDataset(torch.utils.data.Dataset):
    '''
    Object-detection dataset over the images that carry at least one finding.

    Rows with class id 14 ("No Finding") are dropped at construction time and only the
    image ids that still have annotation rows afterwards are served. Each item is an
    augmented image together with its NMS-deduplicated boxes as (cx, cy, w, h, class).
    '''
    def __init__(self, dataframe, unique_ids, transforms, train_path, iou_threshold = 0.5):
        '''
        dataframe: annotation table with the columns image_id, class_id, x_min, y_min, x_max, y_max.
        unique_ids: candidate image ids for this split.
        transforms: albumentations pipeline, called with image / bboxes / classes.
        train_path: directory that holds the `<image_id>.jpg` files.
        iou_threshold: IoU above which same-class boxes are treated as duplicates by NMS.
        '''
        self.dataframe = self.parse(dataframe)
        self.transforms = transforms
        self.iou_threshold = iou_threshold
        self.train_path = train_path

        # Needs self.dataframe, so it has to run after parse().
        self.unique_ids = self.clean_unique_ids(unique_ids)
    def grab_all_bboxes(self):
        '''
        Returns the width / height aspect ratio of every annotation row as a flat list
        (one float per box); this is the input of the K-means anchor clustering.
        '''
        frame = self.dataframe
        # Vectorised over the whole table instead of a Python-level iterrows() loop.
        ratios = (frame['x_max'] - frame['x_min']) / (frame['y_max'] - frame['y_min'])
        return ratios.tolist()

    def clean_unique_ids(self, unique_ids):
        '''
        Keeps, in their original order, the ids that still have at least one row in
        self.dataframe, i.e. drops the images whose only annotations were class 14.
        '''
        # A set gives O(1) lookups; scanning the numpy column per id was O(ids * rows).
        annotated_ids = set(self.dataframe.image_id.values)
        return [image_id for image_id in unique_ids if image_id in annotated_ids]
    def cleanse_files(self, unique_ids):
        '''
        Keeps, in their original order, the ids for which `<id>.jpg` exists in self.train_path.
        '''
        available_files = set(os.listdir(self.train_path))
        return [image_id for image_id in unique_ids if image_id + '.jpg' in available_files]
    def parse(self, dataframe):
        '''
        Removes all entries with the class id 14(No Findings)
        '''
        indices_needed = dataframe.class_id.values != 14
        return dataframe.loc[indices_needed]
    def __len__(self):
        return len(self.unique_ids)
    def __getitem__(self, idx):
        '''
        Loads one image and all bounding boxes annotated on it.

        Returns (image, boxes): the augmented image and a float tensor of shape (N, 5) holding
        YOLO style (cx, cy, w, h, class) rows. Boxes are NMS thresholded per class first to help
        prevent duplicate bounding boxes. N is 0 when the augmentation drops every box.
        Raises FileNotFoundError when the image file cannot be read.
        '''
        image_index_needed = self.unique_ids[idx]
        # Use the directory this dataset was built with rather than the notebook global.
        file_path = os.path.join(self.train_path, image_index_needed + '.jpg')

        # Read in Image (cv2.imread signals failure by returning None, not by raising)
        image_loaded = cv2.imread(file_path)
        if image_loaded is None:
            raise FileNotFoundError(f"Could not read image: {file_path}")
        image_loaded = cv2.cvtColor(image_loaded, cv2.COLOR_BGR2RGB)

        # Locate all annotation rows of this image
        image_ids = self.dataframe.image_id.values == image_index_needed
        rows = self.dataframe.loc[image_ids]
        assert len(rows) > 0, collections.Counter(image_ids)

        # Load In the Bounding Boxes (pascal_voc: x_min, y_min, x_max, y_max)
        bboxes = torch.tensor(rows[['x_min', 'y_min', 'x_max', 'y_max']].values.tolist(), dtype = torch.float32)
        classes = torch.tensor(rows['class_id'].values.tolist(), dtype = torch.float32)
        # Every reading is equally trusted, so all scores are 1.0
        conf_scores = torch.ones(len(rows))

        # NMS Thresh, applied independently per class
        vals = torchvision.ops.batched_nms(boxes = bboxes, scores = conf_scores, idxs = classes, iou_threshold = self.iou_threshold)
        kept_bboxes = bboxes[vals]
        kept_classes = classes[vals]

        # Augment Image and Bounding box using albumentations (plain lists are its documented input)
        augmentation = self.transforms(image = image_loaded, bboxes = kept_bboxes.tolist(), classes = kept_classes.tolist())
        aug_image = torch.as_tensor(augmentation['image'])
        # reshape keeps the (N, 4) / (N, 1) layout even when no box survives the augmentation
        aug_bboxes = torch.tensor(augmentation['bboxes'], dtype = torch.float32).reshape(-1, 4)
        aug_classes = torch.tensor(augmentation['classes'], dtype = torch.float32).reshape(-1, 1)

        # Convert Pascal_VOC to YOLO (cxcywh)
        YOLO_bboxes = torchvision.ops.box_convert(boxes = aug_bboxes, in_fmt = 'xyxy', out_fmt = 'cxcywh')
        # concatenate classes
        concat_bboxes = torch.cat([YOLO_bboxes, aug_classes], dim = -1)

        return aug_image, concat_bboxes.float()

In [8]:
# Augmentations to use for object detection:
# Side length (pixels) of the square images produced by both pipelines.
IMAGE_SIZE = 2048
# Training pipeline: geometric and photometric augmentation, then normalisation and
# conversion to a tensor. Boxes are passed in pascal_voc (x_min, y_min, x_max, y_max)
# pixel coordinates and their labels travel in the `classes` field.
TRAIN_AUGMENTATIONS_OBJ = al.Compose([
    # Lower and upper scale bound are both 0.9, so the crop area fraction is fixed.
    al.RandomResizedCrop(IMAGE_SIZE, IMAGE_SIZE, scale = (0.9, 0.9)),
    al.HorizontalFlip(p=0.5),
    # NOTE(review): shift limits of 0.2 look very small for this transform - confirm they are intended.
    al.HueSaturationValue(p=0.2, hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2),
    # rotate_limit=0: only tiny shifts and scale changes, no rotation.
    al.ShiftScaleRotate(p=0.2, shift_limit=0.0025, scale_limit=0.01, rotate_limit=0),
    
    # Blacks out 16 holes of at most 16x16 pixels.
    al.Cutout(p=0.2, max_h_size=16, max_w_size=16, fill_value=(0., 0., 0.), num_holes=16),
    
    # At most one blur / noise variant per sample.
    al.OneOf([
        al.MotionBlur(blur_limit=(3, 5)),
        al.MedianBlur(blur_limit=5),
        al.GaussianBlur(blur_limit=(3, 5)),
        al.GaussNoise(var_limit=(5.0, 30.0)),
    ], p=0.7),
    
    
    al.RandomGamma(gamma_limit=(70, 130), p=0.3),
    al.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.75),
    
    # Normalize() is called without arguments, i.e. with the library defaults.
    al.Normalize(),
    ToTensorV2()
], bbox_params = al.BboxParams(format = 'pascal_voc', label_fields = ['classes']))

# Augmentations at Test Time/Val Time
# Deterministic: plain resize to IMAGE_SIZE, normalisation and tensor conversion.
TEST_AUGMENTATIONS_OBJ = al.Compose([
    al.Resize(IMAGE_SIZE, IMAGE_SIZE),
    al.Normalize(),
    ToTensorV2()
], bbox_params = al.BboxParams(format = 'pascal_voc', label_fields = ['classes']))


In [9]:
def unique(list_of_vals):
    '''
    Reproducible computation of the unique values of an iterable.

    Returns a list with the distinct values of `list_of_vals` in order of first
    appearance, so the result does not depend on hashing order between runs.
    '''
    values = list(list_of_vals)
    try:
        # dicts preserve insertion order: first-occurrence order in O(n).
        return list(dict.fromkeys(values))
    except TypeError:
        # Unhashable items (e.g. lists): fall back to the quadratic scan.
        distinct = []
        for value in values:
            if value not in distinct:
                distinct.append(value)
        return distinct

In [10]:
# Generate Validation and Training Set
# Distinct image ids in order of first appearance (train_pd has one row per annotation).
unique_ids = unique(train_pd.image_id)
# 99% / 1% split over image ids; random_state pins the shuffle.
train_ids, val_ids = train_test_split(unique_ids, train_size = 0.99, test_size = 0.01, random_state = 42)

  0%|          | 0/67914 [00:00<?, ?it/s]

In [11]:
# Create Dataset
# Both datasets share the annotation table; they differ in the id split and in the pipeline
# (augmenting for training, deterministic resize for validation).
TrainOBJDataset = TrainObjectDetectionDataset(train_pd, train_ids, TRAIN_AUGMENTATIONS_OBJ, TRAIN_PATH);
ValOBJDataset = TrainObjectDetectionDataset(train_pd, val_ids, TEST_AUGMENTATIONS_OBJ, TRAIN_PATH);

In [12]:
def custom_collate(images):
    '''
    Collate function for object detection, where the number of ground truth boxes varies per sample.

    images: list of (image, bboxes) samples as produced by the dataset.
    Returns the images stacked along a new first dimension and the per-sample
    box tensors, left unstacked, as a tuple.
    '''
    columns = [*zip(*images)]
    image_column, bbox_column = columns[0], columns[1]
    return torch.stack(image_column), bbox_column

In [13]:
# Create Dataloaders
# custom_collate keeps the per-image box tensors unstacked, since their row counts differ.
# NOTE(review): no num_workers is passed; with the DataLoader default no worker processes are
# started, so worker_init_fn (seed_worker) would not be invoked - confirm this is intended.
TrainOBJDataloader = torch.utils.data.DataLoader(TrainOBJDataset, batch_size = BATCH_SIZE, shuffle = True, collate_fn = custom_collate, worker_init_fn = seed_worker)
ValOBJDataloader = torch.utils.data.DataLoader(ValOBJDataset, batch_size = TEST_BATCH_SIZE, collate_fn = custom_collate, worker_init_fn = seed_worker)

YOLOv3 Model

General Implementation Details:
- EfficientNetB0 backbone: extracts features from the images (pretrained on ImageNet); we grab features from all layers.
- Upsample 3 times and predict on 3 scales.
This is not an exact copy of YOLOv3 and actually modifies many things (call it YOLOvAlpha).

In [14]:
# CONSTANTS FOR YOLOv3
# Side length (pixels) of the square network input.
INPUT_SIZE = 2048
# Number of cells per side of the prediction grid at each of the three scales.
GRID_SIZE_1 = 4
GRID_SIZE_2 = 16
GRID_SIZE_3 = 64
GRID_SIZES = [GRID_SIZE_1, GRID_SIZE_2, GRID_SIZE_3]
# Derived: number of prediction scales.
NUM_GRIDS = len(GRID_SIZES)
NUM_ANCHORS = 5 # Number of anchor aspect ratios fitted by K-means (5, not 3)

Config Classes

In [15]:
class AnchorConfig(nn.Module):
    '''
    Finds anchor aspect ratios by K-means clustering the aspect ratios of the training boxes.
    '''
    def __init__(self, num_anchors):
        '''
        num_anchors: number of clusters, i.e. of anchor aspect ratios to produce.
        '''
        super().__init__()
        self.num_anchors = num_anchors
        self.kmeans = MiniBatchKMeans(n_clusters = self.num_anchors, batch_size = 200, max_iter = 1000, max_no_improvement = 50)
    def train_k_means(self, bounding_boxes):
        '''
        Trains a K-Means classifier to find rough bounding box shapes.
        Input: flat list of width / height ratios, one value per bounding box
        '''
        # Convert to an (N, 1) array: one single-feature sample per box
        bbox = np.expand_dims(np.array(bounding_boxes), -1)
        self.kmeans = self.kmeans.fit(bbox)
    def _get_anchors(self):
        '''
        Returns the cluster centers as a flat list of floats, each one a width / height ratio.
        '''
        # reshape(-1) instead of squeeze(): with a single cluster squeeze() collapses the
        # array to 0-d and tolist() would hand back a bare float instead of a list.
        return np.asarray(self.kmeans.cluster_centers_).reshape(-1).tolist()

In [16]:
# One K-means cluster per anchor.
anchors = AnchorConfig(NUM_ANCHORS)

# Grab Bounding Boxes and Fit on the Training Set
# (grab_all_bboxes yields one width / height ratio per annotation row)
bounding_boxes = TrainOBJDataset.grab_all_bboxes()

anchors.train_k_means(bounding_boxes)

# Cluster centers, i.e. the anchor aspect ratios used to build the anchor boxes.
ANCHOR_BOXES = anchors._get_anchors()

|          | 0/? [00:00<?, ?it/s]

In [17]:
class YOLOv3Config(nn.Module):
    '''
    Key functions needed for YOLOv3: grid and anchor generation, assignment of ground truth
    boxes to anchors, encoding / decoding of box deltas and a few plotting helpers.

    All box coordinates are expressed in pixels of the (input_size x input_size) image.
    Note that since most of the operations involve Bounding Boxes, whose number varies
    between images, none of this is batched.
    '''
    def __init__(self, input_size, NUM_ANCHORS, NUM_GRIDS, device):
        '''
        input_size: side length (pixels) of the square network input.
        NUM_ANCHORS: number of anchors per grid cell (stored, not used by the methods below).
        NUM_GRIDS: number of prediction scales (stored, not used by the methods below).
        device: device on which the grids are created.
        '''
        super().__init__()
        self.input_size = input_size
        self.NUM_ANCHORS = NUM_ANCHORS
        self.NUM_GRIDS = NUM_GRIDS
        self.device = device
    def forward(self, GRID_SIZES, ANCHOR_BOXES):
        '''
        High Level function that generates the grids and anchor boxes for you.
        GRID_SIZES: list of Grid Sizes (cells per side)
        ANCHOR_BOXES: list of anchor aspect ratios (width / height)
        Returns (grids, anchors): one (S, S, 4) grid and one (NUM_ANCHORS, S, S, 4)
        anchor tensor per grid size.
        '''
        grids = []
        anchors = []
        for GRID_SIZE in GRID_SIZES:
            # Generate Anchor Boxes and Grid Cells
            grid = self.generate_grid(GRID_SIZE) # (S, S, 4)
            anchor_boxes = self.generate_anchors(grid, ANCHOR_BOXES, GRID_SIZE) # (NUM_ANCHORS, S, S, 4)
            grids += [grid]
            anchors += [anchor_boxes]
        return grids, anchors
    def forward_GT(self, GRIDS_SIZES, ANCHOR_BOXES, GTS):
        '''
        Builds the training targets of one image at every scale.
        GTS: (N, 5) ground truth boxes as (cx, cy, w, h, class)
        Returns one tuple (grid, anchors, indices, GT_deltas, GT) per grid size; indices and
        GT_deltas are None at a scale where no ground truth box matched an anchor.
        '''
        grids, anchors = self(GRIDS_SIZES, ANCHOR_BOXES)
        scales = []
        for grid, anchor in zip(grids, anchors):
            indices, GT = self._assign_bbox(GTS, anchor)
            # _assign_bbox hands the kept boxes back as xyxy; convert them back to cxcywh
            GT_classes = GT[:, -1].unsqueeze(-1)
            GT = torch.cat([self._VOC2YOLO(GT), GT_classes], dim = -1)

            if indices.dim() == 1:
                # A 1-D index tensor is how _assign_bbox reports "nothing kept"
                scales += [(grid, anchor, None, None, GT)]
            else:
                GT_deltas = self._find_bbox_scale(GT, indices, anchor)
                scales += [(grid, anchor, indices, GT_deltas, GT)]
        return scales
    def _lookup_anchors(self, anchors, anchor_idx):
        '''
        Helps to lookup the anchors needed given anchor indices
        anchors: Tensor(NUM_ANCHORS, GRID_SIZE, GRID_SIZE, 4)
        anchor_idx: Tensor(N, 3), rows are (x, y, type)
        Returns Tensor(N, 4)
        '''
        x_coords = anchor_idx[:, 0]
        y_coords = anchor_idx[:, 1]
        type_idx = anchor_idx[:, 2]
        return anchors[type_idx, x_coords, y_coords, :]

    def _find_bbox_scale(self, GT_BBOX, anchor_box_idx, anchor_boxes):
        '''
        Determines the deltas (cx, cy, w, h) that map each assigned anchor box onto its Ground Truth
        GT_BBOX: (N, 5), cxcywh plus class
        anchor_box_idx: Tensor(N, 3), where indices are expressed as (x, y, type)
        anchor_boxes: Tensor(NUM_ANCHORS, S, S, 4)
        Returns Tensor(N, 5): (delta_x, delta_y, delta_w, delta_h, class)
        '''
        GT_classes = GT_BBOX[:, -1]
        GT_BBOX = GT_BBOX[:, :-1]
        Anchors_Needed = self._lookup_anchors(anchor_boxes, anchor_box_idx)
        # Recall the change parameters
        #   x prime = x + delta x          ->  delta x = x prime - x
        #   y prime = y + delta y          ->  delta y = y prime - y
        #   w prime = w * e^delta w        ->  delta w = ln(w prime / w)
        #   h prime = h * e^delta h        ->  delta h = ln(h prime / h)
        delta_x = GT_BBOX[:, 0] - Anchors_Needed[:, 0]
        delta_y = GT_BBOX[:, 1] - Anchors_Needed[:, 1]
        delta_w = torch.log(GT_BBOX[:, 2] / Anchors_Needed[:, 2])
        delta_h = torch.log(GT_BBOX[:, 3] / Anchors_Needed[:, 3])
        return torch.stack([delta_x, delta_y, delta_w, delta_h, GT_classes], dim = -1)
    def _find_x_and_y(self, GT_BBOXES, anchors_boxes):
        '''
        Computes the (column, row) grid cell that contains the center of every ground truth box
        GT_BBOXES: (N, 5), cxcywh plus class
        Anchor_Boxes: (NUM_ANCHORS, GRID_SIZE, GRID_SIZE, 4)
        or: (NUM_ANCHORS, Columns, Rows, 4)
        Returns a float Tensor(N, 2); N may be 0.
        '''
        _, GRID_SIZE, _, _ = anchors_boxes.shape
        cell_size = self.input_size / GRID_SIZE # Size of each cell(cell_size, cell_size)
        # Vectorised over all boxes, which also keeps the (0, 2) shape for an empty input
        indices = GT_BBOXES[:, :2] // cell_size
        # A center on the far border (or outside the image) would index past the grid, and a
        # negative index would silently wrap around, so pin everything to the valid cells.
        return indices.clamp(min = 0, max = GRID_SIZE - 1)
    def convert_image(self, images):
        '''
        Casts images to float32 and divides them by 255.
        '''
        images = images.to(torch.float32) / 255.0
        return images
    def _assign_bbox(self, GT_BBOXES, anchor_boxes):
        '''
        Assigns every ground truth box to the best overlapping anchor of the cell holding its center.
        GT_BBOXES: (N, 5), the Ground Truth Bboxes (cxcywh plus class)
        Anchor_Boxes: (NUM_ANCHORS, GRID_SIZE, GRID_SIZE, 4), a grid of any scale
        Returns (indices, kept): indices is a Tensor(K, 3) of (column, row, anchor type) and kept a
        Tensor(K, 5) with the matched boxes as xyxy plus class. Boxes whose best IOU is below 0.2
        are dropped; when nothing is kept, indices is an empty 1-D tensor and kept has shape (0, 5).
        '''
        x_y_coord = self._find_x_and_y(GT_BBOXES, anchor_boxes).to(torch.long) # (N, 2) (Column, Row)
        columns = x_y_coord[:, 0]
        rows = x_y_coord[:, 1]

        potential_anchors = anchor_boxes[:, columns, rows, :] # (NUM_ANCHORS, N, 4)
        # GT_BBOXES: (N, 5)
        GT_classes = GT_BBOXES[:, -1]
        GT_BBOXES = GT_BBOXES[:, :-1] # (N, 4)
        # Convert to VOC Coords
        GT_BBOXES = torchvision.ops.box_convert(GT_BBOXES, in_fmt = 'cxcywh', out_fmt = 'xyxy')
        potential_anchors = torchvision.ops.box_convert(potential_anchors, in_fmt = 'cxcywh', out_fmt = 'xyxy')
        # Select Largest IOU
        N, _ = GT_BBOXES.shape
        selected_box_idx = []
        kept_GT = []
        kept_GT_class = []
        for n in range(N):
            GT = GT_BBOXES[n, :].unsqueeze(0) # (1, 4)
            potential = potential_anchors[:, n, :] # (NUM_ANCHORS, 4)
            iou = torchvision.ops.box_iou(GT, potential)
            max_iou, max_idx = torch.max(iou, dim = 1)
            if max_iou.item() >= 0.2: # We say that 0.2 Threshold for GT_Boxes
                selected_box_idx += [(columns[n], rows[n], max_idx.item())]
                kept_GT += [GT]
                kept_GT_class += [GT_classes[n].unsqueeze(0)]
            # else: the box has no adequate anchor at this scale and is skipped on purpose
        selected_box_idx = torch.tensor(selected_box_idx, device = anchor_boxes.device)
        if len(kept_GT) == 0:
            return selected_box_idx, torch.zeros((0, 5), device = self.device)
        kept_GT = torch.cat(kept_GT)
        kept_GT_class = torch.cat(kept_GT_class).unsqueeze(-1)
        return selected_box_idx, torch.cat([kept_GT, kept_GT_class], dim = -1)
    def scale_up(self, bboxes):
        '''
        Multiplies box coordinates by input_size.
        '''
        return bboxes * self.input_size
    def scale_down(self, bboxes):
        '''
        Divides box coordinates by input_size.
        '''
        return bboxes / self.input_size
    def apply_GT_deltas(self, bboxes, deltas):
        '''
        Applies (delta_x, delta_y, delta_w, delta_h) to boxes; the inverse of _find_bbox_scale.
        bboxes: Tensors(N, 5), cxcywh plus a last column that is carried over unchanged
        deltas: Tensors(N, 4)
        Returns Tensor(N, 5)
        '''
        x_prime = bboxes[:, 0] + deltas[:, 0]
        y_prime = bboxes[:, 1] + deltas[:, 1]
        w_prime = bboxes[:, 2] * torch.exp(deltas[:, 2])
        h_prime = bboxes[:, 3] * torch.exp(deltas[:, 3])

        return torch.stack([x_prime, y_prime, w_prime, h_prime, bboxes[:, -1]], dim = -1)

    def visualize_bbox(self, image, bboxes):
        '''
        Visualizes bounding boxes on top of an image
        image: Tensor(3, H, W)
        bboxes: (N, 5), in YOLO coordinates
        '''
        corner_boxes = self._YOLO2XYXY(bboxes) # (N, 4) as (x, y, w, h)
        fig, ax = plt.subplots()
        N, _ = corner_boxes.shape
        # Display the image as (H, W, 3)
        ax.imshow(image.permute(1, 2, 0))
        for n in range(N):
            bbox = corner_boxes[n, :]
            rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], linewidth = 1, alpha = 1, fill = False)
            ax.add_patch(rect)
        plt.show()

    def visualize_grid(self, image, grid):
        '''
        Visualizes one image and its grid.
        Plots Grid as dots, and not bbox
        image: Tensor(3, H, W)
        grid: Tensor(N, 4), all bounding boxes
        '''
        N, _ = grid.shape
        plt.imshow(image.permute(1, 2, 0))
        for n in range(N):
            bbox = grid[n, :]
            plt.scatter([bbox[0]], [bbox[1]])
        plt.show()
    def generate_anchors(self, grid, anchor_boxes, GRID_SIZE):
        '''
        Generates anchor boxes, in the form of (cx, cy, w, h)
        grid: (S, S, 4), set of grid cells
        anchor_boxes: List of anchor aspect ratios (width / height)
        GRID_SIZE: number of cells per side at this scale
        Returns Tensor(NUM_ANCHORS, S, S, 4): every anchor keeps its cell center, is one cell
        high and (ratio * cell) wide.
        '''
        # Compute the size of one cell
        grid_size = self.input_size / GRID_SIZE
        anchors = []
        for width in anchor_boxes:
            # One clone per ratio replaces a deepcopy per cell; the values are the same
            anchor_set = grid.clone()
            anchor_set[..., 2] = width * grid_size
            anchor_set[..., 3] = grid_size
            anchors += [anchor_set]
        return torch.stack(anchors)
    def _batch_tensors(self, tensor, batch_size):
        '''
        Repeats every entry along dimension 0 batch_size times (torch.repeat_interleave).
        Note that no new batch dimension is added: Tensor(D, ...) becomes Tensor(D * batch_size, ...)
        '''
        repeated = torch.repeat_interleave(tensor, batch_size, dim = 0)
        return repeated
    def generate_grid(self, grid_size):
        '''
        Creates a Grid for the object detector(grid cells)
        Returns a grid of size (S, S, 4): (cx, cy, w, h) for every grid cell, indexed as [column, row].
        '''
        grid_box_width = (1 / grid_size) * self.input_size
        grid_box_height = (1 / grid_size) * self.input_size

        # Centers are computed with Python floats first so the values match a cell-by-cell build
        x_centers = torch.tensor([w * grid_box_width + grid_box_width / 2 for w in range(grid_size)], device = self.device)
        y_centers = torch.tensor([h * grid_box_height + grid_box_height / 2 for h in range(grid_size)], device = self.device)

        grid_centers = torch.empty((grid_size, grid_size, 4), device = self.device) # (Column, Rows, 4)
        grid_centers[..., 0] = x_centers.unsqueeze(1) # cx changes with the first index
        grid_centers[..., 1] = y_centers.unsqueeze(0) # cy changes with the second index
        grid_centers[..., 2] = grid_box_width
        grid_centers[..., 3] = grid_box_height
        return grid_centers
    def _convert_bbox(self, bbox):
        '''
        Inflates a bounding box with 4 values(N, 4) to 5 where all classes are just 0(Helpful for testing)
        '''
        N, _ = bbox.shape
        vals = torch.zeros((N, 5))
        vals[:, :-1] = bbox
        return vals
    def _YOLO2XYXY(self, YOLO):
        '''
        Drops the class column and converts cxcywh to xywh (top-left corner plus size).
        Despite the name the result is NOT xyxy.
        YOLO: Tensor(N, 5)
        '''
        YOLO = YOLO[:, :-1] # (N, 4)
        XYXY = torchvision.ops.box_convert(YOLO, in_fmt = 'cxcywh', out_fmt = 'xywh')
        return XYXY
    def _VOC2YOLO(self, VOC):
        '''
        Helper Function to Convert a VOC based Bounding Box into a YOLO based one (class column dropped)
        VOC: Tensor(N, 5)
        '''
        VOC = VOC[:, :-1] # (N, 4)
        YOLO = torchvision.ops.box_convert(VOC, in_fmt = 'xyxy', out_fmt = 'cxcywh')
        return YOLO
    def _YOLO2VOC(self, YOLO):
        '''
        Converts a YOLO based bounding box to a VOC based bounding box (class column dropped).
        YOLO: Tensor(N, 5)
        '''
        YOLO = YOLO[:, :-1] # (N, 4)
        VOC = torchvision.ops.box_convert(YOLO, in_fmt = 'cxcywh', out_fmt = 'xyxy')
        return VOC

# YOLOv3 Model:

Some Useful Blocks

In [18]:
class ConvBlock(nn.Module):
    '''
    Conv2d followed by SiLU and then BatchNorm2d, applied in that order.
    '''
    def __init__(self, in_features, out_features, kernel_size, padding, groups):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels = in_features,
            out_channels = out_features,
            kernel_size = kernel_size,
            padding = padding,
            groups = groups,
        )
        self.SILU = nn.SiLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = out_features)
    def forward(self, x):
        '''
        x: Tensor(B, in_features, H, W)
        '''
        activated = self.SILU(self.conv(x))
        return self.bn(activated)

In [19]:
class ConvSqueezeExcite(nn.Module):
    '''
    Per-pixel squeeze and excitation: two 1x1 convolutions produce a sigmoid gate
    that rescales the input element-wise.
    '''
    def __init__(self, in_features, inner_features):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.Squeeze = nn.Conv2d(in_features, inner_features, kernel_size = 1)
        self.Excited = nn.Conv2d(inner_features, in_features, kernel_size = 1)
        self.SILU = nn.SiLU(inplace = True)
    def forward(self, x):
        '''
        x: Tensor(B, C, H, W)
        Returns a tensor of the same shape.
        '''
        bottleneck = self.Squeeze(x)
        gate = self.Excited(self.SILU(bottleneck)).sigmoid()
        return gate * x

In [20]:
class DownSampleConvBlock(nn.Module):
    '''
    Strided Conv2d followed by SiLU and then BatchNorm2d, applied in that order.
    '''
    def __init__(self, in_features, out_features, kernel_size, padding, stride, groups):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels = in_features,
            out_channels = out_features,
            kernel_size = kernel_size,
            stride = stride,
            padding = padding,
            groups = groups,
        )
        self.SiLU = nn.SiLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = out_features)
    def forward(self, x):
        '''
        x: Tensor(B, in_features, H, W)
        '''
        activated = self.SiLU(self.conv(x))
        return self.bn(activated)

In [21]:
class RegularSE(nn.Module):
    '''
    Channel-wise Squeeze and Excitation Block whose squeeze step is a global max pool.
    '''
    def __init__(self, in_features, squeezed_features):
        super().__init__()
        self.in_features = in_features
        self.squeezed_features = squeezed_features
        self.Squeeze = nn.Linear(in_features, squeezed_features)
        self.act1 = nn.SiLU(inplace = True)
        self.Expand = nn.Linear(squeezed_features, in_features)
    def forward(self, x):
        '''
        x: Tensor(B, C, H, W)
        Returns x rescaled by one sigmoid gate per channel.
        '''
        # Global max over W, then over H -> (B, C)
        pooled = x.max(dim = -1).values.max(dim = -1).values
        # Bottleneck MLP producing the per-channel gate
        gate = torch.sigmoid(self.Expand(self.act1(self.Squeeze(pooled))))
        return gate[..., None, None] * x
        

In [22]:
class BottleNeck(nn.Module):
    '''
    Residual bottleneck with squeeze and excitation: 1x1 squeeze, 3x3 conv, 1x1 expand and
    an SE gate, added back onto the input through a learnable scalar that starts at zero.
    '''
    def __init__(self, input_size, inner_size, device):
        super().__init__()
        self.device = device
        self.input_size = input_size
        self.inner_size = inner_size
        self.Squeeze = ConvBlock(input_size, inner_size, 1, 0, 1)
        self.Process = ConvBlock(inner_size, inner_size, 3, 1, 1)
        self.Expand = ConvBlock(inner_size, input_size, 1, 0, 1)
        self.SE = RegularSE(input_size, input_size // 16)
        # Zero-initialised, so the block is the identity until gamma is trained
        self.gamma = nn.Parameter(torch.zeros(1, device = device))
    def forward(self, x):
        '''
        x: Tensor(B, input_size, H, W); the output has the same shape.
        '''
        branch = x
        for stage in (self.Squeeze, self.Process, self.Expand, self.SE):
            branch = stage(branch)
        return self.gamma * branch + x
class DownConvolutionBlock(nn.Module):
    '''
    Uses very cheap operations to process and downconvolve the massive image.

    Three stride-2 stages shrink the input by a factor of 8 and a 1x1 projection brings it back
    to 3 channels. That result, scaled by a learnable scalar that starts at zero, is added to a
    bilinear downsample of the raw input.

    NOTE: a class with this name is declared again in a later cell; when the notebook runs
    top to bottom the later declaration is the one that stays bound.
    '''
    def __init__(self, device):
        '''
        device: device on which the gating scalar is created.
        '''
        super().__init__()
        self.device = device
        self.in_features = 3
        self.avgPool = nn.AvgPool2d(kernel_size = 5, padding = 2, stride = 2)
        self.downConv = DownSampleConvBlock(3, 5, 3, 1, 2, 1) # (2048 -> 1024)

        self.downConv2 = DownSampleConvBlock(8, 16, 3, 1, 2, 1) # (1024 -> 512)

        self.downConv3 = DownSampleConvBlock(24, 64, 3, 1, 2, 1) # (512 -> 256)
        self.process3 = nn.Sequential(*[
            BottleNeck(64, 16, self.device) for i in range(3) # A little bit of processing
        ])

        self.proj = nn.Sequential(*[
            ConvBlock(64, 32, 1, 0, 1),
            ConvBlock(32, 3, 1, 0, 1)])
        self.gamma = nn.Parameter(torch.zeros((1), device = self.device))

    def forward(self, x):
        '''
        Initial DownConvolution
        x: Tensor(B, 3, H, W); the shape comments below are for H = W = 2048
        Returns Tensor(B, 3, H / 8, W / 8)
        '''
        avgPool = self.avgPool(x) # (B, 3, 1024, 1024)
        downConv = self.downConv(x) # (B, 5, 1024, 1024)
        # Concatenate Features
        concatted = torch.cat([downConv, avgPool], dim = 1) # (B, 8, 1024, 1024)
        # DownConv again
        avgPool2 = self.avgPool(concatted) # (B, 8, 512, 512)
        downConv2 = self.downConv2(concatted) # (B, 16, 512, 512)
        concatted2 = torch.cat([downConv2, avgPool2], dim = 1) # (B, 24, 512, 512)
        # Conv Stride a Few times
        conv3 = self.process3(self.downConv3(concatted2)) # (B, 64, 256, 256)
        proj = self.proj(conv3) # (B, 3, 256, 256)
        # Resize the skip path to whatever the conv path produced instead of a fixed (256, 256),
        # so inputs other than 2048 x 2048 work; for 2048 the target is still (256, 256).
        interpolated = F.interpolate(x, size = proj.shape[-2:], mode = 'bilinear')
        return proj * self.gamma + interpolated

In [23]:
class DownConvolutionBlock(nn.Module):
    '''
    Uses very cheap operations to process and downconvolve the massive image.

    Three stride-2 stages shrink the input by a factor of 8 and a 1x1 projection brings it back
    to 3 channels. That result, scaled by a learnable scalar that starts at zero, is added to a
    bilinear downsample of the raw input.

    NOTE: a class with this name is also declared in the previous cell; this later
    declaration is the one that stays bound when the notebook runs top to bottom.
    '''
    def __init__(self, device):
        '''
        device: device on which the gating scalar is created.
        '''
        super().__init__()
        self.device = device
        self.in_features = 3
        self.avgPool = nn.AvgPool2d(kernel_size = 5, padding = 2, stride = 2)
        self.downConv = DownSampleConvBlock(3, 5, 3, 1, 2, 1) # (2048 -> 1024)

        self.downConv2 = DownSampleConvBlock(8, 16, 3, 1, 2, 1) # (1024 -> 512)

        self.downConv3 = DownSampleConvBlock(24, 64, 3, 1, 2, 1) # (512 -> 256)
        self.process3 = nn.Sequential(*[
            BottleNeck(64, 16, self.device) for i in range(3) # A little bit of processing
        ])

        self.proj = nn.Sequential(*[
            ConvBlock(64, 32, 1, 0, 1),
            ConvBlock(32, 3, 1, 0, 1)])
        self.gamma = nn.Parameter(torch.zeros((1), device = self.device))

    def forward(self, x):
        '''
        Initial DownConvolution
        x: Tensor(B, 3, H, W); the shape comments below are for H = W = 2048
        Returns Tensor(B, 3, H / 8, W / 8)
        '''
        avgPool = self.avgPool(x) # (B, 3, 1024, 1024)
        downConv = self.downConv(x) # (B, 5, 1024, 1024)
        # Concatenate Features
        concatted = torch.cat([downConv, avgPool], dim = 1) # (B, 8, 1024, 1024)
        # DownConv again
        avgPool2 = self.avgPool(concatted) # (B, 8, 512, 512)
        downConv2 = self.downConv2(concatted) # (B, 16, 512, 512)
        concatted2 = torch.cat([downConv2, avgPool2], dim = 1) # (B, 24, 512, 512)
        # Conv Stride a Few times
        conv3 = self.process3(self.downConv3(concatted2)) # (B, 64, 256, 256)
        proj = self.proj(conv3) # (B, 3, 256, 256)
        # Resize the skip path to whatever the conv path produced instead of a fixed (256, 256),
        # so inputs other than 2048 x 2048 work; for 2048 the target is still (256, 256).
        interpolated = F.interpolate(x, size = proj.shape[-2:], mode = 'bilinear')
        return proj * self.gamma + interpolated

In [24]:
class InvertedResidualBlock(nn.Module):
    '''
    Inverted residual: 1x1 expansion, depthwise 3x3 conv, squeeze and excitation and a 1x1
    projection back to the input width, added onto the input through a learnable scalar
    that starts at zero.
    '''
    def __init__(self, in_features, inner_features, device):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.device = device
        self.expand = ConvBlock(in_features, inner_features, 1, 0, 1)
        # groups == channels makes the 3x3 convolution depthwise
        self.depthwise = ConvBlock(inner_features, inner_features, 3, 1, inner_features)
        self.SE = RegularSE(inner_features, inner_features // 16)
        self.squeeze = ConvBlock(inner_features, in_features, 1, 0, 1)
        self.gamma = nn.Parameter(torch.zeros((1), device = device))
    def forward(self, x):
        '''
        x: Tensor(B, in_features, H, W); the output has the same shape.
        '''
        branch = x
        for stage in (self.expand, self.depthwise, self.SE, self.squeeze):
            branch = stage(branch)
        return self.gamma * branch + x

In [25]:
class TransposedConvBlock(nn.Module):
    '''
    Counterpart of ConvBlock for upsampling: ConvTranspose2d, then SiLU, then BatchNorm2d.
    '''
    def __init__(self, in_features, out_features, kernel_size, output_padding, stride):
        super().__init__()
        self.convT = nn.ConvTranspose2d(
            in_channels = in_features,
            out_channels = out_features,
            kernel_size = kernel_size,
            stride = stride,
            output_padding = output_padding,
        )
        self.act1 = nn.SiLU(inplace = True)
        self.bn1 = nn.BatchNorm2d(num_features = out_features)
    def forward(self, x):
        '''
        x: Tensor(B, in_features, H, W)
        '''
        activated = self.act1(self.convT(x))
        return self.bn1(activated)

Model Blocks(Larger Components)

In [26]:
class ModifiedEfficientNetStudent(nn.Module):
    '''
    Student backbone: a DownConvolutionBlock in front of a timm EfficientNet trunk,
    with RegularSE attention after selected stages and a custom final stage (layer4).
    The Down Convolutional Block is used to quickly downsample super large images.

    forward() returns three feature maps: the outputs of Attention1, Attention3 and layer4.
    '''
    def __init__(self, num_classes, device, model_name = 'efficientnet_b3_pruned', drop_prob = 0.0):
        super().__init__()
        self.num_classes = num_classes
        self.device = device
        self.model_name = model_name
        self.drop_prob = drop_prob
        # Randomly initialised trunk (pretrained = False)
        self.model = timm.create_model(self.model_name, pretrained = False)

        # Stem: custom down-sampler followed by the trunk's own stem layers
        self.downsampled = DownConvolutionBlock(self.device)
        self.conv1 = self.model.conv_stem
        self.bn1 = self.model.bn1
        self.act1 = self.model.act1

        # Re-expose the trunk stages as block0 ... block6
        for stage_idx in range(7):
            setattr(self, f'block{stage_idx}', self.model.blocks[stage_idx])

        # Attention modules applied after block1, block2, block4 and block6 (see forward)
        self.Attention1 = RegularSE(12, 4)
        self.Attention2 = RegularSE(40, 16)
        self.Attention3 = RegularSE(120, 32)
        self.Attention4 = RegularSE(384, 64)

        # Everything up to block4 / Attention3 is frozen; block5, block6, Attention4 and layer4 stay trainable
        frozen_layers = (
            self.downsampled, self.conv1, self.bn1,
            self.block0, self.block1, self.block2, self.block3, self.block4,
            self.Attention1, self.Attention2, self.Attention3,
        )
        for layer in frozen_layers:
            self.freeze(layer)

        # Custom final stage: strided conv, 1x1 projection to 768, then five inverted residual blocks
        final_stage = [
            DownSampleConvBlock(384, 384, 5, 2, 2, 384),
            ConvBlock(384, 768, 1, 0, 1),
        ]
        final_stage.extend(InvertedResidualBlock(768, 1536, self.device) for _ in range(5))
        self.layer4 = nn.Sequential(*final_stage)

        # Classification-head layers. forward() does not call them, but they are
        # registered modules and therefore part of the state_dict.
        self.conv2 = ConvBlock(768, 1536, 1, 0, 1)
        self.global_avg = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(self.drop_prob)
        self.Linear = nn.Linear(1536, self.num_classes)
    def freeze(self, layer):
        '''
        Disables gradients for every parameter of `layer`.
        '''
        for parameter in layer.parameters():
            parameter.requires_grad_(False)
    def unfreeze(self, layer):
        '''
        Enables gradients for every parameter of `layer`.
        '''
        for parameter in layer.parameters():
            parameter.requires_grad_(True)
    def forward(self, x):
        '''
        x: image batch, Tensor(B, C, H, W).
        NOTE(review): the expected input size and the per-stage shapes depend on
        DownConvolutionBlock and the timm model; they are not established here -- confirm.

        Returns (features after Attention1, features after Attention3, features after layer4).
        '''
        # Stem (activation is applied before batch norm here)
        stem = self.bn1(self.act1(self.conv1(self.downsampled(x))))
        # Stages 0-1, then first attention (returned)
        shallow = self.Attention1(self.block1(self.block0(stem)))
        # Stage 2 with its attention
        stage2 = self.Attention2(self.block2(shallow))
        # Stages 3-4, then third attention (returned)
        middle = self.Attention3(self.block4(self.block3(stage2)))
        # Stages 5-6 with the last attention, then the custom final stage (returned)
        deep = self.Attention4(self.block6(self.block5(middle)))
        final = self.layer4(deep)
        return shallow, middle, final

In [27]:
class ClassificationAlpha(nn.Module):
    '''
    Thin wrapper around ModifiedEfficientNetStudent (one output class, dropout 0.1)
    that can restore its own weights from a checkpoint.

    device: device passed to the student and used as map_location when loading
    file_path: optional path to a state_dict saved from a ClassificationAlpha instance;
               when None the randomly initialised weights are kept
    '''
    def __init__(self, device, file_path = None):
        super().__init__()
        self.file_path = file_path
        self.num_classes = 1
        self.device = device
        self.model = ModifiedEfficientNetStudent(self.num_classes, self.device, drop_prob = 0.1)
        # Identity comparison is the idiomatic (and operator-overload-proof) way to test for None
        if file_path is not None:
            # The checkpoint is loaded into this wrapper, so its keys carry the "model." prefix
            self.load_state_dict(torch.load(self.file_path, map_location = self.device))
    def forward(self, x):
        '''
        Returns whatever the wrapped student returns for x.
        '''
        return self.model(x)

In [28]:
class ObjectDetectorQT(nn.Module):
    '''
    Detection head that fuses the current feature map with features coming from the
    previous detection head before predicting.

    in_features: channels of the current feature map x
    lower_in_features: channels of the features handed over by the previous head
    inner_features: channels used inside the head (and of the returned features)
    num_obj_classes: number of object classes C
    device: device forwarded to the BottleNeck blocks
    num_anchors: anchors per grid cell; the prediction has num_anchors * (5 + C) channels
    '''
    def __init__(self, in_features, lower_in_features, inner_features, num_obj_classes, device, num_anchors = 3):
        super().__init__()
        self.in_features = in_features
        self.lower_in_features = lower_in_features
        self.inner_features = inner_features
        self.num_classes = num_obj_classes
        self.num_anchors = num_anchors
        self.device = device

        # Stride-2 transposed conv brings the previous head's features to in_features channels
        self.proj = TransposedConvBlock(self.lower_in_features, self.in_features, 1, 1, 2)
        # Reduces the concatenated (2 * in_features) map to inner_features
        self.proj2 = ConvBlock(2 * self.in_features, self.inner_features, 1, 0, 1)

        bottlenecks = [
            BottleNeck(self.inner_features, self.inner_features // 4, self.device)
            for _ in range(6)
        ]
        self.ConvPipeLine = nn.Sequential(*bottlenecks)

        prediction_channels = self.num_anchors * (5 + self.num_classes)
        self.predict = nn.Conv2d(self.inner_features, prediction_channels, kernel_size = 1)
    def forward(self, x, lower_x):
        '''
        x: Tensor(B, in_features, H, W)
        lower_x: Tensor(B, lower_in_features, H_prime, W_prime), features from the previous Object Detector Layer

        Returns (predictions, features); predictions come from a 1x1 conv over features.
        '''
        # Project lower_x and stack it onto x along the channel axis
        fused = torch.cat([x, self.proj(lower_x)], dim = 1)
        # Reduce channels, then extract features
        features = self.ConvPipeLine(self.proj2(fused))
        return self.predict(features), features

In [29]:
class ObjectDetectorAlpha(nn.Module):
    '''
    First detection head: works on a single feature map, with no features
    from another head mixed in.

    in_features: channels of the incoming feature map
    inner_features: channels used inside the head (and of the returned features)
    num_obj_classes: number of object classes C
    device: device forwarded to the BottleNeck blocks
    num_anchors: anchors per grid cell; the prediction has num_anchors * (5 + C) channels
    '''
    def __init__(self, in_features, inner_features, num_obj_classes, device, num_anchors = 3):
        super().__init__()
        self.in_features = in_features
        self.inner_features = inner_features
        self.num_classes = num_obj_classes
        self.num_anchors = num_anchors
        self.device = device

        # Projection from in_features to inner_features
        self.proj = ConvBlock(self.in_features, self.inner_features, 1, 0, 1)

        bottlenecks = [
            BottleNeck(self.inner_features, self.inner_features // 4, self.device)
            for _ in range(6)
        ]
        self.ConvPipeLine = nn.Sequential(*bottlenecks)

        prediction_channels = self.num_anchors * (5 + self.num_classes)
        self.predict = nn.Conv2d(self.inner_features, prediction_channels, kernel_size = 1)
    def forward(self, x):
        '''
        x: Tensor(B, C, H, W)

        Returns (predictions, features); predictions come from a 1x1 conv over features.
        '''
        features = self.ConvPipeLine(self.proj(x))
        return self.predict(features), features

In [30]:
class YOLOQT(nn.Module):
    '''
    Three-scale YOLO-style detector built on the ClassificationAlpha feature extractor.
    Predictions are made deepest-first; each shallower head receives upsampled
    features from the head before it.

    num_obj_classes: number of object classes C
    num_anchors: anchors per grid cell
    device: device forwarded to the sub-modules
    file_path: optional checkpoint path forwarded to ClassificationAlpha
    '''
    def __init__(self, num_obj_classes, num_anchors, device, file_path = None):
        super().__init__()
        self.num_anchors = num_anchors
        self.num_obj_classes = num_obj_classes
        self.device = device
        self.file_path = file_path

        self.feature_extractor = ClassificationAlpha(self.device, file_path = self.file_path)

        # Detection heads (large, medium, small predictions respectively)
        head_kwargs = dict(num_anchors = self.num_anchors)
        self.obj_detect_1 = ObjectDetectorAlpha(768, 256, self.num_obj_classes, self.device, **head_kwargs)
        self.obj_detect_2 = ObjectDetectorQT(248, 128, 256, self.num_obj_classes, self.device, **head_kwargs)
        self.obj_detect_3 = ObjectDetectorQT(76, 128, 128, self.num_obj_classes, self.device, **head_kwargs)

        # Deepest backbone map: two upsample + process steps, then fusion with the middle map
        self.upsample_3 = TransposedConvBlock(768, 256, 1, 1, 2)
        self.process_3 = BottleNeck(256, 32, self.device)
        self.upsample3_5 = TransposedConvBlock(256, 128, 1, 1, 2)
        self.process3_5 = BottleNeck(128, 16, self.device)
        self.sequential_3 = nn.Sequential(*[BottleNeck(248, 64, self.device) for _ in range(3)])

        # Fused middle map: two upsample + process steps towards the shallow map
        self.upsample_2 = TransposedConvBlock(248, 128, 1, 1, 2)
        self.process_2 = BottleNeck(128, 32, self.device)

        self.upsample_1 = TransposedConvBlock(128, 64, 1, 1, 2)
        self.process_1 = BottleNeck(64, 16, self.device)

        # Head-to-head feature hand-over (large -> medium, medium -> small) and shallow fusion
        self.upsample_small = TransposedConvBlock(256, 128, 1, 1, 2)
        self.process_small = BottleNeck(128, 32, self.device)
        self.upsample_middle = TransposedConvBlock(256, 128, 1, 1, 2)
        self.process_middle = BottleNeck(128, 32, self.device)
        self.sequential_1 = nn.Sequential(*[BottleNeck(76, 24, self.device) for _ in range(3)])

    def forward(self, x):
        '''
        x: image batch handed to the feature extractor.

        Returns (bbox_pred_small, bbox_pred_medium, bbox_pred_large), each
        Tensor(B, anchors(5 + C), S, S) for that scale's grid size S.
        NOTE(review): the channel counts wired in __init__ (76 and 248 after concatenation
        with 64 and 128 upsampled channels) imply the extractor's first two maps carry
        12 and 120 channels -- confirm against the backbone.
        '''
        shallow_map, middle_map, deep_map = self.feature_extractor(x)

        # Large predictions straight from the deepest map
        bbox_pred_large, large_features = self.obj_detect_1(deep_map)

        # Bring the deepest map up through two stride-2 transposed convs
        deep_up = self.process_3(self.upsample_3(deep_map))
        deep_up = self.process3_5(self.upsample3_5(deep_up))

        # Features of the large head, upsampled once for the medium head
        large_handover = self.process_small(self.upsample_small(large_features))

        # Fuse with the middle backbone map and predict medium boxes
        middle_fused = self.sequential_3(torch.cat([middle_map, deep_up], dim = 1))
        bbox_pred_medium, medium_features = self.obj_detect_2(middle_fused, large_handover)

        # Bring the fused middle map up through two stride-2 transposed convs
        middle_up = self.process_2(self.upsample_2(middle_fused))
        middle_up = self.process_1(self.upsample_1(middle_up))

        # Fuse with the shallow backbone map
        shallow_fused = self.sequential_1(torch.cat([shallow_map, middle_up], dim = 1))

        # Features of the medium head, upsampled once for the small head
        medium_handover = self.process_middle(self.upsample_middle(medium_features))

        # Small predictions; the small head's features are not needed further
        bbox_pred_small, _ = self.obj_detect_3(shallow_fused, medium_handover)

        return bbox_pred_small, bbox_pred_medium, bbox_pred_large
        

Helper Classes for Training

In [31]:
class YOLOv3LossConfig(nn.Module):
    '''
    Helper class that configures and computes the detection loss for ONE image.

    input_size, anchor_sizes, grid_sizes: forwarded to YOLOv3Config / forward_GT
    num_obj_classes: number of object classes C
    device: device on which helper index tensors are created

    The loss per scale is
        obj_loss * BCE(objectness of matched slots, 1)
      + cls_loss * CrossEntropy(class logits of matched slots, GT class)
      + mse_loss * MSE(box deltas of matched slots, GT deltas)
      + obj_loss * BCE(objectness of all other slots, 0)
    '''
    def __init__(self, input_size, anchor_sizes, grid_sizes, num_obj_classes, device):
        super().__init__()
        self.input_size = input_size
        self.anchor_sizes = anchor_sizes
        self.grid_sizes = grid_sizes
        self.num_obj_classes = num_obj_classes
        self.device = device

        self.num_anchors = len(self.anchor_sizes)
        self.num_grids = len(self.grid_sizes)

        self.Yconfig = YOLOv3Config(self.input_size, self.num_anchors, self.num_grids, self.device)
        # Variety of Loss Functions
        self.obj_crit = nn.BCEWithLogitsLoss()
        self.MSELoss = nn.MSELoss()
        self.cls_crit = nn.CrossEntropyLoss()
        # Weights
        self.obj_loss = 0.1
        self.mse_loss = 2
        self.cls_loss = 1
    def _not_selected(self, bounding_boxes, indices):
        '''
        Returns every (x, y, anchor) slot of the grid that does NOT appear in `indices`.

        bounding_boxes: (NUM_ANCHORS, S_Scale, S_Scale, 5 + C); only S_Scale is read
        indices: (N, 3) rows of (x, y, anchor_idx)

        Returns a long tensor of shape (M, 3) on self.device, in x-major, then y, then
        anchor order. M is 0 (shape (0, 3)) when every slot is selected.
        '''
        _, S_scale, _, _ = bounding_boxes.shape
        # One host transfer for the whole index tensor; a set gives O(1) membership tests
        selected = {tuple(row) for row in indices[:, :3].tolist()}
        every_slot = itertools.product(range(S_scale), range(S_scale), range(self.num_anchors))
        remaining = [slot for slot in every_slot if slot not in selected]
        if not remaining:
            # torch.tensor([]) would be a 1-D float tensor; keep the (M, 3) integer contract
            return torch.empty((0, 3), dtype = torch.long, device = self.device)
        return torch.tensor(remaining, device = self.device)


    def _loss(self, bbox_scale, GT_scale):
        '''
        Computes the Loss for a given BBox Scale
        bbox_scale: Tensor(NUM_ANCHORS(5 + C), S_scale, S_scale)
        GT_scale: 5-tuple (grid, anchor, index, GT_delta, GT_bbox) as produced by
                  YOLOv3Config.forward_GT for this scale; index is None when no
                  ground-truth box is assigned to the scale

        Returns 0 when index is None, otherwise a scalar loss tensor.
        '''
        _, S_Scale, _ = bbox_scale.shape
        grid, anchor, index, GT_delta, GT_bbox = GT_scale
        if index is None:
            return 0
        # NOTE(review): the looked-up GT anchors are not used below -- candidate for removal
        # once _lookup_anchors is confirmed to be side-effect free
        selected_anchor = self.Yconfig._lookup_anchors(anchor, index)
        # Split the channel axis into (anchor, 5 + C) and move it last: (NUM_ANCHORS, S, S, 5 + C)
        bbox_scale = bbox_scale.view(self.num_anchors, 5 + self.num_obj_classes, S_Scale, S_Scale)
        bbox_scale = bbox_scale.transpose(1, 2).transpose(2, 3)

        # Predictions at the slots matched to ground truth
        selected_bboxes = self.Yconfig._lookup_anchors(bbox_scale, index)
        _, selected_C = selected_bboxes.shape

        # Channel layout per slot: 4 deltas, 1 objectness logit, then C class logits
        class_selected = selected_bboxes[:, selected_C - self.num_obj_classes:]
        bbox_selected = selected_bboxes[:, :selected_C - self.num_obj_classes]
        deltas_selected = bbox_selected[:, :-1]
        obj_selected = bbox_selected[:, -1]

        # Ground truth rows: deltas followed by the class id in the last column
        deltas_GT = GT_delta[:, :-1]
        class_GT = GT_delta[:, -1]

        obj_loss_selected = self.obj_crit(obj_selected, torch.ones_like(obj_selected)) * self.obj_loss
        cls_loss_selected = self.cls_crit(class_selected, class_GT.to(torch.long)) * self.cls_loss
        delta_loss_selected = self.MSELoss(deltas_selected, deltas_GT) * self.mse_loss
        selected_loss = obj_loss_selected + cls_loss_selected + delta_loss_selected

        # Every other slot is only pushed towards "no object"
        not_selected_bboxes_inds = self._not_selected(bbox_scale, index)
        if not_selected_bboxes_inds.shape[0] > 0:
            not_selected_bboxes = self.Yconfig._lookup_anchors(bbox_scale, not_selected_bboxes_inds)
            obj_not_selected = not_selected_bboxes[:, 4]
            not_selected_loss = self.obj_crit(obj_not_selected, torch.zeros_like(obj_not_selected)) * self.obj_loss
        else:
            # No negatives left: a mean over zero elements would be NaN
            not_selected_loss = 0.0
        return selected_loss + not_selected_loss

    def forward(self, bbox_small, bbox_medium, bbox_large, GT_bboxes):
        '''
        bbox_small: Tensor(anchors(5 + C), S_small, S_small)
        bbox_medium: Tensor(anchors(5 + C), S_medium, S_medium)
        bbox_large: Tensor(anchors(5 + C), S_large, S_large)
        GT_bboxes: Tensor(N, 5)

        Note: This operation cannot be batched.
        '''
        # NOTE(review): the divisor is the first (channel) dimension of bbox_small, i.e.
        # anchors * (5 + C), not a box or batch count -- confirm this scaling is intended
        N, _, _ = bbox_small.shape
        # Assign ground truth to each scale
        GT_large, GT_medium, GT_small = self.Yconfig.forward_GT(self.grid_sizes, self.anchor_sizes, GT_bboxes)
        loss_small = self._loss(bbox_small, GT_small)
        loss_medium = self._loss(bbox_medium, GT_medium)
        loss_large = self._loss(bbox_large, GT_large)
        # Add Losses (this will be super large to backprop through)
        total_loss = (loss_small + loss_medium + loss_large) / N
        return total_loss

In [32]:
class YOLOv3Inference(nn.Module):
    '''
    Helper Class For Inference with YOLO: decodes the three prediction grids into
    boxes, filters them by objectness and applies class-wise NMS.

    input_size, anchor_boxes, grid_sizes: forwarded to YOLOv3Config
    num_obj_classes: number of object classes C
    device: device used for anchors and decoded boxes
    obj_thresh: minimum sigmoid objectness to keep a slot (the default 0.0 keeps every slot,
                since sigmoid scores are never negative)
    iou_thresh: IoU threshold for NMS
    '''
    def __init__(self, input_size, anchor_boxes, grid_sizes, num_obj_classes, device, obj_thresh = 0.0, iou_thresh = 0.5):
        super().__init__()
        self.device = device
        self.obj_thresh = obj_thresh
        self.iou_thresh = iou_thresh
        self.input_size = input_size
        self.anchor_boxes = anchor_boxes
        self.grid_sizes = grid_sizes
        self.num_obj_classes = num_obj_classes

        self.num_anchor_boxes = len(self.anchor_boxes)
        self.num_grid_sizes = len(self.grid_sizes)

        self.Yconfig = YOLOv3Config(self.input_size, self.num_anchor_boxes, self.num_grid_sizes, self.device)
    def process_size(self, anchor_size, bbox_shape):
        '''
        Decodes and NMS-filters the predictions of one scale.

        bbox_shape: Tensor(B, ANCHOR_BOXES(5 + C), S_size, S_size)
        anchor_size: Tensor(ANCHOR_BOXES, S_size, S_size, 4)

        Returns a list of B tensors, one per image, each of shape (N_b, 6) with columns
        (4 box values, objectness score, class id). A list is used because NMS keeps a
        different number of boxes per image, so the results cannot be stacked.
        '''
        B, _, S_size, _ = bbox_shape.shape
        # Channel layout per anchor: 4 deltas, 1 objectness logit, then C class logits
        bbox_shape = bbox_shape.view(B, self.num_anchor_boxes, 5 + self.num_obj_classes, S_size, S_size)
        # Softmax over the CLASS axis. Without an explicit dim, a 5-D input is normalised
        # over dim 1 (the anchors), which changes the argmax taken below.
        classes = F.softmax(bbox_shape[:, :, 5:, :, :], dim = 2) # (B, NUM_ANCHORS, C, S_size, S_size)
        _, classes_ind = torch.max(classes, dim = 2) # (B, NUM_ANCHORS, S_size, S_size)

        deltas = bbox_shape[:, :, :4, :, :] # (B, NUM_ANCHORS, 4, S_size, S_size): cx, cy, w, h shifts
        obj_scores = torch.sigmoid(bbox_shape[:, :, 4, :, :]) # (B, NUM_ANCHORS, S_size, S_size)

        threshold = obj_scores >= self.obj_thresh # (B, NUM_ANCHORS, S_size, S_size)
        kept_per_image = []
        for b in range(B):
            threshold_batch = threshold[b] # (NUM_ANCHORS, S_size, S_size)
            # Anchors of the surviving slots, inflated by the config helper
            anchor_selected = self.Yconfig._convert_bbox(anchor_size[threshold_batch, :]) # (N, 5)
            anchor_selected = anchor_selected.to(self.device)
            # Matching deltas, one column per delta channel
            kept_deltas = torch.stack(
                [deltas[b, :, channel][threshold_batch] for channel in range(4)], dim = -1
            ) # (N, 4)

            kept_bbox = self.Yconfig.apply_GT_deltas(anchor_selected, kept_deltas)[:, :-1] # (N, 4)
            kept_classes = classes_ind[b][threshold_batch]
            kept_obj_scores = obj_scores[b][threshold_batch]

            # Class-wise NMS within this scale
            # NOTE(review): batched_nms expects (x1, y1, x2, y2) boxes -- confirm that
            # apply_GT_deltas returns corner format
            kept_idx = torchvision.ops.batched_nms(kept_bbox, kept_obj_scores, kept_classes, iou_threshold = self.iou_thresh)
            kept_bbox = kept_bbox[kept_idx]
            kept_obj_scores = kept_obj_scores[kept_idx].unsqueeze(-1)
            # Class ids are integers; cast explicitly so the concatenation has one dtype
            kept_classes = kept_classes[kept_idx].unsqueeze(-1).to(kept_bbox.dtype)
            kept_per_image.append(torch.cat([kept_bbox, kept_obj_scores, kept_classes], dim = -1)) # (N_b, 6)
        return kept_per_image

    def forward(self, bbox_small, bbox_medium, bbox_large):
        '''
        bbox_small: Tensor(B, NUM_ANCHORS(5 + C), S_small, S_small)
        bbox_medium: Tensor(B, NUM_ANCHORS(5 + C), S_medium, S_medium)
        bbox_large: Tensor(B, NUM_ANCHORS(5 + C), S_large, S_large)

        Returns a list of B tensors of shape (N_b, 6): the boxes of all three scales
        that survive a final class-wise NMS, with columns (4 box values, objectness, class id).
        '''
        B = bbox_small.shape[0]
        # The grids returned alongside the anchors are not needed for decoding
        _, anchors = self.Yconfig(self.grid_sizes, self.anchor_boxes)
        # Anchors arrive ordered large, medium, small; each is Tensor(NUM_ANCHORS, S, S, 4)
        anchor_large, anchor_medium, anchor_small = anchors

        anchor_large = anchor_large.to(self.device)
        anchor_medium = anchor_medium.to(self.device)
        anchor_small = anchor_small.to(self.device)

        small_selected = self.process_size(anchor_small, bbox_small)
        medium_selected = self.process_size(anchor_medium, bbox_medium)
        large_selected = self.process_size(anchor_large, bbox_large)
        # Merge the scales per image and run NMS once more across them
        kept_bboxes = []
        for b in range(B):
            bboxes = torch.cat([small_selected[b], medium_selected[b], large_selected[b]], dim = 0) # (s + m + l, 6)
            bbox = bboxes[:, :4]
            obj = bboxes[:, 4]
            classes = bboxes[:, -1]
            kept_bboxes_ind = torchvision.ops.batched_nms(bbox, obj, classes, iou_threshold = self.iou_thresh)
            kept_bboxes.append(bboxes[kept_bboxes_ind])
        return kept_bboxes
            
        

In [33]:
class YOLOSolver(nn.Module):
    '''
    Bundles the YOLOQT detector with its optimizer, LR schedulers, loss helper and
    inference helper, and exposes the training / validation / evaluation entry points.

    input_size, anchor_boxes, grid_sizes: forwarded to the config, loss and inference helpers
    num_obj_classes: number of object classes
    device: device used for the model and for batches
    file_path: optional checkpoint path forwarded to YOLOQT (feature extractor weights)
    '''
    def __init__(self, input_size, anchor_boxes, grid_sizes, num_obj_classes, device, file_path = None):
        super().__init__()
        self.file_path = file_path
        self.input_size = input_size
        self.anchor_boxes = anchor_boxes
        self.grid_sizes = grid_sizes
        self.num_obj_classes = num_obj_classes
        self.device = device
        self.num_anchors = len(anchor_boxes)
        self.num_grids = len(self.grid_sizes)

        self.Yconfig = YOLOv3Config(self.input_size, self.num_anchors, self.num_grids, self.device)
        self.lossConfig = YOLOv3LossConfig(self.input_size, self.anchor_boxes, self.grid_sizes, self.num_obj_classes, self.device)
        self.YInference = YOLOv3Inference(self.input_size, self.anchor_boxes, self.grid_sizes, self.num_obj_classes, self.device)

        self.model = YOLOQT(self.num_obj_classes, self.num_anchors, self.device, file_path = self.file_path)
        self.optim = optim.Adam(self.model.parameters(), lr = 3e-4, weight_decay = 1e-4)
        # NOTE(review): both schedulers drive the same optimizer and both are stepped once
        # per epoch in training_loop -- confirm the combined schedule is intended
        self.lr_decay = optim.lr_scheduler.StepLR(self.optim, 5, 0.95)
        self.lr_decay2 = optim.lr_scheduler.CosineAnnealingLR(self.optim, 5, eta_min = 1e-7)
    def forward(self, x):
        '''
        Runs a Test Time Run through the Object Detector.
        x: raw image batch; it is passed through Yconfig.convert_image here, so callers
           must NOT convert it beforehand.
        Side effect: switches the module to eval mode.
        Returns the per-image list of kept boxes produced by YOLOv3Inference.
        '''
        self.eval()
        with torch.no_grad():
            grid_small, grid_medium, grid_large = self.model(self.Yconfig.convert_image(x))
            # Extract Bounding Boxes out and Threshold their values
            selected_bounding_boxes = self.YInference(grid_small, grid_medium, grid_large)
            return selected_bounding_boxes
    def compute_loss(self, images, bboxes):
        '''
        Returns the loss SUMMED over the images of the batch, as a tensor of shape (1,).
        images: Tensor(B, C, H, W); converted with Yconfig.convert_image before the model
        bboxes: indexable of B ground-truth tensors, one per image
        '''
        bbox_small, bbox_medium, bbox_large = self.model(self.Yconfig.convert_image(images))
        B, _, _, _ = images.shape
        t_loss = torch.zeros((1), device = self.device)
        # The loss helper works on one image at a time
        for b in range(B):
            bbox = bboxes[b].to(self.device)
            t_loss = t_loss + self.lossConfig(bbox_small[b], bbox_medium[b], bbox_large[b], bbox)
        del bbox_small
        del bbox_medium
        del bbox_large
        torch.cuda.empty_cache()
        return t_loss
    def MaP(self, predicted_bboxes, bboxes):
        '''
        Computes the Mean Average Precision between predicted and GT bounding boxes of one image.
        predicted_bboxes: (N_prime, 6) rows of (4 box values, objectness score, class)
        bboxes: (N, 5) rows of (4 box values, class)
        NOTE(review): both box sets are handed to mapcalc unchanged -- confirm they are in
        the coordinate format calculate_map expects.
        '''
        Ground_Truths = {
            'boxes': bboxes[:, :-1].cpu().numpy().tolist(),
            'labels': bboxes[:, -1].cpu().numpy().tolist(),
        }
        pred = {
            'boxes': predicted_bboxes[:, :-2].cpu().numpy().tolist(),
            'scores': predicted_bboxes[:, -2].cpu().numpy().tolist(),
            'labels': predicted_bboxes[:, -1].cpu().numpy().tolist(),
        }
        MaP = calculate_map(Ground_Truths, pred, 0.4) # VOC MaP @ 0.4 IOU
        return MaP
    def Accuracy(self, images, bboxes):
        '''
        Returns the SUM of per-image MaP over the batch (divide by B for the mean).
        images: raw image batch Tensor(B, C, H, W)
        bboxes: indexable of B ground-truth tensors (N, 5)
        '''
        B, _, _, _ = images.shape
        # forward() already applies convert_image; converting here as well would apply it twice
        selected_bboxes = self.forward(images)
        MAP = 0.0
        for idx in range(B):
            predicted_bounding_boxes = selected_bboxes[idx] # (N', 6)
            bbox = bboxes[idx].to(self.device) # (N, 5)
            MAP += self.MaP(predicted_bounding_boxes, bbox)
        return MAP
    def validation_run(self, valloader):
        '''
        Evaluates the model on valloader.
        Returns (mean MaP per image, mean per-image loss averaged over batches). The loss
        uses the same per-image normalisation as training_loop, so the two are comparable.
        Returns (0.0, 0.0) for an empty loader.
        '''
        self.eval()
        map_sum = 0.0
        loss_sum = 0.0
        num_images = 0
        num_batches = 0
        with torch.no_grad():
            for images, bboxes in tqdm.tqdm(valloader):
                B, _, _, _ = images.shape
                images = images.to(self.device)
                # Accuracy gets a copy so the batch used for the loss is left untouched
                map_sum += self.Accuracy(images.clone(), bboxes)
                loss_sum += self.compute_loss(images, bboxes).item() / B
                num_images += B
                num_batches += 1
        if num_batches == 0:
            return 0.0, 0.0
        return map_sum / num_images, loss_sum / num_batches

    def training_loop(self, trainloader, valloader, NUM_EPOCHS, display_every = 64):
        '''
        Trains the model.
        trainloader: yields (images, bboxes) batches
        valloader: validation loader (only used by the disabled validation code below)
        NUM_EPOCHS: number of epochs
        display_every: maximum number of optimisation steps per epoch; the epoch stops
                       early once this many batches have been processed
        '''
        liveloss = livelossplot.PlotLosses()
        # Only needed by the disabled checkpointing code at the end of the epoch
        best_val_acc = 0
        best_val_loss = 999
        torch.cuda.empty_cache()
        for EPOCH in range(NUM_EPOCHS):
            self.train()
            logs = {}
            total_loss = 0.0
            count = 0
            for images, bboxes in trainloader:
                self.optim.zero_grad()
                B, _, _, _ = images.shape
                images = images.to(self.device)
                # compute_loss sums over the batch; normalise to a per-image loss
                loss = self.compute_loss(images, bboxes) / B
                print(f"STEP: {count}, L: {round(loss.item(), 3)}")
                loss.backward()
                self.optim.step()
                count += 1
                total_loss += loss.item()
                del images
                del bboxes
                torch.cuda.empty_cache()
                if count == display_every:
                    break
            # max(..., 1) keeps an empty loader from raising ZeroDivisionError
            mean_loss = total_loss / max(count, 1)
            logs['loss'] = mean_loss
            print(f"EPOCH: {EPOCH}, total_loss: {mean_loss}")
            self.lr_decay.step()
            self.lr_decay2.step()
            # Validation is currently disabled; re-enable together with the checkpointing below.
            # valMAP, val_loss = self.validation_run(valloader)
            # logs['val_loss'] = val_loss
            # logs['accuracy'] = valMAP
            torch.cuda.empty_cache()
            liveloss.update(logs)
            liveloss.send()
            # Save Highest Performing Model (disabled, needs the validation logs above)
            # if logs['val_loss'] <= best_val_loss:
            #     best_val_loss = logs['val_loss']
            #     torch.save(self.state_dict(), "./BestLoss.pth")
            # if logs['accuracy'] >= best_val_acc:
            #     best_val_acc = logs['accuracy']
            #     torch.save(self.state_dict(), './BestAcc.pth')
            # print(f"E: {EPOCH}, L: {round(logs['loss'], 3)}, VL: {round(logs['val_loss'], 3)}, VA: {round(logs['accuracy'], 3)}")

In [34]:
%%capture
# Build the solver (feature-extractor weights are restored from train_state_dict_path
# when it is not None) and move it to the target device
yoloSolver = YOLOSolver(
    input_size = IMAGE_SIZE,
    anchor_boxes = ANCHOR_BOXES,
    grid_sizes = GRID_SIZES,
    num_obj_classes = NUM_OBJ_CLASSES,
    device = device,
    file_path = train_state_dict_path,
)
yoloSolver.to(device)

In [None]:
# Train for 10 epochs; display_every caps each epoch at 48 optimisation steps
yoloSolver.training_loop(
    trainloader = TrainOBJDataloader,
    valloader = ValOBJDataloader,
    NUM_EPOCHS = 10,
    display_every = 48,
)

  "See the documentation of nn.Upsample for details.".format(mode))


STEP: 0, L: 216.429
STEP: 1, L: 157.321
STEP: 2, L: 179.391
STEP: 3, L: 171.029
STEP: 4, L: 156.688
STEP: 5, L: 172.274
STEP: 6, L: 153.758
STEP: 7, L: 206.983


In [None]:
# Persist the trained solver weights and the anchor boxes the solver was built with
torch.save(obj = yoloSolver.state_dict(), f = "./FinalModel.pth")
torch.save(obj = ANCHOR_BOXES, f = "./AnchorBoxes.pth")