# **Import Dependencies**

In [2]:
import torch
import torch.nn as nn

# **DarkNet-53**

In [9]:
# Define the characteristics of a convolutional block in our CNN
class ConvUnit(nn.Module):
  '''
  Basic convolutional building block: Conv2d -> BatchNorm2d -> LeakyReLU(0.1).

  Args:
    in_channels:  number of channels in the incoming feature map
    out_channels: number of kernels, i.e. channels in the outgoing feature map
    kernel_size:  side length of the convolution kernel
    stride:       step of the kernel (2 halves the spatial resolution)
    padding:      zero padding added to each border before convolving
  '''
  def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
    super().__init__()
    # The convolution carries no bias of its own: the batch-norm layer right after it
    # already learns a per-channel shift, so a conv bias would be redundant.
    self.conv = nn.Conv2d(
      in_channels,
      out_channels,
      kernel_size=kernel_size,
      stride=stride,
      padding=padding,
      bias=False,
    )
    # Per-channel normalisation to keep activations well scaled during training
    self.bn = nn.BatchNorm2d(num_features=out_channels)
    # Negative inputs are scaled by 0.1 instead of being zeroed, so their gradient never vanishes completely
    self.leaky_relu = nn.LeakyReLU(negative_slope=0.1)

  def forward(self, x):
    '''
    Run x through convolution, then batch normalisation, then the leaky ReLU.
    '''
    convolved = self.conv(x)
    normalised = self.bn(convolved)
    return self.leaky_relu(normalised)

In [10]:
# Define the characteristics of a residual unit in our CNN
class ResidualUnit(nn.Module):
  '''
  Residual block: a 1*1 bottleneck convolution followed by a 3*3 convolution,
  with the block input added back onto the result (skip connection).

  Args:
    in_channels: channels of the incoming feature map; the output has the same number
  '''
  def __init__(self, in_channels):
    super().__init__()
    bottleneck_channels = in_channels // 2
    # 1*1 kernels halve the channel count while leaving the spatial size untouched
    self.conv1 = ConvUnit(in_channels, bottleneck_channels, kernel_size=1, stride=1, padding=0)
    # 3*3 kernels extract features and restore the original channel count so the sum below is valid
    self.conv2 = ConvUnit(bottleneck_channels, in_channels, kernel_size=3, stride=1, padding=1)

  def forward(self, x):
    '''
    Return the input plus the output of the two stacked convolutional units.
    '''
    residual = self.conv1(x)
    residual = self.conv2(residual)
    # skip connection
    return x + residual

In [11]:
# Outline the CNN itself

class DarkNet53(nn.Module):
  '''
  DarkNet-53 feature extractor.

  The network alternates stride-2 ConvUnits (each halves the spatial resolution and
  doubles the channel count) with stacks of ResidualUnits, and finishes with a global
  average pool. forward() returns a tensor of shape (batch, 1024, 1, 1).
  '''
  def __init__(self):
    # NOTE: this must be spelled __init__ (two trailing underscores); otherwise Python never
    # calls it and the module ends up with no layers at all.
    super(DarkNet53, self).__init__()
    # First convolutional layer (1) + batch norm and leaky ReLU, as for every conv unit
    self.conv1 = ConvUnit(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)

    # Downsample (halve the spatial resolution) (1 layer)
    self.conv2 = ConvUnit(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)

    # First residual stage + skip connection (2 convolutional layers)
    # make_layer receives the unit CLASS plus its channel count and repeat count;
    # it builds a fresh unit for every repetition.
    self.res1 = self.make_layer(ResidualUnit, 64, 1)

    # process repeated to capture more complex features
    self.conv3 = ConvUnit(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1) # 1 layer
    self.res2 = self.make_layer(ResidualUnit, 128, 2) # 4 layers
    self.conv4 = ConvUnit(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1) # 1 layer
    self.res3 = self.make_layer(ResidualUnit, 256, 8) # 16 layers
    self.conv5 = ConvUnit(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1) # 1 layer
    self.res4 = self.make_layer(ResidualUnit, 512, 8) # 16 layers
    self.conv6 = ConvUnit(in_channels=512, out_channels=1024, kernel_size=3, stride=2, padding=1) # 1 layer
    self.res5 = self.make_layer(ResidualUnit, 1024, 4) # 8 layers

    # apply global average pooling to the final feature map as per the DarkNet-53 architecture (1 layer)
    # this collapses every channel to a single value, so the output is (batch, 1024, 1, 1)
    self.global_avg_pool = nn.AdaptiveAvgPool2d((1,1))

    # for a total of 53 layers ... what a surprise

  def make_layer(self, unit, out_channels, num_units):
    '''
    Build a stack of identical units.

    Args:
      unit:         callable (e.g. the ResidualUnit class) that takes a channel count and returns a module
      out_channels: channel count handed to every unit
      num_units:    how many units to chain

    Returns:
      nn.Sequential holding num_units freshly constructed units (empty when num_units is 0)
    '''
    return nn.Sequential(*[unit(out_channels) for _ in range(num_units)])

  def forward(self, x):
    '''
    feed input through the layers and return the globally pooled feature map
    '''
    x = self.conv1(x)
    x = self.conv2(x)
    x = self.res1(x)
    x = self.conv3(x)
    x = self.res2(x)
    x = self.conv4(x)
    x = self.res3(x)
    x = self.conv5(x)
    x = self.res4(x)
    x = self.conv6(x)
    x = self.res5(x)

    x = self.global_avg_pool(x)
    return x  # our feature map

# **YOLO Layer**

After our DarkNet-53 CNN extracts the features, we use the 'YOLO' layer to complete object detection and classification in a single forward pass.

In [None]:
class YOLOLayer(nn.Module):
  '''
  YOLO detection head: a 1*1 convolution followed by decoding of the raw outputs
  into normalised bounding boxes, objectness scores and class scores.

  Args:
    in_channels:     channels of the incoming feature map
    num_classes:     number of object classes
    anchors:         sequence of (width, height) anchor sizes, in input-image pixels
    input_dimension: side length of the (square) network input in pixels, used to normalise w and h
  '''
  def __init__(self, in_channels, num_classes, anchors, input_dimension):
    super(YOLOLayer, self).__init__()
    self.num_classes = num_classes
    self.num_anchors = len(anchors)
    self.input_dim = input_dimension
    # Keep the anchors as a (num_anchors, 2) float tensor. Registering them as a buffer means they
    # follow the module across .to(device) / .half(); persistent=False keeps them out of the state_dict.
    self.register_buffer(
      'anchors',
      torch.as_tensor(anchors, dtype=torch.float32).reshape(-1, 2),
      persistent=False,
    )

    # Output layer
    # 1*1 convolution yielding c channels where c = num_anchors * (5 + num_classes).
    # Every cell of the incoming feature map corresponds to a region of the original image and is
    # responsible for detecting objects within that region.
    # For each anchor of each cell we predict [x, y, w, h, conf, classes...]: four offsets that adjust
    # the predefined anchor into a bounding box, one objectness score and one score per class.
    self.conv = nn.Conv2d(in_channels, self.num_anchors * (5 + num_classes), kernel_size=1, stride=1, padding=0)

  def forward(self, x):
    '''
    Decode a feature map into box predictions.

    Args:
      x: feature map of shape (batch, in_channels, grid, grid); the grid is assumed to be square

    Returns:
      boxes:       (batch, num_anchors, grid, grid, 5) holding [bx, by, bw, bh, conf], where bx/by are
                   box centres as a fraction of the image, bw/bh are sizes as a fraction of
                   input_dimension and conf is the sigmoid objectness score
      class_probs: (batch, num_anchors, grid, grid, num_classes) independent sigmoid class scores
    '''
    batch_size, _, grid_size, _ = x.shape

    # The convolution outputs (batch, anchors*(5+classes), grid, grid).
    # .view() splits the channel axis into (anchors, 5+classes) and .permute() moves the per-anchor
    # prediction vector to the end: (batch, anchors, grid_row, grid_col, 5+classes).
    # The raw values only acquire their meaning (x, y, w, h, conf, classes) through the loss function.
    prediction = self.conv(x).view(batch_size, self.num_anchors, 5 + self.num_classes, grid_size, grid_size)
    prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()

    # Decoding formulae:
    #      bx = (sigmoid(tx) + cell column) / grid size
    #      by = (sigmoid(ty) + cell row)    / grid size
    #      bw = anchor w * exp(tw) / input dimension
    #      bh = anchor h * exp(th) / input dimension
    #      conf = sigmoid(t_conf)
    #      class probs = sigmoid(t_class)
    tx = prediction[..., 0]
    ty = prediction[..., 1]
    tw = prediction[..., 2]
    th = prediction[..., 3]

    conf = torch.sigmoid(prediction[..., 4])
    # torch.sigmoid is element-wise and takes no dim argument (unlike softmax)
    class_probs = torch.sigmoid(prediction[..., 5:])

    # Cell coordinates, created on the same device/dtype as the predictions.
    # With indexing='ij' the FIRST result varies along dim 0 (rows, i.e. y) and the SECOND along
    # dim 1 (columns, i.e. x), matching the (grid_row, grid_col) layout of tx/ty.
    cells = torch.arange(grid_size, device=prediction.device, dtype=prediction.dtype)
    grid_y, grid_x = torch.meshgrid(cells, cells, indexing='ij')

    # sigmoid keeps the predicted centre inside its own cell; adding the cell index places it in the image
    bx = (torch.sigmoid(tx) + grid_x) / grid_size
    by = (torch.sigmoid(ty) + grid_y) / grid_size

    # Anchors are reshaped to (1, num_anchors, 1, 1) so they broadcast over batch and grid.
    # exp() turns the predicted log-scale offset into a strictly positive multiplier of the anchor size;
    # dividing by the input dimension expresses the result as a fraction of the image.
    anchor_w = self.anchors[:, 0].view(1, -1, 1, 1)
    anchor_h = self.anchors[:, 1].view(1, -1, 1, 1)
    bw = anchor_w * torch.exp(tw) / self.input_dim
    bh = anchor_h * torch.exp(th) / self.input_dim

    # stack all box predictions into a single tensor
    boxes = torch.stack((bx, by, bw, bh, conf), dim=-1)

    return boxes, class_probs

# **YOLOv3**

In [13]:
class YOLOv3(nn.Module):
  '''
  YOLOv3 detector: a DarkNet-53 backbone with three YOLO detection heads.

  Args:
    num_classes:     number of object classes
    input_dimension: side length of the square input image in pixels, forwarded to every
                     YOLOLayer to normalise box sizes. Defaults to 416 -- presumably the
                     resolution the pixel anchors below were designed for; confirm for your data.
  '''
  def __init__(self, num_classes, input_dimension=416):
    super(YOLOv3, self).__init__()
    self.base = DarkNet53()
    # YOLOLayer requires the input dimension as its fourth argument
    self.yolo1 = YOLOLayer(1024, num_classes, anchors=[(116, 90), (156, 198), (373, 326)], input_dimension=input_dimension)
    self.yolo2 = YOLOLayer(512, num_classes, anchors=[(30, 61), (62, 45), (59, 119)], input_dimension=input_dimension)
    self.yolo3 = YOLOLayer(256, num_classes, anchors=[(10, 13), (16, 30), (33, 23)], input_dimension=input_dimension)

  def forward(self, x):
    '''
    We pass x through darknet, extracting feature maps at multiple scales

    Three YOLO Layers are implemented to detect objects of variable size
    - Layer 1 -> deep map (1024 channels, after res5) -> large objects
    - Layer 2 -> mid map (512 channels, after res4) -> medium objects
    - Layer 3 -> shallow map (256 channels, after res3) -> small objects

    Returns a tuple of three (boxes, class_probs) pairs, one per head.

    We later apply NMS to remove duplicates ( boxes with >= 40% overlap )
    '''
    # The backbone stages are run one by one instead of calling self.base(x): base.forward only
    # returns the globally pooled 1024-channel 1*1 map, which has neither the 512/256 channels that
    # yolo2/yolo3 expect nor any spatial grid left to place boxes on.
    base = self.base
    x = base.conv2(base.conv1(x))
    x = base.res1(x)
    x = base.res2(base.conv3(x))
    shallow_map = base.res3(base.conv4(x))         # 256 channels, input downsampled 8x
    mid_map = base.res4(base.conv5(shallow_map))   # 512 channels, input downsampled 16x
    deep_map = base.res5(base.conv6(mid_map))      # 1024 channels, input downsampled 32x

    return self.yolo1(deep_map), self.yolo2(mid_map), self.yolo3(shallow_map) # (Three detection heads)

# **Loss Function**

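Three-part loss, summed into a single value

1) Bounding box error (x, y, w, h) - squared error between predicted and target boxes

2) Object confidence error - binary cross-entropy on whether a cell/anchor contains an object

3) Class error - cross-entropy between the predicted class scores and the target class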

In [14]:
class YOLOLoss(nn.Module):
  '''
  Sum of three losses over predictions laid out as [x, y, w, h, conf, class scores...]
  along the last axis.

  NOTE: the confidence and class terms use BCEWithLogitsLoss / CrossEntropyLoss, which apply
  sigmoid / softmax internally, so `predictions` must hold RAW (pre-activation) confidence and
  class scores.
  '''
  def __init__(self, num_classes):
    super(YOLOLoss, self).__init__()
    self.num_classes = num_classes
    self.mse = nn.MSELoss()
    self.bce = nn.BCEWithLogitsLoss()
    self.ce = nn.CrossEntropyLoss()

  def forward(self, predictions, targets):
    '''
    Args:
      predictions: tensor of shape (..., 5 + num_classes)
      targets:     tensor of the same shape; targets[..., 5:] is a one-hot (or score) vector whose
                   argmax is taken as the target class index

    Returns:
      scalar tensor: box_loss + conf_loss + class_loss
    '''
    # 1 : loss for bounding box (x,y coords , width, height)
    box_loss = self.mse(predictions[..., :4], targets[..., :4])

    # 2 : object confidence
    conf_loss = self.bce(predictions[..., 4], targets[..., 4])

    # 3 : class predictions
    # CrossEntropyLoss expects the class axis at dim 1, but here it is the LAST axis, so every
    # leading dimension (batch, anchors, grid, ...) is flattened into one sample axis first.
    class_logits = predictions[..., 5:]
    class_targets = targets[..., 5:]
    num_class_scores = class_logits.shape[-1]
    class_loss = self.ce(
      class_logits.reshape(-1, num_class_scores),
      class_targets.reshape(-1, num_class_scores).argmax(-1),
    )

    total_loss = box_loss + conf_loss + class_loss
    return total_loss

In [15]:
# model = YOLOv3(num_classes=2)

# **Non-Maximum Suppression**

We're allowing 3 anchors per cell as per YOLOv3, so there are lots of duplicate boxes to remove

In [None]:
def nonMaxSuppression(predictions, conf_threshold=0.5, iou_threshold=0.4):
    '''
    Greedy, class-agnostic non-maximum suppression.

    Args:
        predictions:    tensor (or nested sequence) of shape (..., D) with D >= 5, whose last axis
                        is laid out as [x_centre, y_centre, width, height, conf, ...]. All leading
                        axes are flattened, so pass one image at a time.
        conf_threshold: detections with conf below this value are discarded up front
        iou_threshold:  a detection is dropped when its IoU with an already kept,
                        higher-confidence detection is >= this value

    Returns:
        tensor of shape (K, D) holding the kept detections, sorted by descending confidence
        (K may be 0)

    Raises:
        ValueError: if the last axis has fewer than 5 entries
    '''
    predictions = torch.as_tensor(predictions)
    if predictions.dim() < 1 or predictions.shape[-1] < 5:
        raise ValueError("predictions must have a last axis of size >= 5: [x, y, w, h, conf, ...]")

    num_fields = predictions.shape[-1]
    if predictions.numel() == 0:
        return predictions.new_zeros((0, num_fields))

    # one row per candidate box, low-confidence candidates removed
    candidates = predictions.reshape(-1, num_fields)
    candidates = candidates[candidates[:, 4] >= conf_threshold]
    if candidates.shape[0] == 0:
        return candidates

    # highest confidence first, so every box is only ever compared against better ones
    candidates = candidates[candidates[:, 4].argsort(descending=True)]

    # centre/size -> corner coordinates
    half_w = candidates[:, 2] / 2
    half_h = candidates[:, 3] / 2
    x1 = candidates[:, 0] - half_w
    y1 = candidates[:, 1] - half_h
    x2 = candidates[:, 0] + half_w
    y2 = candidates[:, 1] + half_h
    areas = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)

    num_candidates = candidates.shape[0]
    suppressed = torch.zeros(num_candidates, dtype=torch.bool, device=candidates.device)
    keep = []
    for i in range(num_candidates):
        if suppressed[i]:
            continue
        keep.append(i)
        if i + 1 == num_candidates:
            break
        # IoU of box i against every lower-confidence box
        inter_w = (torch.minimum(x2[i], x2[i + 1:]) - torch.maximum(x1[i], x1[i + 1:])).clamp(min=0)
        inter_h = (torch.minimum(y2[i], y2[i + 1:]) - torch.maximum(y1[i], y1[i + 1:])).clamp(min=0)
        intersection = inter_w * inter_h
        union = areas[i] + areas[i + 1:] - intersection
        iou = intersection / union.clamp(min=1e-12)  # guard against zero-area boxes
        suppressed[i + 1:] |= iou >= iou_threshold

    return candidates[keep]
    