# **Import Dependencies**

In [2]:
import torch
import torch.nn as nn

# **DarkNet-53**

In [9]:
# Define the characteristics of a convolutional block in our CNN
class ConvUnit(nn.Module):
  """Convolution, then batch normalisation, then LeakyReLU(0.1).

  The convolution is built without a bias term.

  Args:
    in_channels: Channel count of the input.
    out_channels: Channel count produced by the convolution.
    kernel_size: Kernel size forwarded to ``nn.Conv2d``.
    stride: Stride forwarded to ``nn.Conv2d``.
    padding: Padding forwarded to ``nn.Conv2d``.
  """

  def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
    super().__init__()
    self.conv = nn.Conv2d(
      in_channels=in_channels,
      out_channels=out_channels,
      kernel_size=kernel_size,
      stride=stride,
      padding=padding,
      bias=False,
    )
    self.bn = nn.BatchNorm2d(num_features=out_channels)
    self.leaky_relu = nn.LeakyReLU(negative_slope=0.1)

  def forward(self, x):
    """Return ``x`` after convolution, normalisation and activation."""
    convolved = self.conv(x)
    normalised = self.bn(convolved)
    return self.leaky_relu(normalised)

In [10]:
# Define the characteristics of a residual unit in our CNN
class ResidualUnit(nn.Module):
  """Two stacked ``ConvUnit`` layers whose output is added back onto the input.

  A 1x1 ``ConvUnit`` halves the channel count (integer division) and a 3x3
  ``ConvUnit`` with padding 1 brings it back to ``in_channels``.

  Args:
    in_channels: Channel count of the input, and of the second convolution's output.
  """

  def __init__(self, in_channels):
    super().__init__()
    bottleneck_channels = in_channels // 2
    self.conv1 = ConvUnit(
      in_channels, bottleneck_channels, kernel_size=1, stride=1, padding=0
    )
    self.conv2 = ConvUnit(
      bottleneck_channels, in_channels, kernel_size=3, stride=1, padding=1
    )

  def forward(self, x):
    """Return ``x`` plus the result of running it through both convolutions."""
    squeezed = self.conv1(x)
    restored = self.conv2(squeezed)
    # Skip connection: the untouched input is summed with the convolved branch.
    return x + restored

In [11]:
# Outline the CNN itself

class DarkNet53(nn.Module):
  """Convolutional backbone that turns a 3-channel image into a feature map.

  Layout: one stride-1 ``ConvUnit`` (3 -> 32 channels) followed by five stages.
  Each stage is a stride-2 ``ConvUnit`` that doubles the channel count
  (64, 128, 256, 512, 1024) followed by a stack of 1, 2, 8, 8 and 4
  ``ResidualUnit`` blocks respectively.
  """

  def __init__(self):
    # NOTE: the constructor must be spelled ``__init__``; under any other name
    # Python never calls it and the module ends up with no layers at all.
    super(DarkNet53, self).__init__()
    # First convolutional layer
    self.conv1 = ConvUnit(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)

    # Downsample (stride 2 reduces the spatial resolution)
    self.conv2 = ConvUnit(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)

    # First residual stage. ``make_layer`` receives the unit *class* plus the
    # channel count and the number of repetitions; it instantiates the units itself.
    self.res1 = self.make_layer(ResidualUnit, 64, 1)

    self.conv3 = ConvUnit(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
    self.res2 = self.make_layer(ResidualUnit, 128, 2)
    self.conv4 = ConvUnit(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1)
    self.res3 = self.make_layer(ResidualUnit, 256, 8)
    self.conv5 = ConvUnit(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1)
    self.res4 = self.make_layer(ResidualUnit, 512, 8)
    self.conv6 = ConvUnit(in_channels=512, out_channels=1024, kernel_size=3, stride=2, padding=1)
    self.res5 = self.make_layer(ResidualUnit, 1024, 4)

  def make_layer(self, unit, out_channels, num_units):
    """Stack ``num_units`` fresh instances of ``unit`` into an ``nn.Sequential``.

    Args:
      unit: Callable (typically a module class) invoked as ``unit(out_channels)``.
      out_channels: Argument handed to every ``unit`` call.
      num_units: How many instances to create.

    Returns:
      An ``nn.Sequential`` holding the created units in order.
    """
    return nn.Sequential(*(unit(out_channels) for _ in range(num_units)))

  def forward(self, x):
    """Run every stage in order and return the output of the last one."""
    x = self.conv1(x)
    x = self.conv2(x)
    x = self.res1(x)
    x = self.conv3(x)
    x = self.res2(x)
    x = self.conv4(x)
    x = self.res3(x)
    x = self.conv5(x)
    x = self.res4(x)
    x = self.conv6(x)
    x = self.res5(x)
    return x  # our feature map (1024 channels)

# **YOLO Layer**

After our DarkNet-53 CNN extracts the features, we use the 'YOLO' layer to complete object detection and classification in a single forward pass.

In [12]:
class YOLOLayer(nn.Module):
  """Detection head: a 1x1 convolution reshaped into per-anchor predictions.

  For every anchor the head emits ``5 + num_classes`` values per grid cell.

  Args:
    in_channels: Channel count of the incoming feature map.
    num_classes: Number of object classes.
    anchors: Sequence of anchors; only its length is used here.
  """

  def __init__(self, in_channels, num_classes, anchors):
    super(YOLOLayer, self).__init__()
    self.num_classes = num_classes
    self.num_anchors = len(anchors)
    # Output layer
    self.conv = nn.Conv2d(in_channels, self.num_anchors * (5 + num_classes), kernel_size=1, stride=1, padding=0)

  def forward(self, x):
    """Return predictions shaped ``(batch, anchors, grid_h, grid_w, 5 + num_classes)``.

    Args:
      x: Feature map of shape ``(batch, in_channels, grid_h, grid_w)``.
    """
    # Height and width are read separately so non-square feature maps reshape
    # correctly (the 1x1 convolution keeps the spatial size of ``x``).
    batch_size, _, grid_h, grid_w = x.shape
    prediction = self.conv(x).view(
      batch_size, self.num_anchors, 5 + self.num_classes, grid_h, grid_w
    )
    # Move the per-anchor value axis to the end.
    prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()
    return prediction

# **YOLOv3**

In [13]:
class YOLOv3(nn.Module):
  """DarkNet-53 backbone with three ``YOLOLayer`` detection heads.

  The heads are built for 1024, 512 and 256 input channels, so each one is fed
  the backbone stage that produces that many channels.

  Args:
    num_classes: Number of object classes predicted by every head.
  """

  def __init__(self, num_classes):
    super(YOLOv3, self).__init__()
    self.base = DarkNet53()
    self.yolo1 = YOLOLayer(1024, num_classes, anchors=[(116, 90), (156, 198), (373, 326)])
    self.yolo2 = YOLOLayer(512, num_classes, anchors=[(30, 61), (62, 45), (59, 119)])
    self.yolo3 = YOLOLayer(256, num_classes, anchors=[(10, 13), (16, 30), (33, 23)])

  def _backbone_features(self, x):
    """Run the backbone stage by stage and keep the last three stage outputs.

    Returns:
      ``(coarse, medium, fine)`` feature maps with 1024, 512 and 256 channels.
    """
    # The stages are walked explicitly (instead of calling ``self.base(x)``)
    # because the backbone's forward only returns its final 1024-channel map.
    base = self.base
    x = base.res1(base.conv2(base.conv1(x)))
    x = base.res2(base.conv3(x))
    fine = base.res3(base.conv4(x))           # 256 channels
    medium = base.res4(base.conv5(fine))      # 512 channels
    coarse = base.res5(base.conv6(medium))    # 1024 channels
    return coarse, medium, fine

  def forward(self, x):
    """Return the predictions of the three heads as a tuple.

    Feeding the single 1024-channel map to all heads cannot work: ``yolo2`` and
    ``yolo3`` contain 1x1 convolutions expecting 512 and 256 input channels.
    """
    # NOTE(review): the heads read the raw backbone stages directly; there is no
    # upsample-and-concatenate path between scales - confirm that is intended.
    coarse, medium, fine = self._backbone_features(x)
    return self.yolo1(coarse), self.yolo2(medium), self.yolo3(fine) # (Three detection heads)

# **Loss Function**

Three-part loss, squared-error approach:

1) Bounding box error (x, y, width, height) - heavy penalty

2) Objectness error - an object incorrectly detected (or missed) in a cell

3) Class error - the difference between the predicted class and the target class

In [14]:
class YOLOLoss(nn.Module):
  """Sum of a box, an objectness and a classification loss.

  Both ``predictions`` and ``targets`` keep their per-cell values on the last
  axis: indices 0-3 are the box, index 4 is the object confidence and the
  remaining entries are the per-class scores.

  Args:
    num_classes: Number of object classes.
  """

  def __init__(self, num_classes):
    super(YOLOLoss, self).__init__()
    self.num_classes = num_classes
    self.mse = nn.MSELoss()
    self.bce = nn.BCEWithLogitsLoss()
    self.ce = nn.CrossEntropyLoss()

  def forward(self, predictions, targets):
    """Return the scalar total loss.

    Args:
      predictions: Tensor of raw outputs, any number of leading dimensions.
      targets: Tensor of the same shape as ``predictions``; the class part is
        reduced to a class index with ``argmax``.
    """
    # NOTE(review): all three terms average over every cell, including cells
    # whose target confidence is 0 - confirm whether box/class terms should be
    # restricted to cells that contain an object.

    # 1 : loss for bounding box (x,y coords , width, height)
    box_loss = self.mse(predictions[..., :4], targets[..., :4])

    # 2 : object confidence
    conf_loss = self.bce(predictions[..., 4], targets[..., 4])

    # 3 : class predictions
    # CrossEntropyLoss treats dim 1 as the class axis, but here the classes sit
    # on the last axis. Flatten every leading dimension so the logits become
    # (N, C) and the targets (N,).
    class_logits = predictions[..., 5:]
    class_logits = class_logits.reshape(-1, class_logits.shape[-1])
    class_targets = targets[..., 5:].argmax(-1).reshape(-1)
    class_loss = self.ce(class_logits, class_targets)

    total_loss = box_loss + conf_loss + class_loss
    return total_loss

In [15]:
# model = YOLOv3(num_classes=2)