In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

In [2]:
# Mount Google Drive (Colab only) so files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Directory on the mounted Drive that holds the YOLO config file(s).
CONFIG_DIR = "/content/drive/MyDrive/datasets/yoloData/"

In [None]:
# Print the raw yolov3.cfg file for inspection.
!cat /content/drive/MyDrive/datasets/yoloData/yolov3.cfg


In [4]:
import os

def clean_data(lines_):
  """Normalise raw config lines.

  Every line is stripped of surrounding whitespace first; lines that are then
  empty, or that start with ``#`` (comments), are dropped. Stripping before
  filtering matters: otherwise whitespace-only lines would survive as empty
  strings and indented comments would be kept as data.

  Args:
    lines_: iterable of raw text lines.

  Returns:
    List of stripped, non-empty, non-comment lines in their original order.
  """
  stripped_lines = (line.strip() for line in lines_)
  return [line for line in stripped_lines if line and not line.startswith("#")]

def parse_config_file(config_file):
  """Parse a sectioned ``key=value`` config file into a list of block dicts.

  A line starting with ``[`` opens a new block; the text between the brackets
  is stored under the ``"block_name"`` key. Every following ``key=value`` line
  is added to the current block, with key and value stripped and kept as
  strings.

  Args:
    config_file: path of the config file to read.

  Returns:
    List of dicts, one per block, in file order. An empty file yields ``[]``.

  Raises:
    ValueError: if a non-header line contains no ``=``.
  """
  with open(config_file, "r") as f:
    lines = clean_data(f.read().split("\n"))

  result = []
  block = {}

  for line in lines:
    # Defensive: never index into an empty string.
    if not line:
      continue

    if line.startswith("["):
      # A new header closes the block collected so far.
      if "block_name" in block:
        result.append(block)
        block = {}
      block["block_name"] = line[1:-1]
    else:
      # Split on the first "=" only, so values may themselves contain "=".
      key, sep, val = line.partition("=")
      if not sep:
        raise ValueError(f"Malformed config line (expected key=value): {line!r}")
      block[key.strip()] = val.strip()

  # Flush the last block, but do not emit an empty dict for an empty file.
  if block:
    result.append(block)
  return result


# Parse yolov3.cfg into a list of block dicts and display the first block.
config = parse_config_file(os.path.join(CONFIG_DIR, "yolov3.cfg"))
config[0]

{'block_name': 'net',
 'batch': '1',
 'subdivisions': '1',
 'width': '416',
 'height': '416',
 'channels': '3',
 'momentum': '0.9',
 'decay': '0.0005',
 'angle': '0',
 'saturation': '1.5',
 'exposure': '1.5',
 'hue': '.1',
 'learning_rate': '0.001',
 'burn_in': '1000',
 'max_batches': '500200',
 'policy': 'steps',
 'steps': '400000,450000',
 'scales': '.1,.1'}

In [None]:
# Scratch check of the per-cell (x, y) offsets for a 13x13 grid, 3 boxes per cell.
c_y, c_x = torch.meshgrid(torch.arange(13), torch.arange(13), indexing="ij")
# Flatten both index grids to column vectors: one entry per grid cell.
c_x, c_y = c_x.reshape(-1, 1), c_y.reshape(-1, 1)
# repeat(1, 3).view(-1, 2) lists each cell's (x, y) pair three times in a row.
# NOTE(review): unsqueeze(0) makes the leading dim size 1, so [:20] keeps every
# row; [:, :20] would be needed to show only the first 20 — confirm intent.
torch.cat((c_x, c_y), 1).repeat(1, 3).view(-1, 2).unsqueeze(0)[:20]

tensor([[[ 0,  0],
         [ 0,  0],
         [ 0,  0],
         ...,
         [12, 12],
         [12, 12],
         [12, 12]]])

In [5]:
from typing import List, Dict, Tuple


class ShortCut(nn.Module):
  """Marker module for a ``shortcut`` block.

  It has no parameters and no ``forward``; it only records the layer offset
  given by the block's ``from`` entry, which ``YoloV3Net.forward`` reads to
  pick the tensor to add.
  """

  def __init__(self, from_):
    super(ShortCut, self).__init__()
    # The config supplies the offset as a string; keep it as an int.
    layer_offset = int(from_)
    self.from_ = layer_offset


class Route(nn.Module):
  """Marker module for a ``route`` block.

  Stores one or two layer offsets. ``start`` is always converted to ``int``;
  ``end`` is converted only when it is truthy, otherwise it is stored as given
  (``None`` when the route has a single source layer).
  """

  def __init__(self, start, end=None):
    super(Route, self).__init__()
    self.start = int(start)
    self.end = int(end) if end else end


class DetectionModule(nn.Module):
  """Holds the settings of one ``yolo`` block.

  Args:
    anchors: anchor boxes used by this head, as a sequence of pairs or an
      existing tensor with one row per anchor.
    classes: number of classes; accepted as ``int`` or numeric string.
    ignore_thresh: value of the block's ``ignore_thresh`` entry; accepted as
      ``float`` or numeric string.

  Attributes set here:
    anchors: float32 tensor, registered as a non-persistent buffer.
    classes: ``int``.
    ignore_thresh: ``float``.
  """

  def __init__(self, anchors: List[Tuple[int]], classes: int,
               ignore_thresh: float):
    super().__init__()
    # Registering as a buffer makes the anchors follow the module across
    # ``.to(device)`` calls; ``persistent=False`` keeps them out of the
    # state_dict, so existing checkpoints are unaffected.
    self.register_buffer(
        "anchors", torch.as_tensor(anchors, dtype=torch.float32),
        persistent=False)
    self.classes = int(classes)
    # Config values arrive as strings (e.g. ".7"); store a real number.
    self.ignore_thresh = float(ignore_thresh)


class YoloV3Net(nn.Module):

  def __init__(self, config_: List[Dict[str, str]], device="cpu") -> None:
    super().__init__()
    self.config = config_
    self.device = device
    self.net_config = self.config[0]
    self.modules_ = self.create_arc()

  def create_arc(self):
    modules = nn.ModuleList()
    prev_channels = int(self.net_config["channels"])
    layer_out_channel_size = []

    for i, block in enumerate(self.config[1:]):
      module = nn.Sequential()

      match block["block_name"]:

        case "convolutional":

          out_channels, kernel_size, stride, padding, activation = \
          int(block["filters"]), int(block["size"]), int(block["stride"]), \
          int(block["pad"]), block["activation"]

          padding = 0 if not padding else (kernel_size - 1) // 2

          if block.get("batch_normalize", 0):
            conv = nn.Conv2d(prev_channels, out_channels, kernel_size= kernel_size,
                           stride=stride, padding=padding, bias=False)
            module.add_module(f"conv_{i}", conv)
            module.add_module(f"bn_{i}", nn.BatchNorm2d(out_channels))
            module.add_module(f"activation_leaky_{i}", nn.LeakyReLU())
          else:
            conv = nn.Conv2d(prev_channels, out_channels, kernel_size= kernel_size,
                           stride=stride, padding=padding, bias=True)
            module.add_module(f"conv_{i}", conv)


        case "shortcut":
          module.add_module(f"shortcut_{i}", ShortCut(block["from"]))

        case "upsample":
          module.add_module(f"upsample_{i}",
                            nn.Upsample(scale_factor = int(block["stride"])))


        case "route":
          num_route = block["layers"].split(",")
          if len(num_route) == 1:
            start = int(block["layers"])
            if start > 0:
              start = start - i
            out_channels = layer_out_channel_size[i + start]
            module.add_module(f"route_{i}",
                            Route(start))
          else:
            start, end = block["layers"].split(",")
            start = int(start.strip())
            end = int(end.strip())
            if start > 0:
              start = start - i
            if end > 0:
              end = end - i
            out_channels = layer_out_channel_size[i + start] + \
                                  layer_out_channel_size[i + end]
            module.add_module(f"route_{i}",
                            Route(start, end))

        case "yolo":
          mask = [int(i) for i in block["mask"].split(",")]
          all_anchors = [int(i) for i in block["anchors"].split(",")]
          packed_anchors = [(all_anchors[i], all_anchors[i+1]) for i in
                            range(0, len(all_anchors), 2)]
          anchors = [packed_anchors[i] for i in mask]
          anchors = torch.tensor(anchors).float().to(self.device)
          classes, ignore_thresh = block["classes"], block["ignore_thresh"]
          module.add_module(f"detection_{i}",
                            DetectionModule(anchors, classes, ignore_thresh))


      modules.append(module)
      prev_channels = out_channels
      layer_out_channel_size.append(out_channels)
    return modules

  def forward(self, x):
    cache_output = {}
    count_predictor = 0
    for i, block in enumerate(self.config[1:]):
      match block["block_name"]:

        case "convolutional":
          x = self.modules_[i](x)

        case "shortcut":
          from_ = int(self.modules_[i][0].from_)
          x = cache_output[i-1] + cache_output[i + from_]

        case "route":
          start = int(self.modules_[i][0].start)
          end = self.modules_[i][0].end
          if end:
            x = torch.cat((cache_output[i + start], cache_output[i + end]), 1)
          else:
            x = cache_output[i + start]

        case "upsample":
          x = self.modules_[i](x)

        case "yolo":
          # batch * 255 * 13 * 13 ===> [x, y, w , h, os, ...]
          x = self.predict_detection(x, self.modules_[i][0].classes, self.modules_[i][0].anchors)
          if count_predictor == 0:
            predictions = x
            count_predictor += 1
          else:
            predictions = torch.cat((predictions, x), 1)

      cache_output[i] = x
      # batch * ... * 85
    return predictions

  def predict_detection(self, x, classes, anchors):
    # x = [batch, C, W, H]
    grid_size = x.size(2)
    num_object_per_cell = len(anchors)
    detected_obj_vec_size = 5 + classes

    #  finally ==> [btz, 13 * 13 * 3, 85]

    # 4d --> 3d
    result = x.view(x.size(0), detected_obj_vec_size * num_object_per_cell,
           grid_size * grid_size)
    # 255 * (13 * 13)  ---> (13* 13) * 255
    result = result.transpose(1, 2).contiguous()
    # (13* 13) * 255 ---> (13 * 13 * 3) * 85
    result = result.view(x.size(0), grid_size * grid_size * num_object_per_cell,
                 detected_obj_vec_size)

    c_y, c_x = torch.meshgrid(torch.arange(grid_size), torch.arange(grid_size), indexing="ij")
    c_x, c_y = c_x.reshape(-1, 1).to(self.device), c_y.reshape(-1, 1).to(self.device)
    coordinate_offset = torch.cat((c_x, c_y), 1).repeat(1, 3).view(-1, 2).unsqueeze(0)


    # x, y cordinate
    result[..., 0:2] = F.sigmoid(result[..., 0:2]) + coordinate_offset

    # w, h  b_w = exp(t_w) * p_w
    anchors = anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)
    result[..., 2:4] = torch.exp(result[..., 2:4]) * anchors

    # objectness score
    result[..., 4:5] = torch.sigmoid(result[..., 4:5])

    # prob per class!
    result[..., 5:detected_obj_vec_size] = torch.sigmoid(result[..., 5:detected_obj_vec_size])

    return result


In [None]:
# Scratch check: tile 3 anchor pairs over a 3x3 grid -> shape (1, 27, 2).
anchors = torch.tensor([[116.,  90.],
        [156., 198.],
        [373., 326.]])
anchors.repeat(3*3, 1).unsqueeze(0)

In [11]:
# Smoke test: build the network on CPU and run a random batch of 8 inputs (3x416x416).
model = YoloV3Net(config, device="cpu")
x = torch.randn(8, 3, 416, 416)
prediction = model(x)

In [10]:
# Sanity check: 10647 rows == 3 boxes per cell over the 13x13, 26x26 and 52x52 grids.
10647 / (3 * (13 * 13 + 26 * 26 + 52 * 52))

1.0

In [14]:
# Size of the per-class part only: skip the first 5 entries of every row.
prediction[..., 5:].size()

torch.Size([8, 10647, 80])

In [None]:
# prediction[..., 4] < threshold
# prediction[..., 5:]

In [None]:
# TODO: non-max suppression!
#def iou(box_1, box_2): ---> return iou

In [None]:
# Drop the model reference and run a garbage-collection pass.
import gc

del model
gc.collect()