In [1]:
import math, os, random, cv2, numpy, torch
import torch.nn as nn

In [3]:
#fix param for YOLOv8n
depth_multiple = 1/3
width_multiple = 1/4
ratio = 2

Backbone


In [4]:
class Conv(nn.Module):
  def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, groups= 1):
    super().__init__()
    self.conv = nn.Conv2d(in_channels= in_channels, out_channels= out_channels, kernel_size= kernel_size, stride= stride, padding= padding, bias= False, groups= groups)
    self.bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.03)
    self.act = nn.SiLU(inplace=True)

  def forward(self, x):
    print(torch.mean(x))
    return self.act(self.bn(self.conv(x)))

In [5]:
class Bottleneck(nn.Module):
  def __init__(self, channels, shortcut= True):
    super().__init__()
    self.conv1 = Conv(in_channels= channels, out_channels= channels, kernel_size= 3, stride= 1, padding= 1)
    self.conv2 = Conv(in_channels= channels, out_channels= channels, kernel_size= 3, stride= 1, padding= 1)
    self.shortcut = shortcut

  def forward(self, x):
    x_in = x #[bs, c, w, h]
    x = self.conv2(self.conv1(x)) #[bs, c, w, h]
    if self.shortcut:
      return x_in + x #[bs, c, w, h]
    return x #[bs, c, w, h]

In [7]:
class C2f(nn.Module):
  def __init__(self, in_channels, out_channels, num_bottlenecks, shortcut= True) -> None:
    super().__init__()
    self.hidden_channels= out_channels//2
    self.num_bottlenecks= num_bottlenecks
    self.conv1 = Conv(in_channels= in_channels, out_channels= out_channels, kernel_size= 1, stride= 1, padding= 0)
    self.bottlenecks = nn.ModuleList([Bottleneck(self.hidden_channels, shortcut= True) for _ in range(self.num_bottlenecks)])
    self.conv2 = Conv(in_channels= (num_bottlenecks+2)*out_channels//2, out_channels= out_channels, kernel_size= 1, stride= 1, padding= 0)

  def forward(self, x):
    x = self.conv1(x) ##[bs, c_out, w, h]

    x1, x2 = x[:, :x.shape[1]//2, :, :], x[:, x.shape[1]//2:, :, :] #[bs, c_out, w, h]

    outputs = [x1, x2] #[bs, c_out, w, h]

    for i in range(self.num_bottlenecks):
      x1 = self.bottlenecks[i](x1) #[bs, 0.5c_out, w, h]
      outputs.append(x1)

    outputs = torch.cat(outputs, dim= 1) #[bs, 0.5*(n + 2)c_out, w, h]

    return self.conv2(outputs)

In [8]:
c2f=C2f(in_channels=64,out_channels=128,num_bottlenecks=2)
print(f"{sum(p.numel() for p in c2f.parameters())/1e6} million parameters")

dummy_input=torch.rand((1,64,244,244))
dummy_input=c2f(dummy_input)
print("Output shape: ", dummy_input.shape)

0.18944 million parameters
tensor(0.4998)
tensor(0.2005, grad_fn=<MeanBackward0>)
tensor(0.2048, grad_fn=<MeanBackward0>)
tensor(0.4051, grad_fn=<MeanBackward0>)
tensor(0.2056, grad_fn=<MeanBackward0>)
tensor(0.3540, grad_fn=<MeanBackward0>)
Output shape:  torch.Size([1, 128, 244, 244])


In [9]:
class SPPF(nn.Module):
  def __init__(self, in_channels, out_channels, kernel_size= 5): #fix kernel_size of maxpool = 5 for YOLOv8
    super().__init__()
    hidden_channels = in_channels//2
    self.conv1 = Conv(in_channels= in_channels, out_channels= hidden_channels, kernel_size= 1, stride= 1, padding= 0)
    self.max_pool = nn.MaxPool2d(kernel_size= kernel_size, stride= 1, padding= kernel_size//2, dilation= 1, ceil_mode= False)
    self.conv2 = Conv(in_channels= 4 * hidden_channels, out_channels= out_channels, kernel_size= 1, stride= 1, padding= 0)

  def forward(self, x):
    x = self.conv1(x) #[bs, 0.5 * c_in, w, h]

    x1 = self.max_pool(x)
    x2 = self.max_pool(x1)
    x3 = self.max_pool(x2)

    y = torch.cat([x, x1, x2, x3], dim = 1) #[bs, 4 * hidden_channels, w, h]

    out = self.conv2(y) #[bs, c_out, w, h]

    return out

In [10]:
sppf=SPPF(in_channels=128,out_channels=512)
print(f"{sum(p.numel() for p in sppf.parameters())/1e6} million parameters")

dummy_input=sppf(dummy_input)
print("Output shape: ", dummy_input.shape)

0.140416 million parameters
tensor(0.2053, grad_fn=<MeanBackward0>)
tensor(1.7023, grad_fn=<MeanBackward0>)
Output shape:  torch.Size([1, 512, 244, 244])


In [11]:
class Backbone(nn.Module):
  def __init__(self, in_channels= 3, shortcut= True):
    super().__init__()
    d = depth_multiple
    w = width_multiple
    r = ratio

    self.conv_0 = Conv(in_channels= in_channels, out_channels= int(64 * w) , kernel_size= 3, stride= 2, padding= 1)
    self.conv_1 = Conv(in_channels= int(64 * w), out_channels= int(128 * w), kernel_size= 3, stride= 2, padding= 1)
    self.c2f_2 = C2f(in_channels= int(128 * w), out_channels= int(128 * w), num_bottlenecks= int(3 * d), shortcut= shortcut)
    self.conv_3 = Conv(in_channels= int(128 * w), out_channels= int(256 * w), kernel_size= 3, stride= 2, padding= 1)
    self.c2f_4 = C2f(in_channels= int(256 * w), out_channels= int(256 * w), num_bottlenecks= int(6 * d), shortcut= shortcut)
    self.conv_5 = Conv(in_channels= int(256 * w), out_channels= int(512 * w), kernel_size= 3, stride= 2, padding= 1)
    self.c2f_6 = C2f(in_channels= int(512 * w), out_channels= int(512 * w), num_bottlenecks= int(6 * d), shortcut= shortcut)
    self.conv_7 = Conv(in_channels= int(512 * w), out_channels= int(512 * w * r), kernel_size= 3, stride= 2, padding= 1)
    self.c2f_8 = C2f(in_channels= int(512 * w * r), out_channels= int(512 * w * r), num_bottlenecks= int(3 * d), shortcut= shortcut)
    self.sppf_9 = SPPF(in_channels= int(512 * w * r), out_channels= int(512 * w * r))

  def forward(self, x):
    x = self.conv_0(x)
    x = self.conv_1(x)
    x = self.c2f_2(x)
    x = self.conv_3(x)

    out_1 = self.c2f_4(x) #output1
    x = self.conv_5(out_1)

    out_2 = self.c2f_6(x) #output2
    x = self.conv_7(out_2)
    x = self.c2f_8(x)

    x = self.sppf_9(x)
    out_3 = x #output3

    return out_1, out_2, out_3

In [12]:
backbone_n=Backbone()
print(f"{sum(p.numel() for p in backbone_n.parameters())/1e6} million parameters")


1.272656 million parameters


In [13]:
x=torch.rand((1,3,640,640))
out1,out2,out3=backbone_n(x)
print(out1.shape)
print(out2.shape)
print(out3.shape)

tensor(0.5001)
tensor(0.2012, grad_fn=<MeanBackward0>)
tensor(0.2046, grad_fn=<MeanBackward0>)
tensor(0.2033, grad_fn=<MeanBackward0>)
tensor(0.2040, grad_fn=<MeanBackward0>)
tensor(0.2712, grad_fn=<MeanBackward0>)
tensor(0.2017, grad_fn=<MeanBackward0>)
tensor(0.2036, grad_fn=<MeanBackward0>)
tensor(0.2025, grad_fn=<MeanBackward0>)
tensor(0.2036, grad_fn=<MeanBackward0>)
tensor(0.4061, grad_fn=<MeanBackward0>)
tensor(0.2045, grad_fn=<MeanBackward0>)
tensor(0.3552, grad_fn=<MeanBackward0>)
tensor(0.2025, grad_fn=<MeanBackward0>)
tensor(0.2034, grad_fn=<MeanBackward0>)
tensor(0.2023, grad_fn=<MeanBackward0>)
tensor(0.2035, grad_fn=<MeanBackward0>)
tensor(0.4057, grad_fn=<MeanBackward0>)
tensor(0.2044, grad_fn=<MeanBackward0>)
tensor(0.3548, grad_fn=<MeanBackward0>)
tensor(0.2031, grad_fn=<MeanBackward0>)
tensor(0.2032, grad_fn=<MeanBackward0>)
tensor(0.2021, grad_fn=<MeanBackward0>)
tensor(0.2023, grad_fn=<MeanBackward0>)
tensor(0.2696, grad_fn=<MeanBackward0>)
tensor(0.2022, grad_fn=<M

Neck

In [14]:
class Upsample(nn.Module):
    def __init__(self, scale_factor= 2, mode= 'nearest'):
        super().__init__()
        self.scale_factor= scale_factor
        self.mode= mode

    def forward(self,x):
        return nn.functional.interpolate(x,scale_factor=self.scale_factor,mode=self.mode)


In [15]:
class Neck(nn.Module):
    def __init__(self):
        super().__init__()
        d = depth_multiple
        w = width_multiple
        r = ratio

        self.up_sample = Upsample()
        self.c2f_10 = C2f(in_channels= int(512 * w *(1+r)), out_channels= int(512 * w), num_bottlenecks=int(3 * d), shortcut= False)
        self.c2f_11 = C2f(in_channels= int(768 * w), out_channels=int(256 * w), num_bottlenecks = int(3 * d), shortcut= False)
        self.conv_12=Conv(in_channels=int(256 * w), out_channels= int(256*w), kernel_size= 3, stride= 2, padding= 1)
        self.c2f_13=C2f(in_channels= int(768 * w), out_channels= int(512 * w), num_bottlenecks= int(3 * d), shortcut= False)
        self.conv_14=Conv(in_channels= int(512 * w), out_channels= int(512 * w), kernel_size=3, stride= 2, padding= 1)
        self.c2f_15=C2f(in_channels= int(512 *w * (1+r)), out_channels= int(512 * w *r), num_bottlenecks= int(3 * d), shortcut= False)



    def forward(self, x_res_1, x_res_2, x):
        res_1=x

        x = self.up_sample(x)
        x = torch.cat([x,x_res_2],dim=1)

        res_2 = self.c2f_10(x)

        x = self.up_sample(res_2)
        x = torch.cat([x, x_res_1],dim=1)

        out_1 = self.c2f_11(x)

        x = self.conv_12(out_1)

        x = torch.cat([x, res_2],dim=1)
        out_2 = self.c2f_13(x)

        x = self.conv_14(out_2)

        x = torch.cat([x, res_1],dim=1)
        out_3 = self.c2f_15(x)

        return out_1,out_2,out_3


In [16]:
neck=Neck()
print(f"{sum(p.numel() for p in neck.parameters())/1e6} million parameters")

x=torch.rand((1,3,640,640))
out_1,out_2,out_3=neck(out1,out2,out3)
print(out_1.shape)
print(out_2.shape)
print(out_3.shape)

0.98688 million parameters
tensor(0.2052, grad_fn=<MeanBackward0>)
tensor(0.2040, grad_fn=<MeanBackward0>)
tensor(0.2034, grad_fn=<MeanBackward0>)
tensor(0.2717, grad_fn=<MeanBackward0>)
tensor(0.2033, grad_fn=<MeanBackward0>)
tensor(0.2021, grad_fn=<MeanBackward0>)
tensor(0.2016, grad_fn=<MeanBackward0>)
tensor(0.2692, grad_fn=<MeanBackward0>)
tensor(0.2001, grad_fn=<MeanBackward0>)
tensor(0.2020, grad_fn=<MeanBackward0>)
tensor(0.1998, grad_fn=<MeanBackward0>)
tensor(0.2008, grad_fn=<MeanBackward0>)
tensor(0.2670, grad_fn=<MeanBackward0>)
tensor(0.1993, grad_fn=<MeanBackward0>)
tensor(0.2037, grad_fn=<MeanBackward0>)
tensor(0.2024, grad_fn=<MeanBackward0>)
tensor(0.2011, grad_fn=<MeanBackward0>)
tensor(0.2693, grad_fn=<MeanBackward0>)
torch.Size([1, 64, 80, 80])
torch.Size([1, 128, 40, 40])
torch.Size([1, 256, 20, 20])


Head


In [17]:
# DFL
class DFL(nn.Module):
    def __init__(self,ch=16):
        super().__init__()

        self.ch=ch

        self.conv=nn.Conv2d(in_channels = ch, out_channels = 1, kernel_size = 1, bias = False).requires_grad_(False)

        x = torch.arange(ch,dtype = torch.float).view(1,ch,1,1)
        self.conv.weight.data[:] = torch.nn.Parameter(x)

    def forward(self,x):
        b,c,a=x.shape
        x=x.view(b,4,self.ch,a).transpose(1,2)

        x=x.softmax(1)                          # [b,ch,4,a]
        x=self.conv(x)                          # [b,1,4,a]
        return x.view(b,4,a)                    # [b,4,a]

In [18]:
dummy_input=torch.rand((1,64,128))
dfl=DFL()
print(f"{sum(p.numel() for p in dfl.parameters())} parameters")

dummy_output=dfl(dummy_input)
print(dummy_output.shape)

print(dfl)

16 parameters
torch.Size([1, 4, 128])
DFL(
  (conv): Conv2d(16, 1, kernel_size=(1, 1), stride=(1, 1), bias=False)
)


In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Head
class Head(nn.Module):
    def __init__(self, ch=16, num_classes=8, img_size=640, feature_map_sizes=[80,40,20]):
        super().__init__()
        self.ch = ch
        self.coordinates = self.ch * 4
        self.nc = num_classes
        self.no = self.coordinates + self.nc

        self.stride = torch.tensor([img_size / s for s in feature_map_sizes], dtype=torch.float32)

        # depth, width, ratio 
        d = depth_multiple
        w = width_multiple
        r = ratio

        # Box branch
        self.box = nn.ModuleList([
            nn.Sequential(
                Conv(int(256*w), self.coordinates),
                Conv(self.coordinates, self.coordinates),
                nn.Conv2d(self.coordinates, self.coordinates, 1)
            ),
            nn.Sequential(
                Conv(int(512*w), self.coordinates),
                Conv(self.coordinates, self.coordinates),
                nn.Conv2d(self.coordinates, self.coordinates, 1)
            ),
            nn.Sequential(
                Conv(int(512*w*r), self.coordinates),
                Conv(self.coordinates, self.coordinates),
                nn.Conv2d(self.coordinates, self.coordinates, 1)
            )
        ])

        # Class branch
        self.cls = nn.ModuleList([
            nn.Sequential(
                Conv(int(256*w), self.nc),
                Conv(self.nc, self.nc),
                nn.Conv2d(self.nc, self.nc, 1)
            ),
            nn.Sequential(
                Conv(int(512*w), self.nc),
                Conv(self.nc, self.nc),
                nn.Conv2d(self.nc, self.nc, 1)
            ),
            nn.Sequential(
                Conv(int(512*w*r), self.nc),
                Conv(self.nc, self.nc),
                nn.Conv2d(self.nc, self.nc, 1)
            )
        ])

        self.dfl = DFL()

    def forward(self, x):
        # x = list of 3 feature maps from Neck
        for i in range(len(self.box)):
            box = self.box[i](x[i])    # [bs, 4*ch, w, h]
            cls = self.cls[i](x[i])    # [bs, nc, w, h]
            x[i] = torch.cat((box, cls), dim=1)  # [bs, no, w, h]

        if self.training:
            return x  # train: -> list feature map [3, bs, no, w, h]

        # Inference: make anchors
        anchors, strides = (i.transpose(0,1) for i in self.make_anchors(x, self.stride))

        # flatten feature maps
        x = torch.cat([i.view(x[0].shape[0], self.no, -1) for i in x], dim=2)  # [bs, no, sum(hw)]

        # split box & cls
        box, cls = x.split(split_size=(4*self.ch, self.nc), dim=1)

        # decode box with DFL
        a, b = self.dfl(box).chunk(2, 1)  # top-left & bottom-right
        a = anchors.unsqueeze(0) - a
        b = anchors.unsqueeze(0) + b
        box = torch.cat(tensors=((a+b)/2, b-a), dim=1)

        return torch.cat((box * strides, cls.sigmoid()), dim=1)
        # return torch.cat((box * strides, cls), dim=1)

    def make_anchors(self, x, strides, offset=0.5):
        anchor_tensor, stride_tensor = [], []
        dtype, device = x[0].dtype, x[0].device
        for i, stride in enumerate(strides):
            _, _, h, w = x[i].shape
            sx = torch.arange(end=w, device=device, dtype=dtype) + offset
            sy = torch.arange(end=h, device=device, dtype=dtype) + offset
            sy, sx = torch.meshgrid(sy, sx)
            anchor_tensor.append(torch.stack((sx, sy), -1).view(-1, 2))
            stride_tensor.append(torch.full((h*w,1), stride, dtype=dtype, device=device))
        return torch.cat(anchor_tensor), torch.cat(stride_tensor)


In [21]:
detect=Head()
print(f"{sum(p.numel() for p in detect.parameters())/1e6} million parameters")

# out_1,out_2,out_3 are output of the neck
output=detect([out_1,out_2,out_3])
print(output[0].shape)
print(output[1].shape)
print(output[2].shape)

print(detect)

0.4162 million parameters
tensor(0.2001, grad_fn=<MeanBackward0>)
tensor(0.1991, grad_fn=<MeanBackward0>)
tensor(0.2001, grad_fn=<MeanBackward0>)
tensor(0.1991, grad_fn=<MeanBackward0>)
tensor(0.1993, grad_fn=<MeanBackward0>)
tensor(0.1992, grad_fn=<MeanBackward0>)
tensor(0.1993, grad_fn=<MeanBackward0>)
tensor(0.2000, grad_fn=<MeanBackward0>)
tensor(0.2009, grad_fn=<MeanBackward0>)
tensor(0.1993, grad_fn=<MeanBackward0>)
tensor(0.2009, grad_fn=<MeanBackward0>)
tensor(0.1993, grad_fn=<MeanBackward0>)
torch.Size([1, 72, 80, 80])
torch.Size([1, 72, 40, 40])
torch.Size([1, 72, 20, 20])
Head(
  (box): ModuleList(
    (0): Sequential(
      (0): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   

In [22]:
class MyYolov8n(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = Backbone()
        self.neck = Neck()
        self.head = Head()

    def forward(self, x):
        x = self.backbone(x)            
        x = self.neck(x[0], x[1], x[2])   
        return self.head(list(x))

In [23]:
model = MyYolov8n()
print(f"{sum(p.numel() for p in model.parameters())/1e6} million parameters")
print(model)

2.675736 million parameters
MyYolov8n(
  (backbone): Backbone(
    (conv_0): Conv(
      (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (conv_1): Conv(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (c2f_2): C2f(
      (conv1): Conv(
        (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (bottlenecks): ModuleList(
        (0): Bottleneck(
          (conv1): Conv(
            (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (bn):