A detailed walkthrough of the YOLOv1 architecture and its PyTorch implementation from scratch

Importing Packages to run the code

In [None]:
# import torch 
import torch
import torch.nn as nn 


# define the parameters 

In [None]:
# Grid cells per axis (S), bounding boxes per cell (B), number of classes (C).
S, B, C = 7, 2, 20

The values I initialized above are the defaults given in the paper: S represents the number of grid cells along the horizontal and vertical axes, B denotes the number of bounding boxes generated by each cell, and C is the number of classes available in the dataset. Since we use S=7 and B=2, our YOLOv1 will produce 7×7×2=98 bounding boxes in total for each image.

The Building Block

In [None]:
# Codeblock 2
class ConvBlock(nn.Module):
    """Convolution followed by LeakyReLU and, optionally, 2x2 max-pooling.

    The tensor size is printed after every step of the forward pass so the
    shape changes can be followed one layer at a time.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Kernel size of the convolution.
        stride: Stride of the convolution.
        padding: Padding applied by the convolution.
        maxpool_flag: When True, a max-pooling layer (kernel size 2,
            stride 2) is created and applied after the activation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride,
                 padding, maxpool_flag=False):
        super().__init__()
        self.maxpool_flag = maxpool_flag

        #(1) Convolution layer.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size,
                              stride=stride, padding=padding)
        #(2) Leaky ReLU with a slope of 0.1 for negative inputs.
        self.leaky_relu = nn.LeakyReLU(0.1)

        #(3) The pooling layer only exists when it was requested.
        if maxpool_flag:
            self.maxpool = nn.MaxPool2d(2, stride=2)

    def forward(self, x):
        """Run ``x`` through the block, printing the size after each step."""
        print(f'original\t: {x.size()}')

        x = self.conv(x)
        print(f'after conv\t: {x.size()}')

        x = self.leaky_relu(x)
        print(f'after leaky relu: {x.size()}')

        # Without pooling the activation output is the final result.
        if not self.maxpool_flag:
            return x

        x = self.maxpool(x)
        print(f'after maxpool\t: {x.size()}')
        return x

In [None]:
# Codeblock 3
# Instantiate a ConvBlock mapping 3 -> 64 channels with a 7x7 kernel,
# stride 2 and padding 3, followed by max-pooling.
convblock = ConvBlock(in_channels=3,       #(1)
                      out_channels=64,     #(2)
                      kernel_size=7,       #(3)
                      stride=2,            #(4)
                      padding=3,           #(5)
                      maxpool_flag=True)   #(6)
# Random tensor standing in for a batch of one 3-channel 448x448 image.
x = torch.randn(1, 3, 448, 448)            #(7)
# Calling the block prints the tensor size after each internal step.
out = convblock(x)

In modern architectures, we normally use the Conv-BN-ReLU structure, but at the time YOLOv1 was created, it seems the batch normalization layer was not yet popular, as it had come out only several months before YOLOv1.

In [None]:
# Codeblock 3
# Build a ConvBlock: 3 input channels, 64 output channels, 7x7 kernel,
# stride 2, padding 3, with the max-pooling layer enabled.
convblock = ConvBlock(in_channels=3,       #(1)
                      out_channels=64,     #(2)
                      kernel_size=7,       #(3)
                      stride=2,            #(4)
                      padding=3,           #(5)
                      maxpool_flag=True)   #(6)
# Dummy input simulating a batch of a single RGB image of size 448x448.
x = torch.randn(1, 3, 448, 448)            #(7)
# The forward pass prints the tensor size after every step of the block.
out = convblock(x)

Afterwards, we can simply generate a tensor of random values with the dimension of 1×3×448×448 (#(7)) which simulates a batch of a single RGB image of size 448×448 and then pass it through the network. You can see in the resulting output below that our convolution layer successfully increased the number of channels to 64 and halved the spatial dimension to 224×224. The halving was done once again all the way to 112×112 thanks to the maxpooling layer.

The Backbone

In [13]:
# Codeblock 4a
class Backbone(nn.Module):
    """Convolutional part of YOLOv1, organised into six stages.

    ``stage0`` and ``stage1`` are single ``ConvBlock`` instances, while
    ``stage2`` to ``stage5`` are ``nn.ModuleList`` containers whose blocks
    are applied one after another. The forward pass prints the tensor size
    after every block.
    """

    def __init__(self):
        super().__init__()
        # ConvBlock arguments: in_channels, out_channels, kernel_size, stride, padding
        self.stage0 = ConvBlock(3, 64, 7, 2, 3, maxpool_flag=True)      #(1)
        self.stage1 = ConvBlock(64, 192, 3, 1, 1, maxpool_flag=True)    #(2)

        stage2_blocks = [
            ConvBlock(192, 128, 1, 1, 0),
            ConvBlock(128, 256, 3, 1, 1),
            ConvBlock(256, 256, 1, 1, 0),
            ConvBlock(256, 512, 3, 1, 1, maxpool_flag=True),    #(3)
        ]
        self.stage2 = nn.ModuleList(stage2_blocks)

        # Four repetitions of the 1x1 -> 3x3 pair, then the closing pair.
        stage3_blocks = []
        for _ in range(4):
            stage3_blocks.extend([
                ConvBlock(512, 256, 1, 1, 0),
                ConvBlock(256, 512, 3, 1, 1),
            ])
        stage3_blocks.extend([
            ConvBlock(512, 512, 1, 1, 0),
            ConvBlock(512, 1024, 3, 1, 1, maxpool_flag=True),    #(4)
        ])
        self.stage3 = nn.ModuleList(stage3_blocks)

        # Two repetitions of the 1x1 -> 3x3 pair; the last block of this
        # stage uses a stride-2 convolution instead of max-pooling.
        stage4_blocks = []
        for _ in range(2):
            stage4_blocks.extend([
                ConvBlock(1024, 512, 1, 1, 0),
                ConvBlock(512, 1024, 3, 1, 1),
            ])
        stage4_blocks.extend([
            ConvBlock(1024, 1024, 3, 1, 1),
            ConvBlock(1024, 1024, 3, 2, 1),    #(5)
        ])
        self.stage4 = nn.ModuleList(stage4_blocks)

        self.stage5 = nn.ModuleList([
            ConvBlock(1024, 1024, 3, 1, 1),
            ConvBlock(1024, 1024, 3, 1, 1),
        ])

    # Codeblock 4b
    def forward(self, x):
        """Pass ``x`` through all stages in order, printing sizes on the way."""
        print(f'original\t: {x.size()}\n')

        x = self.stage0(x)
        print(f'after stage0\t: {x.size()}\n')

        x = self.stage1(x)
        print(f'after stage1\t: {x.size()}\n')

        for i, block in enumerate(self.stage2):
            x = block(x)
            print(f'after stage2 #{i}\t: {x.size()}')

        print()
        for i, block in enumerate(self.stage3):
            x = block(x)
            print(f'after stage3 #{i}\t: {x.size()}')

        print()
        for i, block in enumerate(self.stage4):
            x = block(x)
            print(f'after stage4 #{i}\t: {x.size()}')

        print()
        for i, block in enumerate(self.stage5):
            x = block(x)
            print(f'after stage5 #{i}\t: {x.size()}')

        return x

What we do in the above codeblock is to instantiate ConvBlock instances according to the architecture given in the paper. There are several things I want to emphasize here. First, the term stage I use in the code is not explicitly mentioned in the paper.

Let’s verify that our implementation is correct by running the following test code.

In [14]:
# Codeblock 5
# Build the backbone and pass a dummy 1x3x448x448 tensor through it;
# the forward pass prints the tensor size after every block.
backbone = Backbone()
x = torch.randn(1, 3, 448, 448)
out = backbone(x)

original	: torch.Size([1, 3, 448, 448])

original	: torch.Size([1, 3, 448, 448])
after conv	: torch.Size([1, 64, 224, 224])
after leaky relu: torch.Size([1, 64, 224, 224])
after maxpool	: torch.Size([1, 64, 112, 112])
after stage0	: torch.Size([1, 64, 112, 112])

original	: torch.Size([1, 64, 112, 112])
after conv	: torch.Size([1, 192, 112, 112])
after leaky relu: torch.Size([1, 192, 112, 112])
after maxpool	: torch.Size([1, 192, 56, 56])
after stage1	: torch.Size([1, 192, 56, 56])

original	: torch.Size([1, 192, 56, 56])
after conv	: torch.Size([1, 128, 56, 56])
after leaky relu: torch.Size([1, 128, 56, 56])
after stage2 #0	: torch.Size([1, 128, 56, 56])
original	: torch.Size([1, 128, 56, 56])
after conv	: torch.Size([1, 256, 56, 56])
after leaky relu: torch.Size([1, 256, 56, 56])
after stage2 #1	: torch.Size([1, 256, 56, 56])
original	: torch.Size([1, 256, 56, 56])
after conv	: torch.Size([1, 256, 56, 56])
after leaky relu: torch.Size([1, 256, 56, 56])
after stage2 #2	: torch.Size([1

The Fully-Connected Layers

In [15]:
# Codeblock 6
class FullyConnected(nn.Module):
    """Two-layer fully-connected head applied to a flattened feature map.

    The head takes 1024*7*7 input features and produces
    ``(C + B*5) * S * S`` values per sample, using the module-level ``S``,
    ``B`` and ``C`` constants. The output is returned flat; any reshaping
    is left to the caller.
    """

    def __init__(self):
        super().__init__()

        flat_features = 1024 * 7 * 7
        hidden_features = 4096
        prediction_size = (C + B * 5) * S * S

        self.linear0 = nn.Linear(flat_features, hidden_features)      #(1)
        self.leaky_relu = nn.LeakyReLU(0.1)                           #(2)
        self.dropout = nn.Dropout(0.5)                                #(3)
        self.linear1 = nn.Linear(hidden_features, prediction_size)    #(4)

    def forward(self, x):
        """Map ``x`` through both linear layers, printing the sizes."""
        print(f'original\t: {x.size()}')

        x = self.linear0(x)
        print(f'after linear0\t: {x.size()}')

        # Activation and dropout sit between the two linear layers.
        x = self.dropout(self.leaky_relu(x))

        x = self.linear1(x)
        print(f'after linear1\t: {x.size()}')

        return x

The following test code shows how the tensor transforms as it is processed by the stack of linear layers.

In [16]:
# Codeblock 7
# Feed a random batch of one flattened 1024*7*7 vector through the FC head;
# the tensor sizes before and after the linear layers are printed.
fc = FullyConnected()
x = torch.randn(1, 1024*7*7)
out = fc(x)

original	: torch.Size([1, 50176])
after linear0	: torch.Size([1, 4096])
after linear1	: torch.Size([1, 1470])


We can see in the above output that the fc block takes an input of size 50176, which is essentially the flattened 1024×7×7 tensor. The linear0 layer works by mapping this input into a 4096-dimensional vector, and then the linear1 layer eventually maps it further to 1470. Later in the post-processing stage we need to reshape it to 30×7×7 so that we can take the bounding box and the object classification results easily. Technically speaking, this reshaping process can be done either internally by the model or outside the model. For the sake of simplicity, I decided to leave the output flattened, meaning the reshaping will be handled externally.

Connecting the FC Part to the Backbone

In [17]:
# Codeblock 8
class YOLOv1(nn.Module):
    """Complete model: the ``Backbone`` followed by the ``FullyConnected`` head."""

    def __init__(self):
        super().__init__()

        self.backbone = Backbone()
        self.fc = FullyConnected()

    def forward(self, x):
        """Return the flat output of the head for the input batch ``x``."""
        features = self.backbone(x)
        # Collapse everything except the batch dimension before the head.
        flat = features.flatten(start_dim=1)    #(1)
        return self.fc(flat)

In order to test our model, we can simply instantiate the YOLOv1 model and pass a dummy tensor that simulates an RGB image of size 448×448 (#(1)). After feeding the tensor into the network (#(2)), I also try to simulate the post-processing step by reshaping the output tensor to 30×7×7 as shown at line #(3).

In [18]:
# Codeblock 9
# Build the complete model (backbone + fully-connected head).
yolov1 = YOLOv1()
# Dummy tensor simulating a batch of one RGB image of size 448x448.
x = torch.randn(1, 3, 448, 448)      #(1)

# Forward pass; the model returns its output flattened per sample.
out = yolov1(x)                      #(2)
# Simulate post-processing by reshaping to (batch, C+B*5, S, S).
out = out.reshape(-1, C+B*5, S, S)   #(3)

original	: torch.Size([1, 3, 448, 448])

original	: torch.Size([1, 3, 448, 448])
after conv	: torch.Size([1, 64, 224, 224])
after leaky relu: torch.Size([1, 64, 224, 224])
after maxpool	: torch.Size([1, 64, 112, 112])
after stage0	: torch.Size([1, 64, 112, 112])

original	: torch.Size([1, 64, 112, 112])
after conv	: torch.Size([1, 192, 112, 112])
after leaky relu: torch.Size([1, 192, 112, 112])
after maxpool	: torch.Size([1, 192, 56, 56])
after stage1	: torch.Size([1, 192, 56, 56])

original	: torch.Size([1, 192, 56, 56])
after conv	: torch.Size([1, 128, 56, 56])
after leaky relu: torch.Size([1, 128, 56, 56])
after stage2 #0	: torch.Size([1, 128, 56, 56])
original	: torch.Size([1, 128, 56, 56])
after conv	: torch.Size([1, 256, 56, 56])
after leaky relu: torch.Size([1, 256, 56, 56])
after stage2 #1	: torch.Size([1, 256, 56, 56])
original	: torch.Size([1, 256, 56, 56])
after conv	: torch.Size([1, 256, 56, 56])
after leaky relu: torch.Size([1, 256, 56, 56])
after stage2 #2	: torch.Size([1