<a href="https://colab.research.google.com/github/Regina-Arthur/YOLOv1_From_Scratch/blob/main/YOLOv1_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a YOLOv1 model from scratch, by Regina Arthur


### The convolutional layers are trained from scratch, without pretraining on ImageNet

##Import the necessary libraries


In [1]:
#Let's import the necessary libraries to build YOLOv1
#with pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms.v2 as v2
import matplotlib.pyplot as plt
import numpy as np
from torchvision.datasets import VOCDetection
from torch.utils.data import Dataset, DataLoader

##Connect To a GPU if available

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
# The bare `device` at the end displays the selection as the cell output.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

## Download the Pascal VOC 2007 and 2012 dataset


### Create a folder to store the Pascal VOC dataset

In [1]:
# #Let's import the necessary libraries for creating a folder in google drive
# from google.colab import drive
# import os

# #Let's mount Google Drive
# drive.mount('/content/drive')

# def determinedirectory(directory):
#   #Let's define folder path in Google Drive
#   location = directory.upper()
#   root ='/content/drive/MyDrive/_Dataset/'
#   new_root = root + location
#   #Let's check if the folder exists, if not, let's create it
#   if not os.path.exists(new_root):
#       os.makedirs(new_root)
#       print(f"Created new folder: {new_root}")
#   else:
#       print(f"Folder already exists: {new_root}")
#   return new_root


Mounted at /content/drive


In [2]:
# ! pip install -q kaggle

In [None]:
import os

# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

# !kaggle datasets download -d vijayabhaskar96/pascal-voc-2007-and-2012


# output_dir = determinedirectory("PascalVOC2007_and_2012")


# !unzip "pascal-voc-2007-and-2012.zip" -d "$output_dir"

##Neural Network Architecture


### I am following the PyTorch documentation for the `nn.Module` class to build my neural network. All neural network models should inherit from `nn.Module`.

####Conv2d
####class torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)

####class torch.nn.LeakyReLU(negative_slope=0.01, inplace=False)

####MaxPool2d
####class torch.nn.MaxPool2d(kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False)[source]

####Linear
####class torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)[source]

####Dropout2d
####class torch.nn.Dropout2d(p=0.5, inplace=False)[source]



In [3]:
class YOLOv1(nn.Module):
  """YOLOv1 network: 24 convolutional layers followed by 2 fully connected layers.

  The forward pass maps a batch of images to a flat tensor holding
  ``S * S * (B * 5 + C)`` values per image.

  Args:
    S: Grid size used to size the output (S x S cells).
    B: Number of boxes per cell; each box contributes 5 output values.
    C: Number of class scores per cell.

  Note:
    ``Linear`` expects 50176 (= 1024 * 7 * 7) flattened features. The
    convolutional stack reduces the spatial size by a factor of 64, so this
    matches inputs of shape (N, 3, 448, 448).
  """

  def __init__(self, S=7, B=2, C=20):
    super().__init__()
    # Convolutional backbone (attribute names are kept stable so that
    # existing state_dict checkpoints continue to load).
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3)
    self.conv2 = nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, stride=1, padding=1)
    self.conv3 = nn.Conv2d(in_channels=192, out_channels=128, kernel_size=1, stride=1, padding=0)
    self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
    self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1, stride=1, padding=0)
    self.conv6 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
    self.conv7 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0)
    self.conv8 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
    self.conv9 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0)
    self.conv10 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
    self.conv11 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0)
    self.conv12 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
    self.conv13 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0)
    self.conv14 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
    self.conv15 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=1, stride=1, padding=0)
    self.conv16 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1)
    self.conv17 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0)
    self.conv18 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1)
    self.conv19 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0)
    self.conv20 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1)
    self.conv21 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1)
    self.conv22 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=2, padding=1)
    self.conv23 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1)
    self.conv24 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1)

    #Pooling and Activation (stateless, so one instance of each is shared by all layers)
    self.MaxPooling2d = nn.MaxPool2d(kernel_size=2, stride=2)
    self.LeakyReLU = nn.LeakyReLU(negative_slope=0.1)

    #Flattening and Fully Connected Layer
    self.Flatten = nn.Flatten()
    self.Linear = nn.Linear(in_features=1024 * 7 * 7, out_features=4096)
    # Plain element-wise dropout: the tensor reaching this layer is the 2-D
    # (N, 4096) output of `Linear`, and Dropout2d is deprecated for 2-D inputs.
    self.Dropout = nn.Dropout(0.5)
    # Size the head from C instead of a hard-coded 20 so the argument takes effect.
    self.Linear2 = nn.Linear(in_features=4096, out_features=S * S * (B * 5 + C))

  def forward(self, x):
    """Run the network on a batch of images.

    Args:
      x: Image batch of shape (N, 3, H, W); H = W = 448 yields the 50176
        flattened features that ``Linear`` expects.

    Returns:
      Tensor of shape (N, S * S * (B * 5 + C)).
    """
    x = self.MaxPooling2d(self.LeakyReLU(self.conv1(x)))
    x = self.MaxPooling2d(self.LeakyReLU(self.conv2(x)))

    x = self.LeakyReLU(self.conv3(x))
    x = self.LeakyReLU(self.conv4(x))
    x = self.LeakyReLU(self.conv5(x))
    x = self.MaxPooling2d(self.LeakyReLU(self.conv6(x)))

    x = self.LeakyReLU(self.conv7(x))
    x = self.LeakyReLU(self.conv8(x))
    x = self.LeakyReLU(self.conv9(x))
    x = self.LeakyReLU(self.conv10(x))
    x = self.LeakyReLU(self.conv11(x))
    x = self.LeakyReLU(self.conv12(x))
    x = self.LeakyReLU(self.conv13(x))
    x = self.LeakyReLU(self.conv14(x))
    x = self.LeakyReLU(self.conv15(x))
    x = self.MaxPooling2d(self.LeakyReLU(self.conv16(x)))

    x = self.LeakyReLU(self.conv17(x))
    x = self.LeakyReLU(self.conv18(x))
    x = self.LeakyReLU(self.conv19(x))
    x = self.LeakyReLU(self.conv20(x))
    x = self.LeakyReLU(self.conv21(x))
    # conv22 has stride 2 and performs the last spatial down-sampling step.
    x = self.LeakyReLU(self.conv22(x))

    x = self.LeakyReLU(self.conv23(x))
    x = self.LeakyReLU(self.conv24(x))

    # Fully connected head; the final layer has no activation.
    x = self.LeakyReLU(self.Linear(self.Flatten(x)))
    x = self.Dropout(x)
    x = self.Linear2(x)
    return x



## Architecture Test
This just makes sure the architecture works as intended


In [None]:
# model = YOLOv1()
# model = model.to(device)

In [None]:
# dummy_input = torch.randn(1, 3, 448, 448).to(device)

# with torch.no_grad():
#     output = model(dummy_input)

# print("Output shape:", output.shape)

Output shape: torch.Size([1, 1470])




In [None]:
# total_params = sum(p.numel() for p in model.parameters())
# trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# print(f"Total params: {total_params:,}")
# print(f"Trainable params: {trainable_params:,}")

Total params: 271,703,550
Trainable params: 271,703,550


In [None]:
class YOLOv1Loss(nn.Module):
  """Holds the hyperparameters of the YOLOv1 loss.

  Args:
    S: Grid size.
    B: Number of boxes per grid cell.
    C: Number of classes.
    λ_coord: Presumably the weight of the box-coordinate term, as in the
      YOLOv1 paper (confirm once the loss terms are written).
    λ_noobj: Presumably the weight of the no-object confidence term, as in
      the YOLOv1 paper (confirm once the loss terms are written).

  TODO: no ``forward`` is defined in this cell, so calling an instance falls
  through to ``nn.Module``'s default and raises ``NotImplementedError``.
  """

  def __init__(self, S=7, B=2, C=20, λ_coord=5, λ_noobj=0.5):
    super().__init__()
    # Keep every constructor argument on the instance so the loss
    # computation can read them.
    self.S = S
    self.B = B
    self.C = C
    self.λ_coord = λ_coord
    self.λ_noobj = λ_noobj

## Load the Pascal Visual Object Classes (VOC) Dataset

In [4]:
# Seed PyTorch's RNG before the layers are created, then build the
# network and move it to the selected device in one step.
torch.manual_seed(0)
model = YOLOv1().to(device)

In [5]:
# Mount Google Drive at /content/drive (Colab-only import).
# The dataset root passed to VOCDetection in this notebook lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
#Let's convert, augment and resize the images
transforms = v2.Compose([
    # VOCDetection yields PIL images. ToImage turns them into image tensors so
    # that ToDtype below really converts and scales them (ToDtype leaves PIL
    # images untouched, which would hand PIL objects to the DataLoader).
    v2.ToImage(),
    v2.ColorJitter(brightness=1.5, contrast=1.5, saturation=1.5),
    v2.RandomHorizontalFlip(p=0.2),
    v2.RandomVerticalFlip(p=0.2),
    v2.Resize((448,448)),
    v2.ToDtype(torch.float32, scale=True),
    ])

# NOTE(review): `transform=` is applied to the image only, so the flips and the
# resize above are not mirrored in the bounding-box annotations — confirm that
# the targets are re-encoded consistently before training on them.
TrainData = torchvision.datasets.VOCDetection(
    root = '/content/drive/MyDrive/_Dataset/PASCALVOC2007_AND_2012',
    year = '2007',
    image_set = 'train',
    download = False,
    transform = transforms,
    )


# NOTE(review): the default collate function is used; verify that the VOC
# annotation dicts can be batched (images may contain different numbers of
# objects), otherwise a custom `collate_fn` / target encoding is needed.
Traindataloader = DataLoader(TrainData,
                             batch_size=64,
                             shuffle=True,
                             num_workers=2,
                             )

###AdamW
class torch.optim.AdamW(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, *, maximize=False, foreach=None, capturable=False, differentiable=False, fused=None)


In [15]:
def learning_rate(epoch, epochs):
  """Step learning-rate schedule for a training run.

  Args:
    epoch: Zero-based index of the current epoch, as produced by
      ``range(epochs)``.
    epochs: Total number of epochs in the run.

  Returns:
    0.1 during the first 60% of the epochs, 0.01 during the next 20%,
    and 0.001 for the remainder.
  """
  first_stop = 0.6 * epochs
  second_stop = 0.8 * epochs
  # Strict comparisons: `epoch` is zero-based, so `<=` would keep the higher
  # rate for one epoch too many (a 70/20/10 split for 10 epochs).
  if epoch < first_stop:
    return 0.1
  if epoch < second_stop:
    return 0.01
  return 0.001


# Number of passes over the training set.
num_epochs = 10

# Criterion used by the training loop.
loss = YOLOv1Loss()

# AdamW over every model parameter. The lr given here is only the starting
# value: the training loop overwrites it each epoch from learning_rate().
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.1,
    betas=(0.9, 0.99),
    weight_decay=0.0005,
)

In [19]:
# Keep the criterion under its own name: `loss` holds each batch's loss tensor
# inside the loop, so calling `loss(...)` again on the next batch would fail.
criterion = YOLOv1Loss()

for epoch in range(num_epochs):
    model.train()

    # Apply this epoch's scheduled learning rate to every parameter group.
    current_lr = learning_rate(epoch, num_epochs)
    for param_group in optimizer.param_groups:
        param_group['lr'] = current_lr

    for images, targets in Traindataloader:
        # NOTE(review): assumes the loader yields tensors for both images and
        # targets — confirm against the dataset's target format.
        images, targets = images.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (1470) must match the size of tensor b (5) at non-singleton dimension 1

In [None]:
# Report the loss currently held in `loss` (with the current `epoch` counter),
# then write the model weights to disk. Two separate statements so the cell
# does not display a `(None, None)` tuple as its output.
print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {loss.item()}")
torch.save(model.state_dict(), "checkpoint.pth")

##Model loss