In [1]:
####

In [None]:
! pip install selective_search

In [3]:
import torch
from torch import nn
import selective_search
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from torchvision import transforms
from torchvision.utils import make_grid
import torchvision.transforms.functional as F
from tqdm.notebook import tqdm
from torchvision.ops import roi_pool

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
def show_tensor_images(image_tensor, num_images=2, size=(3 , 512 , 512)):
    """Plot the first ``num_images`` images contained in ``image_tensor``.

    Args:
        image_tensor: Tensor that can be viewed as ``(-1, *size)``.
        num_images: How many images (taken from the front) to draw.
        size: Per-image shape used to un-flatten ``image_tensor``.
    """
    # Detach from autograd and move to host memory before handing over to matplotlib.
    images = image_tensor.detach().cpu().view(-1, *size)
    grid = make_grid(images[:num_images], nrow=5)
    # Move the channel axis last for imshow.
    plt.imshow(grid.permute(1, 2, 0).squeeze())
    plt.show()

In [6]:
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """Compute the intersection-over-union of two sets of boxes.

    Args:
        boxes_preds: Tensor of shape ``(..., 4)`` with predicted boxes.
        boxes_labels: Tensor of shape ``(..., 4)`` with reference boxes
            (broadcastable against ``boxes_preds``).
        box_format: ``"midpoint"`` for ``(x_center, y_center, width, height)``
            or ``"corners"`` for ``(x1, y1, x2, y2)``.

    Returns:
        Tensor of shape ``(..., 1)`` holding the IoU of each box pair.

    Raises:
        ValueError: If ``box_format`` is neither ``"midpoint"`` nor ``"corners"``.
    """
    if box_format not in ("midpoint", "corners"):
        raise ValueError(
            f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
        )

    def to_corners(boxes):
        # Slices (0:1 rather than 0) keep the trailing dimension so results stay (..., 1).
        if box_format == "midpoint":
            return (
                boxes[..., 0:1] - boxes[..., 2:3] / 2,
                boxes[..., 1:2] - boxes[..., 3:4] / 2,
                boxes[..., 0:1] + boxes[..., 2:3] / 2,
                boxes[..., 1:2] + boxes[..., 3:4] / 2,
            )
        return boxes[..., 0:1], boxes[..., 1:2], boxes[..., 2:3], boxes[..., 3:4]

    box1_x1, box1_y1, box1_x2, box1_y2 = to_corners(boxes_preds)
    box2_x1, box2_y1, box2_x2, box2_y2 = to_corners(boxes_labels)

    # Corners of the overlap rectangle.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # clamp(0) turns a negative extent (no overlap) into zero area.
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small epsilon avoids division by zero for degenerate boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)

In [7]:

class Conv(nn.Module):
    """Convolution block: ``Conv2d`` with optional BatchNorm, ReLU and 2x2 max-pool.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
        use_norm: Append ``BatchNorm2d(out_channels)`` when true.
        use_activation: Append ``ReLU`` when true.
        use_pool: Append ``MaxPool2d`` (kernel 2x2, stride 2x2) when true.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=(3, 3),
                 stride=(1, 1),
                 padding=1,
                 use_norm=True,
                 use_activation=True,
                 use_pool=False):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels,
                               out_channels,
                               kernel_size=kernel_size,
                               stride=stride,
                               padding=padding)
        self.use_norm = use_norm
        self.use_activation = use_activation
        self.use_pool = use_pool

        # Optional stages are only instantiated when requested.
        if use_norm:
            self.norm = nn.BatchNorm2d(out_channels)
        if use_activation:
            self.activation = nn.ReLU()
        if use_pool:
            self.maxpool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

    def forward(self, x):
        """Apply the convolution followed by every enabled stage, in order."""
        out = self.conv1(x)
        optional_stages = (
            (self.use_norm, "norm"),
            (self.use_activation, "activation"),
            (self.use_pool, "maxpool"),
        )
        for enabled, attr_name in optional_stages:
            if enabled:
                out = getattr(self, attr_name)(out)
        return out

In [8]:
class Linear(nn.Module):
    """Fully connected block: ``nn.Linear`` with optional BatchNorm1d and ReLU.

    Args:
        in_channels: Size of each input feature vector.
        out_channels: Size of each output feature vector.
        use_norm: Append ``BatchNorm1d(out_channels)`` when true.
        use_activation: Append ``ReLU`` when true.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 use_norm=True,
                 use_activation=True):
        super().__init__()

        self.linear1 = nn.Linear(in_channels, out_channels)
        self.use_norm = use_norm
        self.use_activation = use_activation

        # Optional stages are only instantiated when requested.
        if use_norm:
            self.norm = nn.BatchNorm1d(out_channels)
        if use_activation:
            self.activation = nn.ReLU()

    def forward(self, x):
        """Apply the linear layer followed by every enabled stage, in order."""
        out = self.linear1(x)
        for enabled, attr_name in ((self.use_norm, "norm"),
                                   (self.use_activation, "activation")):
            if enabled:
                out = getattr(self, attr_name)(out)
        return out

In [9]:
# Layer specification consumed by VGG.
# A list entry is [out_channels, kernel_size, stride, padding] for one Conv block;
# the string "M" inserts a 2x2 max-pooling layer.
# The final max-pool and the 4096-unit fully connected layer are left out, so the
# network ends on a convolutional feature map.
config = [
    [64, 3, 1, 1],
    [128, 3, 1, 1],
    "M",
    [128, 3, 1, 1],
    [256, 3, 1, 1],
    "M",
    [256, 3, 1, 1],
    [512, 3, 1, 1],
    "M",
    [512, 3, 1, 1],
    [512, 3, 1, 1],
    [512, 3, 1, 1],
    "M",
    [512, 3, 1, 1],
    [512, 3, 1, 1],
    [512, 3, 1, 1],
]

In [10]:
class VGG(nn.Module):
    """VGG-style network assembled from a layer specification list.

    Each entry of ``config`` is interpreted as follows:

    * a list ``[out_channels, kernel_size, stride, padding]`` adds a ``Conv``
      block (with ``Conv``'s default batch norm and ReLU);
    * any string (``"M"`` in the module-level ``config``) adds a 2x2
      max-pooling layer with stride 2;
    * the number ``4096`` adds ``Flatten`` followed by ``Linear(25088, 4096)``;
    * the number ``1000`` adds ``Linear(4096, 1000)`` without activation,
      followed by a softmax over the class dimension.

    Any other entry is ignored.

    Args:
        in_channels: Number of channels of the input images.
        config: Layer specification as described above.
    """

    def __init__(self,
                 in_channels=3,
                 config=config):
        super().__init__()

        self.layers = nn.ModuleList()

        for layer in config:
            if isinstance(layer, list):
                out_channels, kernel_size, stride, padding = layer
                self.layers.append(Conv(
                    in_channels,
                    out_channels,
                    kernel_size,
                    stride,
                    padding
                ))
                # The next convolution consumes what this one produces.
                in_channels = out_channels
            elif isinstance(layer, str):
                self.layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))
            else:
                if layer == 4096:
                    self.layers.append(nn.Flatten())
                    # 25088 = 512 * 7 * 7, i.e. this head expects a 512-channel 7x7 feature map.
                    self.layers.append(Linear(25088, 4096))
                elif layer == 1000:
                    self.layers.append(Linear(4096, 1000, use_activation=False))
                    # Explicit dim: the implicit choice is deprecated and warns.
                    # dim=1 is the class axis of the (N, 1000) output above.
                    self.layers.append(nn.Softmax(dim=1))

    def forward(self, x):
        """Run ``x`` through every configured layer in order."""
        for layer in self.layers:
            x = layer(x)
        return x

In [None]:
# Smoke test: push a random batch of two 3-channel images through the backbone
# and display the resulting feature-map shape.
# NOTE(review): 244 may have been meant as 224 (the usual VGG input size) — confirm.
x = torch.randn(2 , 3 , 244 , 244).to(device)
vgg = VGG().to(device)
z = vgg(x)
# The default config has four stride-2 max-pools and ends on 512 channels.
z.shape

In [12]:
class Dataset_(torch.utils.data.Dataset):
    """Detection dataset yielding an image, its labelled boxes and region proposals.

    The CSV file lists one sample per row: column 0 is the image file name
    (relative to ``img_dir``) and column 1 is the label file name (relative to
    ``label_dir``). Each non-empty line of a label file holds five
    whitespace-separated numbers: ``class x y width height``.

    Args:
        csv_file: Path of the CSV index file.
        img_dir: Directory containing the images.
        label_dir: Directory containing the label text files.
        C: Number of classes (stored, not used by this class).
        transform: Optional callable applied to the channels-first image tensor.
    """

    def __init__(
        self,
        csv_file,
        img_dir,
        label_dir,
        C=20,
        transform=None,
    ):
        super().__init__()

        self.df = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.C = C
        self.transform = transform

    def __len__(self):
        """Return the number of samples listed in the CSV file."""
        return len(self.df)

    @staticmethod
    def _parse_number(token):
        """Parse a label token, returning an int for whole values and a float otherwise."""
        value = float(token)
        # Go through float first: int("1.0") raises ValueError, int(1.0) does not.
        return int(value) if value.is_integer() else value

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A tuple ``(image, boxes, regions)`` where

            * ``image`` is the channels-first image tensor (after ``transform``
              if one was given);
            * ``boxes`` has shape ``(num_boxes, 5)`` with rows
              ``[x, y, width, height, class_label]``;
            * ``regions`` has shape ``(num_regions, 5)`` with rows
              ``[batch_index, *box]`` where ``box`` is a proposal returned by
              ``_get_regions``. This is the single-tensor ROI layout of
              ``torchvision.ops.roi_pool``, which reads the first column as the
              index of the image in the batch.
        """
        label_path = os.path.join(self.label_dir, self.df.iloc[idx, 1])
        boxes = []

        with open(label_path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                class_label, x, y, width, height = [
                    self._parse_number(token) for token in fields
                ]
                boxes.append([x, y, width, height, class_label])

        # reshape keeps a (0, 5) layout when the label file contains no boxes.
        boxes = torch.tensor(boxes).reshape(-1, 5)

        image_path = os.path.join(self.img_dir, self.df.iloc[idx, 0])
        image = np.asarray(plt.imread(image_path))
        # Height-width-channel array -> channels-first tensor.
        image = torch.from_numpy(image).permute(2, 0, 1)

        if self.transform:
            image = self.transform(image)

        # Proposals are computed on the image that is actually returned, so their
        # coordinates refer to that (possibly resized) image.
        # Batch index 0 is prepended because every sample holds exactly one image.
        # NOTE(review): proposals are assumed to be (x1, y1, x2, y2) — confirm
        # against selective_search.box_filter.
        rois = [(0, *box) for box in self._get_regions(image)]
        regions = torch.tensor(rois, dtype=torch.float32).reshape(-1, 5)
        return image, boxes, regions

    def _get_regions(self, image, topN=1):
        """Run selective search on a channels-first image and keep the top ``topN`` boxes."""
        # selective_search is given the image in height-width-channel layout.
        image = image.permute(1, 2, 0)
        boxes = selective_search.selective_search(image, mode='fast')
        boxes_filter = selective_search.box_filter(boxes, min_size=20, topN=topN)
        return boxes_filter

In [13]:
# Preprocessing applied to every image tensor: convert to PIL, resize to
# 512x512, then convert back to a tensor.
transform = transforms.Compose([
                                transforms.ToPILImage() , 
                                transforms.Resize((512 , 512)) , 
                                transforms.ToTensor()
])
# NOTE(review): the paths below are hard-coded to a mounted Google Drive folder;
# adjust them when running elsewhere.
dataset = Dataset_(
    csv_file = "/content/drive/MyDrive/Yolo_Dataset/train.csv" , 
    img_dir = "/content/drive/MyDrive/Yolo_Dataset/images/" , 
    label_dir = "/content/drive/MyDrive/Yolo_Dataset/labels/" , 
    transform = transform
)
# One image per batch: FastRCNN.forward is called with regions.squeeze(0),
# i.e. the region proposals of a single image.
dataloader = torch.utils.data.DataLoader(dataset , batch_size=1 , shuffle=True)

In [None]:
# Sanity check: draw the first batch and print the shapes of its boxes and
# region proposals, then stop.
for img , box , region in dataloader:
    show_tensor_images(img)
    print(box.shape)
    print(region.shape)
    break

In [53]:
class FastRCNN(nn.Module):
    """Fast R-CNN style detector: VGG backbone, ROI pooling and two output heads.

    The backbone output is pooled to a 7x7 grid per region of interest, passed
    through two 4096-unit fully connected layers, and fed to a 20-way class
    head and a 4-value box head.
    """

    def __init__(self):
        super().__init__()
        hidden_dim = 4096
        num_classes = 20
        bboxes = 4
        self.vgg = VGG()
        self.flatten = nn.Flatten()
        # 7*7*512: a 7x7 ROI-pooled grid over the 512-channel backbone output.
        self.linear1 = Linear(7*7*512, hidden_dim, use_norm=False)
        self.linear2 = Linear(hidden_dim, hidden_dim, use_norm=False)

        self.linear_classes = Linear(hidden_dim, num_classes, use_activation=False, use_norm=False)
        self.linear_boxes = Linear(hidden_dim, bboxes, use_activation=False, use_norm=False)
        # Explicit dim: the implicit choice is deprecated and warns.
        # dim=1 is the class axis of the (K, num_classes) head output.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, regions):
        """Classify and regress a box for every region of interest.

        Args:
            x: Batch of images of shape ``(N, 3, H, W)``.
            regions: Tensor of shape ``(K, 5)`` in the single-tensor layout of
                ``torchvision.ops.roi_pool``: ``[batch_index, x1, y1, x2, y2]``.
                Coordinates are assumed to be in input-image pixels.

        Returns:
            A tuple ``(classes, bbox)`` where ``classes`` has shape ``(K, 20)``
            (softmax probabilities) and ``bbox`` has shape ``(K, 1, 4)``.
        """
        # Keep the feature map in its own variable: it is shared by all regions.
        features = self.vgg(x)
        rois = regions.to(device=features.device, dtype=features.dtype)
        # ROIs are in image pixels while the feature map is downsampled by the
        # backbone's max-pools, so the coordinates must be scaled accordingly.
        spatial_scale = features.shape[-1] / x.shape[-1]
        # All regions are pooled in one call; row k of every result belongs to region k.
        pooled = roi_pool(features, rois, (7, 7), spatial_scale=spatial_scale)
        hidden = self.flatten(pooled)
        hidden = self.linear1(hidden)
        hidden = self.linear2(hidden)
        classes = self.softmax(self.linear_classes(hidden))
        # unsqueeze keeps one (1, 4) entry per region, i.e. an overall (K, 1, 4) layout.
        bbox = self.linear_boxes(hidden).unsqueeze(1)
        return classes, bbox

In [None]:
# Smoke test: run a single batch through a freshly initialised detector and
# print the shapes of the two outputs.
fastrcnn = FastRCNN().to(device)
for img , target , regions in dataloader:
    img , target , regions = img.to(device) , target.to(device) , regions.to(device)
    # squeeze(0) drops the DataLoader batch dimension (batch_size=1) from the regions.
    classes , bbox = fastrcnn(img , regions.squeeze(0))
    break
print(classes.shape , bbox.shape)

In [55]:
# Loss functions used by train(): L1 for box regression, cross-entropy for classes.
recon_criterion = nn.L1Loss()
cross_entropy_criterion = nn.CrossEntropyLoss()
# NOTE(review): lambda_recon is not referenced by any cell visible here.
lambda_recon = 200
# Adam beta coefficients passed to the optimizer.
betas = (0.5 , 0.999)


# Number of passes over the dataloader performed by train().
n_epochs = 200
# NOTE(review): in_channels, out_channels and target_shape are not referenced
# by any cell visible here.
in_channels = 3
out_channels = 3
# train() reports the running loss every display_steps optimisation steps.
display_steps = 500
# Learning rate passed to the optimizer.
lr = 0.0002
target_shape = 512

In [56]:
# Fresh model for training (replaces the instance created for the smoke test)
# and the Adam optimizer over all of its parameters.
fastrcnn = FastRCNN().to(device)
opt = torch.optim.Adam(fastrcnn.parameters() , lr=lr , betas = betas)

In [57]:
def train():
    """Train the module-level ``fastrcnn`` model with the module-level settings.

    Runs ``n_epochs`` passes over ``dataloader``. For every batch the loss is
    the average of the summed cross-entropy between the predicted class
    distribution and each labelled box's class, and the L1 distance between
    the predicted box and the labelled boxes. The mean loss over the last
    ``display_steps`` updates is printed every ``display_steps`` updates.
    """
    mean_rcnn_loss = 0
    cur_step = 0
    for epoch in range(n_epochs):
        for img, target, regions in dataloader:
            img, target, regions = img.to(device), target.to(device), regions.to(device)
            opt.zero_grad()
            # squeeze(0) drops the DataLoader batch dimension from the regions.
            classes, bbox = fastrcnn(img, regions.squeeze(0))

            # NOTE(review): FastRCNN.forward already applies a softmax, while
            # nn.CrossEntropyLoss expects unnormalised logits — confirm whether
            # the model should return logits instead.
            class_loss = 0
            for i in range(target.shape[1]):
                # The last column of each target row is the class label.
                class_loss += cross_entropy_criterion(classes, target[:, i, -1].long())
            # The first four columns of each target row are the box coordinates.
            bbox_loss = recon_criterion(bbox, target[..., :4])
            loss = (class_loss + bbox_loss) / 2
            loss.backward()
            opt.step()

            # Accumulate loss / display_steps so that the sum over one display
            # window is the mean loss of that window.
            mean_rcnn_loss += loss.item() / display_steps
            cur_step += 1

            if cur_step % display_steps == 0:
                print(f'Epoch {epoch} , Step {cur_step} , Mean RCNN Loss {mean_rcnn_loss}')
                # Start a new window; otherwise the next report would mix in old steps.
                mean_rcnn_loss = 0

In [None]:
# Start training with the model, optimizer and settings defined in the cells above.
train()