In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
from PIL import Image

In [None]:
class DDataset(Dataset):
    def __init__(self, path, grid_size, d_per_grid):
        super().__init__()
        self.path = path
        self.grid_size = grid_size
        self.d_per_grid = d_per_grid
        self.images = self.__list_data_path(os.path.join(path, "images"))
        self.labels = self.__list_data_path(os.path.join(path, "labels"))
        self.transform = T.Compose([T.ToTensor(),  # Convert to tensor, pixels scaled to [0, 1]
                                    T.Normalize(mean=[0.5, 0.5, 0.5],  # Normalize
                                    std=[0.5, 0.5, 0.5])
                                    ])

    def __list_data_path(self, path):
        # this is just to get all the file name so that when the dataloader call on dataset,
        # we can read from the path + lable + labels[idx] and same for image
        paths = os.listdir(path)
        return sorted(paths)
    
    def __get_best_matching_ix(self, target, embed_layer):
        _ , sorted_idx = torch.sort(target, descending=True)
        
        for idx in sorted_idx:
            if embed_layer[..., idx, 0] == 0:
                return idx
            else:
                continue
            
        return None
    
    def __embbed_label(self, label:torch.Tensor, embed_layer:torch.Tensor, anchor:torch.Tensor):
        #from my model i have calculated the final layer will give me 15 x 15 grid
        #channel are just Anchor * (5 + no of classes)
        #b x c x 15 x 15   #gridx, gridy, a, value
        c, cx, cy, w, h, = label
        
        #just one_hotting the class its easy to when we finnaly attach all the thing to gether on to the embbed layer
        c = F.one_hot(c, 2)
        
        #claculation of the grid and the object local pos inside the grid
        x_grid, y_grid = int(self.grid_size * cx), int(self.grid_size * cy)
        local_x, local_y = self.grid_size * cx - self.grid_size, self.grid_size * cy - self.grid_size
        
        #objectnedss is always one if the object is available
        objectness = 1
        
        #getting the best matching anchore idx and assigning the box to it if best is not available will go for the other (three idx in my case)
        gt_box = torch.tensor([0, 0, w, h])
        anchor_boxes = torch.cat(torch.zeros_like(anchor), anchor, dim=1)
        intersection = torch.min(gt_box[2:], anchor_boxes[:, 2:])
        inter_area = intersection[:, 0] * intersection[:, 1]
        anchor_area = anchor_boxes[:, 0] * anchor_boxes[:, 1]
        gt_area = w * h
        union = gt_area + anchor_area - inter_area #formula area_01 + area_02 - intersection_area
        ious = inter_area / (union + 1e-6) # this is just to eleminate the zero division by adding a very small number
        best_ious_idx = self.__get_best_matching_ix(ious, embed_layer) #will return the best matching index by comparing
        if best_ious_idx is None:
            return embed_layer #return the original embbedlayer if the anchore is not available
        
        #assign the box
        embed_layer[x_grid, y_grid, best_ious_idx] = torch.cat(torch.tensor[objectness, local_x, local_y, w, h],
                                                               c) #this why i want to do the one hot in advance so that 
                                                                  #this line of code can be one line
        return embed_layer
    
    def __decode_lable(self, idx):
        #reading the text file
        with open(os.path.join(self.path, "labels" ,self.labels[idx])) as file:
            print(file.read())
            label_tensor = torch.tensor(
                            [[torch.tensor(x, dtype=torch.float32) for x in line.strip().split()] for line in file if line.strip()],
                            dtype=torch.float64
                            )
        
        #anchor boxes i have just opened the images and looked in to the shape defined some bacis shapes,
        # like vertical square horizonal squares and rectangles    
        anchor = torch.tensor([[0.937,0.1875],
                               [0.375, 0.5156],
                               [0.15, 0.15],
                               [0.25, 0.25],
                               ])
        
        #embbed layer initial
        embbed_layer = torch.zeros((15, 15, 4, 5+2))
        
        #looping through all the label in single image
        for label in label_tensor:
            embbed_layer = self.__embbed_label(label, embbed_layer, anchor)
            
        return embbed_layer
    
    
    def __get_image(self, idx):
        #openning the image
        image = Image.open(os.path.join(self.path, "images" ,self.images[idx]))
        image = image.resize((256, 256))
        image = self.transform(image)
        return image
    
    def __getitem__(self, index):
        lable = self.__decode_lable(index)
        image = self.__get_image(index)
        return image, lable

In [5]:
def get_loss(output, target, obj_llamda=1.0, noobj_llamda=0.5, box_llamda=1.0, class_llamda=1.0):
    # output shape after reshaping (b, s, s, a, 5 + c)
    obj_mask = target[...,0]== 1
    noobj_mask = target[...,0] == 0
    
    #obj_loss
    #the object loss is summed separatly so that we can panalties the no object box which are not at all requried
    obj_loss = F.binary_cross_entropy_with_logits(output[...,0], target[...,0], reduction='none')
    obj_loss = obj_llamda * obj_loss[obj_mask].sum() + noobj_llamda * obj_loss[noobj_mask].sum()
    
    #box_loss
    if obj_mask.sum() > 0:
        box_loss = F.smooth_l1_loss(output[..., 1:5], target[...,1:5], reduction='sum')
        # box_loss = box_loss * box_llamda
    
        #class_loss
        pred_class = output[..., 5:][obj_mask]
        target_class = output[..., 5:][obj_mask].argmax(dim=-1)
        class_loss = F.cross_entropy(pred_class, target_class, reduction='sum')
        # class_loss = class_loss * class_llamda
        
    else:
        box_loss = torch.tensor(0.0, device=output.device)
        class_loss = torch.tensor(0.0, device=output.device)
    
    total_loss = obj_loss + (class_loss * class_llamda) + box_loss * box_llamda
    
    return total_loss, obj_loss, class_loss, box_loss

In [57]:
dataset = DDataset("/home/nileshtn/Desktop/detection/Data/train", 15, 4)

In [58]:
dataset[1]

0 0.40546875 0.4578125 0.0734375 0.1875


(tensor([[[-0.2863, -0.2706, -0.3020,  ..., -0.2784, -0.3882, -0.4824],
          [-0.1686, -0.2471, -0.2627,  ..., -0.3176, -0.4118, -0.4510],
          [-0.2549, -0.3098, -0.2471,  ..., -0.2549, -0.2941, -0.4196],
          ...,
          [-0.7490, -0.7412, -0.7255,  ..., -0.7412, -0.7647, -0.7098],
          [-0.7333, -0.7333, -0.7255,  ..., -0.6000, -0.7176, -0.7725],
          [-0.7176, -0.7020, -0.6784,  ..., -0.5843, -0.5451, -0.7020]],
 
         [[-0.2549, -0.2392, -0.2784,  ..., -0.3569, -0.4902, -0.5922],
          [-0.1373, -0.2157, -0.2392,  ..., -0.4118, -0.5216, -0.5686],
          [-0.2314, -0.2863, -0.2235,  ..., -0.3569, -0.4196, -0.5451],
          ...,
          [-0.7255, -0.7098, -0.7020,  ..., -0.6235, -0.7255, -0.7490],
          [-0.6863, -0.6863, -0.6784,  ..., -0.4353, -0.6314, -0.7490],
          [-0.6706, -0.6549, -0.6314,  ..., -0.3882, -0.4196, -0.6392]],
 
         [[ 0.2392,  0.2549,  0.2471,  ..., -0.2235, -0.3412, -0.4118],
          [ 0.3569,  0.2941,

In [30]:
a = torch.tensor([1, 0, 0, 0])
torch.cat((torch.tensor([2, 4, 6, 7]), a))

tensor([2, 4, 6, 7, 1, 0, 0, 0])

In [14]:
h = 0.1
w = 0.2
gt_box = torch.tensor([0, 0, h, w])
anchors = torch.tensor([[0.1, 0.04],
                        [0.12, 0.08],
                        [0.14, 0.1],
                        [0.04, 0.1]])
anchor_boxes = torch.cat([torch.zeros_like(anchors), anchors], dim=1)

In [17]:
inter = torch.min(gt_box[2:], anchor_boxes[:, 2:])
inter_area = inter[:,0] * inter[:,1]
gt_area = h * w
anchor_area = anchors[:,0] * anchors[:,1] 
print("intersection points : ",inter)
print("intersection area : ", inter_area)
print("anchor area :", anchor_area)
print("gt_area :", gt_area)

intersection points :  tensor([[0.1000, 0.0400],
        [0.1000, 0.0800],
        [0.1000, 0.1000],
        [0.0400, 0.1000]])
intersection area :  tensor([0.0040, 0.0080, 0.0100, 0.0040])
anchor area : tensor([0.0040, 0.0096, 0.0140, 0.0040])
gt_area : 0.020000000000000004


In [23]:
union_area = anchor_area + gt_area - inter_area
print("intersection_point : ", union_area)
ious = inter_area/(union_area + 1e-6)
best_anchor = torch.argmax(ious)
print("index of best anchore : ", best_anchor.item())

intersection_point :  tensor([0.0200, 0.0216, 0.0240, 0.0200])
index of best anchore :  2


In [36]:
hello = torch.zeros([15, 15, 4, 3])

In [37]:
hello[0,0,0] = torch.ones([3])

In [38]:
hello

tensor([[[[1., 1., 1.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         ...,

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]]],


        [[[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],

         ...,

         [[0., 0., 0.],
          [0., 0., 0.],
          [0., 0., 0.],
          [0.,