In [4]:
from google.colab import drive

drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [5]:
import os
os.chdir("/content/gdrive/MyDrive")
os.listdir()

['Colab Notebooks',
 'train_images',
 'csv',
 'SSD',
 'utils',
 '.ipynb_checkpoints']

In [91]:
%matplotlib inline
import numpy as np # Linear Algebra
import gc
import pandas as pd # Data Processing, CSV file I/O (e.g. pd.read_csv)

import torch
from torchvision import models
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn.functional as F
import torchvision
from torchvision.ops import RoIAlign
from torchsummary import summary


from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
import warnings
warnings.filterwarnings("ignore")
import timeit
from SSD.box_utils import cxcy_to_gcxgcy,cxcy_to_xy,gcxgcy_to_cxcy,xy_to_cxcy,find_jaccard_overlap,get_target_image
from SSD.ssd import SSD
from SSD.loss import MultiBoxLoss
from SSD.dataset import SSDDataset
from SSD.iou import bb_intersection_over_union
from SSD.decoder import Decoder

In [7]:
CUDA_LAUNCH_BLOCKING=1

In [8]:
imageData=pd.read_csv("/content/gdrive/MyDrive/csv/imageData_3.csv")
data=pd.read_csv("/content/gdrive/MyDrive/csv/data_3.csv")

In [9]:
image_names=list(data['image_name'].unique())
imageArray=pd.DataFrame({"image_name":[],"image_array":[]})
for im in image_names:
  try:
    img=Image.open("train_images/"+im)
    imageArray=pd.concat([imageArray,pd.DataFrame({"image_name":[im],"image_array":[np.array(img)]})])
    del img
  except:
    pass

# Dataset Genertation

In [82]:
class DecoderDataset(Dataset):
    def __init__(self, detector,data, imageData, imageArray, is_test=False, transform=None,output_size=(5,5)):
        #self.annotation_folder_path = csv_path
        self.detector=detector # Object Detector
        self.data=data # Contains the information about bounding boxes
        self.imageData=imageData # Contains the coordinate of the cropped images
        self.imageArray=imageArray # Contains the arrays of the original 18 images
        self.all_images=self.data['image_name'].unique()
        self.transform = transform
        self.is_test = is_test
        self.output_size=output_size
        # ROIALIGN
        self.roialign=RoIAlign(self.output_size,1,-1)
        
    def __getitem__(self, idx):
        img_name = self.all_images[idx]
        if "_" in img_name:
          original_img_name=img_name.split("_")[0]+".jpg"
        else:
          original_img_name=img_name
        coord=self.imageData[self.imageData['image_name']==img_name][["x_min","y_min","x_max","y_max"]].values[0]
        img = Image.fromarray(self.imageArray[self.imageArray['image_name']==original_img_name]['image_array'].values[0][
            int(coord[1]):int(coord[3]),int(coord[0]):int(coord[2]),:])
        img = img.convert('RGB')
        
        if not self.is_test:
            annotations=self.data[self.data['image_name']==img_name]

            self.box = self.get_xy(annotations)

            self.new_box = torch.cuda.FloatTensor(self.box_resize(self.box, img))
            if self.transform:
                img = self.transform(img)
            

            #self.labels=torch.FloatTensor(annotations['label'].values).cuda()

            """# Encode the labels with Int
            self.le=LabelEncoder()
            self.labels=torch.FloatTensor(self.le.fit_transform(self.labels))"""

            return img_name,img.cuda(), self.new_box
            #return img_name,img,self.new_box
        else:
            return img_name,img.cuda()
    
    def __len__(self):
        return len(self.all_images)
        
    def get_xy(self, annotation):
        boxes=torch.cuda.FloatTensor(annotation[['xmin','ymin','xmax','ymax']].values)
        return boxes
        
    def box_resize(self, box, img, dims=(300, 300)):
        old_dims = torch.cuda.FloatTensor([img.width, img.height, img.width, img.height]).unsqueeze(0)
        #old_dims=torch.FloatTensor([img.width, img.height, img.width, img.height]).unsqueeze(0)
        new_box = box.cuda() / old_dims
        new_dims = torch.cuda.FloatTensor([dims[1], dims[0], dims[1], dims[0]]).unsqueeze(0)
        #new_dims = torch.FloatTensor([dims[1], dims[0], dims[1], dims[0]]).unsqueeze(0)
        #new_box = new_box * new_dims
        
        return new_box
    
    def collate_fn(self, batch):
        """
        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties
        """
        image_names=list() # Name of the images
        images = list()
        boxes = list()
        labels = list()
#         difficulties = list()

        for b in batch:
            image_names.append(b[0])
            images.append(b[1])
            boxes.append(b[2])
            #labels.append(b[3])
#             difficulties.append(b[3])

        images = torch.stack(images, dim=0)
        # Get RoI's offset
        self.locs,self.classes=self.detector(images)
        self.classes=self.classes.cpu().detach()
        self.locs=self.locs.cpu().detach()

        # Transfer ssd style coordinates to xy coordinates
        self.__batch_size=self.locs.size(0)
        for i in range(self.__batch_size):
          self.locs[i]=cxcy_to_xy(gcxgcy_to_cxcy(self.locs[i],self.detector.priors_cxcy.cpu()))

        # Get all the feature maps
        self.get_feature_maps(images) #(Batch Size, Num of Object, 5, 5)
        self.make_single_box()

        return image_names,self.feature_map, self.locations,boxes  # tensor (N, 3, 300, 300),
    
    def get_feature_maps(self,x):
      """ Get all the feature maps from the ssd detector"""
      
      self.rescale_factors = nn.Parameter(torch.cuda.FloatTensor(1, 512, 1, 1))  # there are 512 channels in conv4_3_feats
      # ROI Postions in Locs data Dictinary
      self.__roi_pos_dict={"conv4_3":[0,38*38],"conv7":[38*38,38*38+19*19],
                  "conv8_2":[38*38+19*19,38*38+19*19+10*10],"conv9_2":[38*38+19*19+10*10,38*38+19*19+10*10+5*5],
                  "conv10_2":[38*38+19*19+10*10+5*5, 38*38+19*19+10*10+5*5+3*3]}
      self.feature_map_size={"conv4_3":38,"conv7":19,"conv8_2":10,"conv9_2":5,"conv10_2":3}

      # Get Conv4 and Conv7 from VGG
      self.conv4_3,self.conv7=self.detector.base(x)

      # Rescale conv4_3 after L2 norm
      norm = self.conv4_3.pow(2).sum(dim=1, keepdim=True).sqrt()  # (N, 1, 38, 38)
      self.conv4_3 = self.conv4_3 / norm  # (N, 512, 38, 38)
      self.conv4_3 = self.conv4_3 * self.rescale_factors 

      # Get rest of the feature maps from conv7
      self.conv8_2, self.conv9_2, self.conv10_2, self.conv11_2 = self.detector.aux_convs(self.conv7)

      self.__feat_map_dict={"conv4_3":self.conv4_3,"conv7":self.conv7,
                    "conv8_2":self.conv8_2,"conv9_2":self.conv9_2,"conv10_2":self.conv10_2,
                    "conv11_2":self.conv11_2}


      # For every object get it respective locations in feature map
      self.locations,self.scores=self.get_map_for_objects()

      # Merge feature maps 
      self.feature_map=self.merge_feature_maps()
    
    def get_map_for_objects(self):
      # Get the locations of objects in feature maps
      locations=[]
      scores=[]
      for i in range(self.__batch_size):
        sc_bx,cl_bx=self.classes[i].max(dim=1)
        un_cl=torch.unique(cl_bx)
        # The locations of objects in every feature maps
        class_dict={}
        # The scores for the locations
        score_dict={}
        for num,k in enumerate(un_cl):
          # We don't need background so eliminate object 0
          if k!=0:
            new_bx_k=dict()
            score_for_k=dict()
            bx_k=(cl_bx==k).nonzero(as_tuple=True)[0].cpu()
            for rp in list(self.__roi_pos_dict.keys()):
              min_,max_=self.__roi_pos_dict[rp]
              max_bx_k=bx_k[bx_k<max_]
              range_bx_k=max_bx_k[max_bx_k>=min_]
              # Extract the feature m0ap name where positions belong
              try:
                locs_for_k=torch.clamp(self.locs[i][range_bx_k[torch.argmax(sc_bx[range_bx_k]).item()].item()],min=0,max=1)*self.feature_map_size[rp]
                score_for_k[rp]=torch.max(sc_bx[range_bx_k]).item()
                new_bx_k[rp]=locs_for_k.tolist()
              except:
                pass
            class_dict[k.cpu().item()]=new_bx_k
            score_dict[k.cpu().item()]=score_for_k
        locations.append(class_dict)
        scores.append(score_dict)
      return locations,scores
      
    
    def merge_feature_maps(self):
      """ Merge the Fearure Maps for every object."""
      feature_map=[]
      upsample=nn.Upsample((2560,5))
      for i in range(self.__batch_size):
        # Feature Maps for every Batch. Separate feature maps for separate images.
        feature_map_bt=[]
        for ob in self.locations[i].keys():
          # Feature maps for every object. We will concatenate the feature maps
          temp_ob=[]
          for fm in self.locations[i][ob].keys():
            roi=self.locations[i][ob][fm]
            roi=[0]+roi
            roi=torch.cuda.FloatTensor(roi).unsqueeze(0)
            aligned_image=self.roialign(self.__feat_map_dict[fm],roi)
            temp_ob.append(aligned_image)
          
          temp_ob=torch.moveaxis(torch.cat(temp_ob,dim=1),1,2)
          temp_ob=upsample(temp_ob)
          temp_ob=torch.moveaxis(temp_ob,2,1)
          feature_map_bt.append(temp_ob)
        feature_map.append(torch.cat(feature_map_bt,dim=0))
      
      return feature_map
    
    def make_single_box(self):
      """ Some images have more than one boxes as they are captured in more than one feature maps. 
      We need to take the one with maximum scores"""
      for i,lc in enumerate(self.locations):
        for j,ob in enumerate(lc.keys()):
          
          if len(list(lc[ob].keys()))>1:
            represented_box=np.argmax(list(self.scores[i][ob].values()))
            self.scores[i][ob]=np.max(list(self.scores[i][ob].values()))
            lc[ob]=torch.tensor(lc[ob][list(lc[ob].keys())[represented_box]]).to("cuda")/self.feature_map_size[list(lc[ob].keys())[represented_box]]
          else:
            self.scores[i][ob]=list(self.scores[i][ob].values())[0]
            lc[ob]=torch.tensor(lc[ob][list(lc[ob].keys())[0]]).to("cuda")/self.feature_map_size[list(lc[ob].keys())[0]]

In [83]:
tsfm = transforms.Compose([
    transforms.Resize([300,300]),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
tsfmV2=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
# Batch Size
BS=4

In [84]:
N_CLASSES=data['label'].nunique()+1
model=SSD(n_classes=N_CLASSES)
model.load_state_dict(torch.load("/content/gdrive/MyDrive/SSD/model_giou_1b.pth"))


Loaded base model.



<All keys matched successfully>

In [85]:
train_ds = DecoderDataset(model,data.iloc[4606:,:], imageData,imageArray, transform=tsfm)
train_dl = DataLoader(train_ds, batch_size=BS, shuffle=True, collate_fn=train_ds.collate_fn)

# Load Pretrained SSD Model

In [86]:
N_CLASSES=data['label'].nunique()+1
model=SSD(n_classes=N_CLASSES)
model.load_state_dict(torch.load("/content/gdrive/MyDrive/SSD/model_giou_1b.pth"))


Loaded base model.



<All keys matched successfully>

In [100]:
class Decoder1(nn.Module):
  def __init__(self):
    super(Decoder1,self).__init__()
    """ 
    Parameter:
    detector --> Object Detector
    output_size --> Output size of ROI Align step. Default (5,5)
    """
    # Decoding Layers
    self.convT_1=nn.ConvTranspose2d(2560,1024,kernel_size=3,stride=2) #(1024, 11, 11)
    self.convT_2=nn.ConvTranspose2d(1024,512,kernel_size=3,stride=2) #(512, 23, 23)
    self.convT_3=nn.ConvTranspose2d(512,256,kernel_size=3,stride=1) #(256, 25, 25)

    self.convT_4=nn.ConvTranspose2d(256,128,kernel_size=3,stride=2) # (128, 51, 51)
    self.convT_5=nn.ConvTranspose2d(128,3,kernel_size=3,stride=2) # (3, 103, 103)
    self.upsample_1=nn.Upsample((100,100)) # (3, 100, 100)


  def forward(self,x):
    # Output Images for batches
    self.__batch_size=len(x)
    output=[]
    for i in range(self.__batch_size):
      out=F.relu(self.convT_1(x[i]))
      out=F.relu(self.convT_2(out))
      out=F.relu(self.convT_3(out))
      out=F.relu(self.convT_4(out))
      out=F.relu(self.convT_5(out))
      #out=F.relu(self.convT_6(out))
      #out=F.relu(self.convT_7(out))
      out=self.upsample_1(out)
      output.append(out)

    return output

In [101]:
EPOCH=10
LR=0.01

In [102]:
class DecoderLoss(nn.Module):
  def __init__(self,losstype="mse"):
    super(DecoderLoss,self).__init__()
    self.losstype="mse"
    if self.losstype=="mse":
      self.loss=nn.MSELoss()
  
  def forward(self,image_reconstructed,target_image,pred_loc,actual_loc):    
    # Now we need to match the template of the reconstructed image with target image.
    total_loss=[]
    batch_size=len(pred_loc)

    for batch in range(batch_size):
      loss_per_batch=0
      get_overlap=find_jaccard_overlap(torch.stack(list(pred_loc[batch].values())),actual_loc[batch])
      indices=get_overlap.max(dim=1)[1]
      for k in range(len(indices)):
        image=self.get_image(target_image[batch],actual_loc[batch][indices[k]])
        image=self.resize_image(target_image[batch])
        ls=self.loss(image_reconstructed[batch][k].to("cuda"),image.to("cuda"))
        loss_per_batch+=ls
      total_loss.append(loss_per_batch)
    
    loss=torch.mean(torch.stack(total_loss))
    return loss

    
  def get_image(self,image,box):
    box=torch.round(box*image.size(-1))
    box=[int(b) for b in box.tolist()]
    return image[box[1]:box[3],box[0]:box[2]]
  
  def resize_image(self,image,size=(100,100)):
    transforms_pil=transforms.ToPILImage()
    transforms_tensor=transforms.ToTensor()

    image=transforms_tensor(transforms_pil(image).resize(size,Image.NEAREST))
    return image

In [103]:
decoder=Decoder1().cuda()
criterion=DecoderLoss("mse")
optimizer=torch.optim.Adam(decoder.parameters(),lr=LR)

# Model Training

In [None]:
from tqdm import tqdm_notebook
import time
for epoch in range(1, EPOCH+1):
    decoder.train()
    train_loss = []
    time_start=time.time()
    for step, (target_image_names,feature_maps,locations,boxes) in enumerate(tqdm_notebook(train_dl)):
        time_1 = time.time()
        target_image=get_target_image(target_image_names,data,imageData,imageArray,tsfmV2)
        
        reconstructed_image = decoder(feature_maps)
        #locations=[bx.to("cpu") for bx in locations]
        
        loss = criterion(reconstructed_image,target_image,locations,boxes)
        
        # Back propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
    time_end=time.time()
    total_time=str((time_end-time_start)//60)+" minutes and "+ str(np.round((time_end-time_start)%60))+" seconds "
        
        
    print("Time:",total_time, ' epoch: ', epoch, '/', EPOCH,
            'train loss:', '{:.4f}'.format(np.mean(train_loss)))

HBox(children=(FloatProgress(value=0.0, max=337.0), HTML(value='')))


Time: 4.0 minutes and 58.0 seconds   epoch:  1 / 10 train loss: 45433693.8438


HBox(children=(FloatProgress(value=0.0, max=337.0), HTML(value='')))


Time: 4.0 minutes and 56.0 seconds   epoch:  2 / 10 train loss: 2.7033


HBox(children=(FloatProgress(value=0.0, max=337.0), HTML(value='')))


Time: 4.0 minutes and 55.0 seconds   epoch:  3 / 10 train loss: 2.7033


HBox(children=(FloatProgress(value=0.0, max=337.0), HTML(value='')))