In [75]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import torch.optim as optim
import torchcontrib.optim as optim_contrib
import torch.nn.functional as F
from torchvision import transforms
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torchvision
from PIL import Image
from sklearn.metrics import accuracy_score

In [2]:
torch.cuda.is_available()

True

In [3]:
from google.colab import drive

drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
PATH="/content/gdrive/MyDrive"
os.listdir(PATH)

['Colab Notebooks', 'train_images', 'csv']

# Declaring Global Varibles

In [5]:
CSV_PATH=PATH+"/csv"
IMAGE_PATH=PATH+"/train_images"
BS=8
LR=0.1
WEIGHT_DECAY=5E-4
EPOCH=10

# Importing data

In [6]:
data=pd.read_csv(CSV_PATH+"/train.csv")
data.head()

Unnamed: 0,image_name,label,template_name,xmin,ymin,xmax,ymax
0,0181 copie.jpg,0181 copie_0,0181 copie_0_0.jpg,216,696,545,1040
1,0181 copie.jpg,0181 copie_0_flipped,0181 copie_0_flipped_0.jpg,852,687,1210,1038
2,0181 copie.jpg,0181 copie_1,0181 copie_1_0.jpg,557,701,861,1080
3,0181 copie.jpg,0181 copie_2,0181 copie_2_0.jpg,194,1201,419,1380
4,0181 copie.jpg,0181 copie_2,0181 copie_2_1.jpg,1019,1201,1275,1379


In [7]:
image_names=data['image_name'].unique()
print(f"We have {len(image_names)} images")

We have 18 images


In [8]:
# Store the arrays of the image
image_names=list(data['image_name'].unique())
imageArray=pd.DataFrame({"image_name":[],"image_array":[]})
for im in image_names:
  img=Image.open(IMAGE_PATH+"/"+im)
  imageArray=pd.concat([imageArray,pd.DataFrame({"image_name":[im],"image_array":[np.array(img)]})])
  del img

In [9]:
imageArray.head()

Unnamed: 0,image_name,image_array
0,0181 copie.jpg,"[[[145, 138, 132], [140, 133, 127], [140, 133,..."
0,0183 copie.jpg,"[[[153, 146, 140], [165, 158, 152], [133, 126,..."
0,0185 copie.jpg,"[[[163, 156, 148], [156, 149, 141], [151, 144,..."
0,0187 copie.jpg,"[[[151, 144, 138], [151, 144, 138], [148, 141,..."
0,0189 copie.jpg,"[[[143, 136, 130], [146, 139, 133], [133, 126,..."


# Pretext Task

    > Predict Scale

In [33]:
# We won't save the arrays of the rescaled image, we will save the scaling factor and coordinates for crop
scaleData=pd.DataFrame({"image_name":[],"x_min":[],"y_min":[],"x_max":[],"y_max":[],"scaling_factor":[]})
scaleData["x_min"]=scaleData["image_name"].apply(lambda x:0)
scaleData["y_min"]=scaleData["image_name"].apply(lambda x:0)
scaleData["x_max"]=scaleData["image_name"].apply(lambda x:
                                                 imageArray[imageArray["image_name"]==x]["image_array"][0].shape[1])
scaleData["y_max"]=scaleData["image_name"].apply(lambda x:
                                                 imageArray[imageArray["image_name"]==x]["image_array"][0].shape[0])

In [84]:
# We will randomly crop an image and scale it to 300 * 300 and store the scaling factor as a target variable
scaling_factors=np.arange(1,3.1,0.1)
def rescale_crop(num,image_name):
    """ Randomly crop an image and scale it to 300 * 300 """
    global scaleData
    global data
    global scaling_factors

    h,w=imageArray[imageArray['image_name']==image_name]["image_array"].values[0].shape[:2]

    # Choose randomly a scaling factor
    sc=np.random.choice(scaling_factors)

    # Calculate the height and width
    new_w=int(300*sc)
    new_h=int(300*sc)

    # Choose randomly the coordinates
    new_x=int(np.random.randint(0,w-2*new_w))
    new_y=int(np.random.randint(0,h-new_h))

    # image name
    name=image_name.split(".")[0]+"_scale_"+str(num)+".jpg"

    scaleData=pd.concat([scaleData,pd.DataFrame({"image_name":[name],"x_min":[new_x],
    "y_min":[new_y],"x_max":[new_x+new_w],"y_max":[new_y+new_h],"scaling_factor":[sc]})])

In [35]:
%%time
for im in image_names:
    for i in range(200):
        rescale_crop(i,im)

CPU times: user 10.6 s, sys: 279 ms, total: 10.9 s
Wall time: 10.6 s


# New Section

In [85]:
scaleData=pd.read_csv("/content/gdrive/MyDrive/csv/scaleData.csv")

In [86]:
scaleData.head()

Unnamed: 0,image_name,x_min,y_min,x_max,y_max,scaling_factor
0,0181 copie_scale_0.jpg,562.0,2536.0,1282.0,3256.0,2.4
1,0181 copie_scale_1.jpg,939.0,3042.0,1269.0,3372.0,1.1
2,0181 copie_scale_2.jpg,88.0,1868.0,478.0,2258.0,1.3
3,0181 copie_scale_3.jpg,1203.0,486.0,1533.0,816.0,1.1
4,0181 copie_scale_4.jpg,901.0,2204.0,1471.0,2774.0,1.9


In [87]:
le=LabelEncoder()
scaleData['scaling_factor']=le.fit_transform(scaleData['scaling_factor'])

In [36]:

#scaleData.to_csv(CSV_PATH+"/"+"scaleData.csv",index=None)

In [119]:
class ScaleDataset(Dataset):
    def __init__(self, dataset, image_dataset, is_test=False, transform=None):
        #self.annotation_folder_path = csv_path
        self.dataset=dataset
        self.image_dataset=image_dataset
        self.all_images=self.dataset['image_name'].unique()
        self.transform = transform
        self.is_test = is_test
    
    def __getitem__(self,idx):
        img_name=self.all_images[idx]
        original_img_name=img_name.split("_")[0]+".jpg"
        coord=self.dataset[self.dataset['image_name']==img_name][["x_min","y_min","x_max","y_max"]].values[0]
        img=Image.fromarray(self.image_dataset[self.image_dataset['image_name']==original_img_name]['image_array'].values[0][
            int(coord[1]):int(coord[3]),int(coord[0]):int(coord[2])])
        #img=img.convert("RGB")

        if not self.is_test:
            annotations=self.dataset[self.dataset['image_name']==img_name]

            #self.box = self.get_xy(annotations)

            #self.new_box = torch.FloatTensor(self.box_resize(self.box, img))
            if self.transform is not None:
                img = self.transform(img)
            

            self.labels=torch.FloatTensor(annotations['scaling_factor'].values)

            return img, self.labels
        else:
            return img
    
    def __len__(self):
        return len(self.all_images)
        
    
    def collate_fn(self, batch):
        """
        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of images, lists of varying-size tensors of bounding boxes, labels, and difficulties
        """

        images = list()
        labels = list()
#         difficulties = list()

        for b in batch:
            images.append(b[0])
            labels.append(b[1])
#             difficulties.append(b[3])

        images = torch.stack(images, dim=0)

        return images, labels



In [120]:
tsfm = transforms.Compose([
    transforms.Resize([300,300]),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

In [121]:
x_train,x_test=train_test_split(scaleData,test_size=0.15,random_state=1)

In [122]:
train_ds = ScaleDataset(x_train,imageArray,transform=tsfm)
train_dl = DataLoader(train_ds, batch_size=BS, shuffle=True, collate_fn=train_ds.collate_fn)

valid_ds = ScaleDataset(x_test,imageArray, transform=tsfm)
valid_dl = DataLoader(valid_ds, batch_size=BS, shuffle=True, collate_fn=valid_ds.collate_fn)

In [117]:
x=next(iter(train_dl))

In [118]:
print(f"Size of the Patch {x[0][0].shape}, scaling factor {le.inverse_transform([int(x[1][0].item())])[0]}")

Size of the Patch torch.Size([3, 690, 690]), scaling factor 2.3000000000000007


In [133]:
def decimate(tensor, m):
    """
    Decimate a tensor by a factor 'm', i.e. downsample by keeping every 'm'th value.
    This is used when we convert FC layers to equivalent Convolutional layers, BUT of a smaller size.
    :param tensor: tensor to be decimated
    :param m: list of decimation factors for each dimension of the tensor; None if not to be decimated along a dimension
    :return: decimated tensor
    """
    assert tensor.dim() == len(m)
    for d in range(tensor.dim()):
        if m[d] is not None:
            tensor = tensor.index_select(dim=d,
                                         index=torch.arange(start=0, end=tensor.size(d), step=m[d]).long())

    return tensor
class VGGBase(nn.Module):
    """
    VGG base convolutions to produce lower-level feature maps.
    """

    def __init__(self):
        super(VGGBase, self).__init__()

        # Standard convolutional layers in VGG16
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)  # stride = 1, by default
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)  # ceiling (not floor) here for even dims

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv6=nn.Conv2d(512,256,kernel_size=1)
        self.conv7=nn.Conv2d(256,64,kernel_size=1)
        self.pool5=nn.MaxPool2d(kernel_size=2)

        # Linear Layers
        self.output=nn.Linear(in_features=64*9*9,out_features=21)


    def forward(self, image):
        """
        Forward propagation.
        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: lower-level feature maps conv4_3 and conv7
        """
        out = F.relu(self.conv1_1(image))  # (N, 64, 300, 300)
        out = F.relu(self.conv1_2(out))  # (N, 64, 300, 300)
        out = self.pool1(out)  # (N, 64, 150, 150)

        out = F.relu(self.conv2_1(out))  # (N, 128, 150, 150)
        out = F.relu(self.conv2_2(out))  # (N, 128, 150, 150)
        out = self.pool2(out)  # (N, 128, 75, 75)

        out = F.relu(self.conv3_1(out))  # (N, 256, 75, 75)
        out = F.relu(self.conv3_2(out))  # (N, 256, 75, 75)
        out = F.relu(self.conv3_3(out))  # (N, 256, 75, 75)
        out = self.pool3(out)  # (N, 256, 38, 38), it would have been 37 if not for ceil_mode = True

        out = F.relu(self.conv4_1(out))  # (N, 512, 38, 38)
        out = F.relu(self.conv4_2(out))  # (N, 512, 38, 38)
        out = F.relu(self.conv4_3(out))  # (N, 512, 38, 38)
        conv4_3_feats = out  # (N, 512, 38, 38)
        out = self.pool4(out)  # (N, 512, 19, 19)

        out = F.relu(self.conv5_1(out))  # (N, 512, 19, 19)
        out=F.relu(self.conv6(out))
        out=F.relu(self.conv7(out))
        out=F.relu(self.pool5(out))

        # Linear Layers
        flattened_array=out.reshape(-1,64*9*9)
        output=F.softmax(self.output(flattened_array))

        # Return output
        return output

In [134]:
vgg=VGGBase().to("cuda:0")

In [135]:
summary(vgg,(3,300,300))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 300, 300]           1,792
            Conv2d-2         [-1, 64, 300, 300]          36,928
         MaxPool2d-3         [-1, 64, 150, 150]               0
            Conv2d-4        [-1, 128, 150, 150]          73,856
            Conv2d-5        [-1, 128, 150, 150]         147,584
         MaxPool2d-6          [-1, 128, 75, 75]               0
            Conv2d-7          [-1, 256, 75, 75]         295,168
            Conv2d-8          [-1, 256, 75, 75]         590,080
            Conv2d-9          [-1, 256, 75, 75]         590,080
        MaxPool2d-10          [-1, 256, 38, 38]               0
           Conv2d-11          [-1, 512, 38, 38]       1,180,160
           Conv2d-12          [-1, 512, 38, 38]       2,359,808
           Conv2d-13          [-1, 512, 38, 38]       2,359,808
        MaxPool2d-14          [-1, 512,



In [136]:
class COCOBBackprop(optim.Optimizer):

  """ From https://github.com/anandsaha/nips.cocob.pytorch/blob/master/cocob.py """
    
  def __init__(self, params, alpha=100, epsilon=1e-8):
      
      self._alpha = alpha
      self.epsilon = epsilon
      defaults = dict(alpha=alpha, epsilon=epsilon)
      super(COCOBBackprop, self).__init__(params, defaults)
      
  def step(self, closure=None):
      
      loss = None
      
      if closure is not None:
          loss = closure()
          
      for group in self.param_groups:
          for p in group['params']:
              if p.grad is None:
                  continue
      
              grad = p.grad.data
              state = self.state[p]
              
              if len(state) == 0:
                  state['gradients_sum'] = torch.zeros_like(p.data).cuda().float()
                  state['grad_norm_sum'] = torch.zeros_like(p.data).cuda().float()
                  state['L'] = self.epsilon * torch.ones_like(p.data).cuda().float()
                  state['tilde_w'] = torch.zeros_like(p.data).cuda().float()
                  state['reward'] = torch.zeros_like(p.data).cuda().float()
                  
              gradients_sum = state['gradients_sum']
              grad_norm_sum = state['grad_norm_sum']
              tilde_w = state['tilde_w']
              L = state['L']
              reward = state['reward']
              
              zero = torch.cuda.FloatTensor([0.])
              
              L_update = torch.max(L, torch.abs(grad))
              gradients_sum_update = gradients_sum + grad
              grad_norm_sum_update = grad_norm_sum + torch.abs(grad)
              reward_update = torch.max(reward - grad * tilde_w, zero)
              new_w = -gradients_sum_update/(L_update * (torch.max(grad_norm_sum_update + L_update, self._alpha * L_update)))*(reward_update + L_update)
              p.data = p.data - tilde_w + new_w
              tilde_w_update = new_w
              
              state['gradients_sum'] = gradients_sum_update
              state['grad_norm_sum'] = grad_norm_sum_update
              state['L'] = L_update
              state['tilde_w'] = tilde_w_update
              state['reward'] = reward_update

      return loss

In [137]:
criterion=nn.CrossEntropyLoss()
optimizer=COCOBBackprop(vgg.parameters())

In [138]:

for epoch in range(EPOCH):
    vgg.train()
    train_loss=[]
    train_accuracy=[]
    test_accuracy=[]
    test_loss=[]
    for step,(img,labels) in enumerate(train_dl):
        labels=torch.tensor(labels)
        labels=labels.long()
        labels=labels.to("cuda:0")
        img=img.to("cuda:0")
        pred=vgg(img)
        loss=criterion(pred,labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
        accuracy=accuracy_score(labels.tolist(),torch.argmax(pred,dim=1).tolist())
        train_accuracy.append(accuracy)

    with torch.no_grad():
      for step,(img,labels) in enumerate(valid_dl):
        labels=torch.tensor(labels)
        labels=labels.long()
        labels=labels.to("cuda:0")
        img=img.to("cuda:0")
        pred=vgg(img)
        val_loss=criterion(pred,labels)

        test_loss.append(val_loss.item())
        accuracy=accuracy_score(labels.tolist(),torch.argmax(pred,dim=1).tolist())
        test_accuracy.append(accuracy)

    
    print(f"Epoch {epoch+1}, train loss: {np.mean(train_loss)}, test loss: {np.mean(test_loss)}\n")
    print(f"Train Accuracy:{np.mean(train_accuracy)}, Test Accuracy:{np.mean(test_accuracy)}\n")





Epoch 1, train loss: 3.0713974968882827, test loss: 3.0808904872221103

Train Accuracy:0.05254569190600522, Test Accuracy:0.042279411764705885

Epoch 2, train loss: 3.070624207080811, test loss: 3.0808904872221103

Train Accuracy:0.05254569190600522, Test Accuracy:0.042279411764705885

Epoch 3, train loss: 3.070624207080811, test loss: 3.0808904872221103

Train Accuracy:0.05254569190600522, Test Accuracy:0.042279411764705885

Epoch 4, train loss: 3.070624207080811, test loss: 3.0808904872221103

Train Accuracy:0.05254569190600522, Test Accuracy:0.042279411764705885

Epoch 5, train loss: 3.070624207080811, test loss: 3.0808904872221103

Train Accuracy:0.05254569190600522, Test Accuracy:0.042279411764705885

Epoch 6, train loss: 3.070297836323631, test loss: 3.079052251927993

Train Accuracy:0.05287206266318538, Test Accuracy:0.04411764705882353

Epoch 7, train loss: 3.070297836323631, test loss: 3.0808904872221103

Train Accuracy:0.05287206266318538, Test Accuracy:0.042279411764705885

