In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install timm

In [None]:
!gdown --folder https://drive.google.com/drive/folders/115l82GBgu6RETopB_Qy36hn4TVwXI2t5?usp=sharing

In [None]:
!mkdir "/content/Datasets"
!mkdir "/content/Datasets/images"
!mkdir "/content/Datasets/images/test"
!unzip "/content/Advanced/CV/Test.zip" -d "/content/Datasets/images/test"

!mkdir "/content/Datasets/images/suspects"
!unzip "/content/Advanced/CV/suspects.zip" -d "/content/Datasets/images/suspects"

In [4]:
import os
import pandas as pd
import cv2
import torch
from torch import nn
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

ROOT = "." # Change this accordingly - tmp location

In [23]:
from PIL import Image
from torchvision import transforms
import cv2
from transformers import  AutoImageProcessor, ResNetModel,ViTImageProcessor , ViTModel
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

class BGR2RGB:
    def __call__(self, image):
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
class SquarePad:
    def __call__(self, image):
        max_wh = max(image.shape[:2])
        p_top, p_left = [(max_wh - s) // 2 for s in image.shape[:2]]
        p_bottom, p_right = [max_wh - (s+pad) for s, pad in zip(image.shape[:2], [p_top,p_left])]
        return cv2.copyMakeBorder(image, p_top, p_bottom, p_left, p_right, cv2.BORDER_CONSTANT, None, value = 0)

def load_feature_extractor(model_name):
    if model_name == "resnet":
        model = ResNetModel.from_pretrained("microsoft/resnet-50").to(device).eval()
        transform = transforms.Compose([BGR2RGB(),
            SquarePad(),
            transforms.ToTensor(),
            transforms.Resize((224, 224)),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        
        return model, transform 
    elif model_name == "vit":
        processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
        model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to(device).eval()
    
        return model, processor

    elif model_name == "senet":
        model = timm.create_model('seresnet152d', pretrained=True,num_classes = 0).to(device).eval()
        config = resolve_data_config({}, model=model)
        transform = create_transform(**config)
        return model, transform     

class NN_Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NN_Classifier, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Dropout(p = 0.5),
            nn.Linear(input_size, hidden_size),
            nn.GELU(),
            nn.Dropout(p = 0.5),
            nn.Linear(hidden_size, output_size)
        )
    
    def forward(self, x):
        output = self.linear_relu_stack(x)
        return output

def signed_sqrt(x1, x2):
    return torch.sign(x1*x2) * torch.sqrt(torch.abs(x1*x2))

def combine_function(x1, x2):
    return torch.concatenate([x1 + x2, x1 - x2, x2 -x1, x1**2 + x2**2, x1*x2, signed_sqrt(x1, x2)], axis = -1)

In [16]:
submission_csv = pd.read_csv(ROOT + "/submission_probabilities.csv")
submission_csv.head(10)

Unnamed: 0,Image_ID,class,confidence,ymin,xmin,ymax,xmax,vit_2_cls,vit_1_cls,resnet_2_cls,resnet_1_cls,resnet_3_cls,senet_1_cls,senet_2_cls,senet_3_cls
0,image_1112,0,0.956391,0.628919,0.645016,0.694816,0.790418,0.879022,0.942505,0.299726,0.37855,0.477408,0.763721,0.806535,0.75245
1,image_1112,0,0.944007,0.243387,0.589338,0.318815,0.763921,0.079087,0.144072,0.128551,0.050557,0.043182,0.031714,0.021016,0.043715
2,image_1112,0,0.917277,0.461447,0.676808,0.545236,0.786817,0.269358,0.521979,0.137847,0.096935,0.07276,0.208639,0.294031,0.235927
3,image_1550,0,0.984412,0.615909,0.727547,0.738454,0.872014,0.314804,0.487395,0.749146,0.637006,0.777047,0.243104,0.273908,0.273096
4,image_1550,0,0.971659,0.199286,0.670617,0.331255,0.80588,0.829947,0.954759,0.856347,0.608226,0.653994,0.256499,0.125517,0.273248
5,image_1550,0,0.956981,0.424419,0.618619,0.493475,0.813795,0.19269,0.415928,0.658531,0.427732,0.263159,0.011082,0.009006,0.020303
6,image_1481,0,0.974394,0.663104,0.646792,0.75931,0.860687,0.957781,0.98424,0.650876,0.556918,0.422574,0.967236,0.991761,0.947013
7,image_0154,0,0.986508,0.538034,0.64111,0.657833,0.886337,0.115857,0.047401,0.148203,0.11949,0.055876,0.003809,0.005046,0.030377
8,image_0154,0,0.983366,0.344311,0.685167,0.505868,0.858593,0.981118,0.98282,0.556874,0.909601,0.917879,0.839068,0.539289,0.883535
9,image_0154,0,0.933418,0.176906,0.611474,0.276579,0.82487,0.058014,0.018413,0.018447,0.015657,0.003014,8e-05,0.000762,0.002412


# **Using Features extracted by ResNet50 for Reidentification**
Source: https://huggingface.co/microsoft/resnet-50

In [None]:
image_path = "/content/Datasets/images/test"
suspect_path = "/content/Datasets/images/suspects/crops"

submission_csv = pd.read_csv(ROOT + "/submission_yolo8x.csv") # Obtained from Object Detection Notebook

In [None]:
model, processor = load_feature_extractor("resnet")
classifier = NN_Classifier(2048 * 6, 2048, 1).to(device)
classifier.load_state_dict(torch.load(ROOT + "/resnet_best.pt")['model_state_dict'])
classifier.eval()

In [None]:
submission_csv['resnet_cls'] = None

for i in tqdm(range(len(submission_csv))):
    
  # Cropped Toy
  img = cv2.imread(image_path + f"/{submission_csv['Image_ID'][i]}.png")
  img_h, img_w = img.shape[:2]

  bb = submission_csv.iloc[i]
  
  tl = (int(bb["xmin"] * img_w), int(bb["ymin"] * img_h))
  br = (int(bb["xmax"] * img_w), int(bb["ymax"] * img_h))

  cropped_img = img[tl[1]:br[1], tl[0]:br[0]]
  
  img = processor(cropped_img).reshape(1, 3, 224, 224).to(device)
  
  # Suspect
  suspect_img = cv2.imread(suspect_path + f"/{submission_csv['Image_ID'][i]}.png")
  suspect_img = processor(suspect_img).reshape(1, 3, 224, 224).to(device)
  
  with torch.no_grad():
      output_1 = resnet(img)['pooler_output'].reshape(1, -1)
      output_2 = resnet(suspect_img)['pooler_output'].reshape(1, -1)
      
      combined_output = combine_function(output_1, output_2)
      
      output= classifier(combined_output)
      y_pred = nn.Sigmoid()(output)
      submission_csv.loc[i, "resnet_cls"] = y_pred.item()
        

# **Using Features extracted by Vision Transformer for Reidentification**
Source: https://huggingface.co/google/vit-base-patch16-224

In [None]:
model, processor = load_feature_extractor("vit")
classifier = NN_Classifier(768 * 6, 2048, 1).to(device)
classifier.load_state_dict(torch.load(ROOT + "/vit_best.pt")['model_state_dict'])
classifier.eval()

NN_Classifier(
  (linear_relu_stack): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=3072, out_features=1024, bias=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=1024, out_features=1, bias=True)
  )
)

In [None]:
from tqdm import tqdm

image_path = "/content/Datasets/images/test"
suspect_path = "/content/Datasets/images/suspects"

submission_csv['vit_cls'] = None

for i in tqdm(range(len(submission_csv))):
    
  # Cropped Toy
  img = cv2.imread(image_path + f"/{submission_csv['Image_ID'][i]}.png") 
  img_h, img_w = img.shape[:2]

  bb = submission_csv.iloc[i]
  
  tl = (int(bb["xmin"] * img_w), int(bb["ymin"] * img_h))
  br = (int(bb["xmax"] * img_w), int(bb["ymax"] * img_h))

  cropped_img = img[tl[1]:br[1], tl[0]:br[0]]
  img = processor(BGR2RGB()(cropped_img), return_tensors = "pt")['pixel_values'][0].unsqueeze(0).to(device)  

  # Suspect
  suspect_img = cv2.imread(suspect_path + f"/{submission_csv['Image_ID'][i]}.png") 
  suspect_img = processor(BGR2RGB()(suspect_img), return_tensors = "pt")['pixel_values'][0].reshape(1, 3, 224, 224).to(device) 

  with torch.no_grad():
      output_1 = model(img)['pooler_output'].reshape(1, -1)
      output_2 = model(suspect_img)['pooler_output'].reshape(1, -1)
      
      combined_output = combine_function(output_1, output_2)
      
      output= classifier(combined_output)
      y_pred = nn.Sigmoid()(output)
      submission_csv.loc[i, "vit_cls"] = y_pred.item()    

100%|██████████| 3474/3474 [03:26<00:00, 16.86it/s]


# **Using Features extracted by SENet for Reidentification**
Source: https://huggingface.co/docs/timm/models/se-resnet

In [None]:
model, processor = load_feature_extractor("senet")
classifier = NN_Classifier(2048 * 6, 2048, 1).to(device)
classifier.load_state_dict(torch.load(ROOT + "/senet_best.pt")['model_state_dict'])
classifier.eval()

In [None]:
from tqdm import tqdm
import numpy as np

image_path = "/content/Datasets/images/test"
suspect_path = "/content/Datasets/images/suspects"

submission_csv['senet_cls'] = None

for i in tqdm(range(len(submission_csv))):
    
  # Cropped Toy
  img = cv2.imread(image_path + f"/{submission_csv['Image_ID'][i]}.png") 
  img_h, img_w = img.shape[:2]

  bb = submission_csv.iloc[i]
  
  tl = (int(bb["xmin"] * img_w), int(bb["ymin"] * img_h))
  br = (int(bb["xmax"] * img_w), int(bb["ymax"] * img_h))

  cropped_img = img[tl[1]:br[1], tl[0]:br[0]]

  cropped_img = Image.fromarray(BGR2RGB()(cropped_img)) 
  
  img = processor(cropped_img).unsqueeze(0).to(device) 

  # Suspect
  suspect_img = Image.open(suspect_path + f"/{submission_csv['Image_ID'][i]}.png").convert('RGB')
  
  suspect_img = processor(suspect_img).unsqueeze(0).to(device)


  with torch.no_grad():
      output_1 = model(img).reshape(1, -1)
      output_2 = model(suspect_img).reshape(1, -1)
      
      combined_output = combine_function(output_1, output_2)
      
      output= classifier(combined_output)
      y_pred = nn.Sigmoid()(output)
      print(y_pred)
      submission_csv.loc[i, "senet_cls"] = y_pred.item()
    

  0%|          | 0/3474 [00:00<?, ?it/s]

tensor([[0.2123]], device='cuda:0')





# **Using Combined Features extracted by ResNet, SENet, and Vision Transformer for Reidentification**

In [26]:
resnet_model, resnet_processor = load_feature_extractor("resnet")
senet_model, senet_processor = load_feature_extractor("senet")
vit_model, vit_processor = load_feature_extractor("vit")
classifier = NN_Classifier(4864 * 6, 1024, 1).to(device)
classifier.load_state_dict(torch.load(ROOT + "/combined_best.pt")['model_state_dict'])
classifier.eval()

Some weights of the model checkpoint at microsoft/resnet-50 were not used when initializing ResNetModel: ['classifier.1.weight', 'classifier.1.bias']
- This IS expected if you are initializing ResNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ResNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NN_Classifier(
  (linear_relu_stack): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=29184, out_features=1024, bias=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=1024, out_features=1, bias=True)
  )
)

In [None]:
from tqdm import tqdm
import numpy as np

image_path = "/content/Datasets/images/test"
suspect_path = "/content/Datasets/images/suspects"

submission_csv['combined_cls'] = None

for i in tqdm(range(len(submission_csv))):
    
  # Cropped Toy
  img = cv2.imread(image_path + f"/{submission_csv['Image_ID'][i]}.png") 
  img_h, img_w = img.shape[:2]

  bb = submission_csv.iloc[i]
  
  tl = (int(bb["ymin"] * img_w), int(bb["xmin"] * img_h))
  br = (int(bb["ymax"] * img_w), int(bb["xmax"] * img_h))

  cropped_img = img[tl[1]:br[1], tl[0]:br[0]]


  resnet_cropped_img = resnet_processor(cropped_img).unsqueeze(0).to(device)

  vit_cropped_img = Image.fromarray(BGR2RGB()(cropped_img)) 
  vit_cropped_img = vit_processor(vit_cropped_img, return_tensors = "pt")['pixel_values'][0].unsqueeze(0).to(device)

  senet_cropped_img = Image.fromarray(BGR2RGB()(cropped_img)) 
  senet_cropped_img = senet_processor(senet_cropped_img).unsqueeze(0).to(device)

  # Suspect
  resnet_suspect_img = cv2.imread(suspect_path + f"/{submission_csv['Image_ID'][i]}.png") 
  resnet_suspect_img = resnet_processor(resnet_suspect_img).unsqueeze(0).to(device)


  vit_suspect_img = Image.open(suspect_path + f"/{submission_csv['Image_ID'][i]}.png").convert('RGB')
  vit_suspect_img = vit_processor(vit_suspect_img, return_tensors = "pt")['pixel_values'][0].unsqueeze(0).to(device)

  senet_suspect_img = Image.open(suspect_path + f"/{submission_csv['Image_ID'][i]}.png").convert('RGB')
  senet_suspect_img = senet_processor(senet_suspect_img).unsqueeze(0).to(device)  

  with torch.no_grad():

      resnet_output_1 = resnet_model(resnet_cropped_img)['pooler_output'].reshape(1,-1)
      resnet_output_2 = resnet_model(resnet_suspect_img)['pooler_output'].reshape(1,-1)

      vit_output_1 = vit_model(vit_cropped_img)['pooler_output'].reshape(1, -1)
      vit_output_2 = vit_model(vit_suspect_img)['pooler_output'].reshape(1, -1)
      
      senet_output_1 = senet_model(senet_cropped_img).reshape(1,-1)
      senet_output_2 = senet_model(senet_suspect_img).reshape(1,-1)

      output_1 = torch.concat([resnet_output_1, senet_output_1, vit_output_1], dim = -1)
      output_2 = torch.concat([resnet_output_2, senet_output_2, vit_output_2], dim = -1)
      
      combined_output = combine_function(output_1, output_2)
      
      output= classifier(combined_output)
      y_pred = nn.Sigmoid()(output)
      submission_csv.loc[i, "combined_cls"] = y_pred.item()
  
    

# **Simple Ensemble**

In [None]:
for i in range(len(submission_csv)):
    if (submission_csv["resnet_cls"][i] 
        + submission_csv["senet_cls"][i] 
        + submission_csv["vit_cls"][i]
        + 3 * submission_csv["combined_cls"][i]
       )/6 > 0.5:
        submission_csv.loc[i, "class"] = 1
    else:
        submission_csv.loc[i, "class"] = 0

In [None]:
submission_csv = submission_yolo8x.drop(columns = ["resnet_cls", "senet_cls", "vit_cls", "combined_cls"])
submission_csv.to_csv(ROOT + "/submission_combined2_311.csv", index = False)