#### Install necessary packages

In [None]:
# Install Hugging Face Transformers, which provides the DINOv2 model and image processor imported further down.
# NOTE(review): the version is not pinned; the run recorded below resolved to transformers 4.34.0 — consider pinning it for reproducibility.
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
Coll

In [None]:
from google.colab import drive
# Mount Google Drive so the archives and CSV files under /content/gdrive/MyDrive can be copied into the runtime.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# %%capture
# !unzip /content/gdrive/MyDrive/archive.zip -d /content/training-dataset/

In [None]:
# %%capture
# !unzip /content/gdrive/MyDrive/test-archive.zip -d /content/testing-dataset

In [None]:
# !mv "/content/testing-dataset/development_test_data/gallery" "/content/"
# !mv "/content/testing-dataset/development_test_data/queries" "/content/"

# !mkdir "/content/original_datasets"
# !mv "/content/testing-dataset/development_test_data/gallery.csv" "/content/original_datasets/"
# !mv "/content/testing-dataset/development_test_data/queries.csv" "/content/original_datasets/"

# !rm -rf "/content/testing-dataset"

# !mv "/content/training-dataset/train.csv" "/content/original_datasets/"
# !mv "/content/training-dataset/test.csv" "/content/original_datasets/"

# !mkdir "/content/train_imgs"

In [None]:
# import os
# import shutil

# train_images = [f for f in os.listdir('/content/training-dataset/train/train')]
# for image in train_images:
#   shutil.move('/content/training-dataset/train/train/' + image, '/content/train_imgs/' + image)

# test_images = [f for f in os.listdir('/content/training-dataset/test/test')]
# for image in test_images:
#   shutil.move('/content/training-dataset/test/test/' + image, '/content/train_imgs/' + image)

In [None]:
# !rm -rf "/content/training-dataset"

In [None]:
!mkdir "/content/generated_datasets"
!cp '/content/gdrive/MyDrive/train_dataset.csv' '/content/generated_datasets/'
!cp '/content/gdrive/MyDrive/train_tensors.csv' '/content/generated_datasets/'

In [None]:
%%capture
# Extract the saved gallery embeddings from Drive, then move the extracted folder to /content/.
# NOTE(review): 'content/gallery_tensors' is a relative path, so this presumably expects the working directory to be /content — confirm.
!unzip "/content/gdrive/MyDrive/gallery_tensors.zip"
!mv 'content/gallery_tensors' '/content/'

In [None]:
%%capture
# Extract the saved (uncropped) query embeddings from Drive, then move the extracted folder to /content/.
# NOTE(review): 'content/query_tensors' is a relative path, so this presumably expects the working directory to be /content — confirm.
!unzip "/content/gdrive/MyDrive/query_tensors.zip"
!mv 'content/query_tensors' '/content/'

In [None]:
%%capture
# Extract the saved bounding-box-cropped query embeddings from Drive, then move the extracted folder to /content/.
# NOTE(review): 'content/query_tensors_cropped' is a relative path, so this presumably expects the working directory to be /content — confirm.
!unzip "/content/gdrive/MyDrive/query_tensors_cropped.zip"
!mv 'content/query_tensors_cropped' '/content/'

In [None]:
# Remove the relative `content/` directory that the archives above were extracted into.
!rm -rf "content"

In [None]:
%%capture
# Extract the saved training-image embeddings from Drive into the current working directory.
!unzip "/content/gdrive/MyDrive/train_imgs_tensors.zip"

#### Imports

#### Some useful commands

In [None]:
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from transformers import AutoImageProcessor, Dinov2Model
import os
from torch.utils.data import Dataset
from torchvision.io import read_image
import torchvision.transforms as T
import torchvision.transforms.functional as F
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn

In [None]:
# use this command to see the available RAM space
!free -g

               total        used        free      shared  buff/cache   available
Mem:              12           1           1           0           9          10
Swap:              0           0           0


In [None]:
# clear the GPU memory cache and free-up GPU memory
torch.cuda.empty_cache()

## The file and folder structure of data for model building
```
File and folder structure of Product-10K dataset

content
└───gallery
|   |   ambitious-tough-teal-from-asgard.jpg
|   |   fine-shrewd-oarfish-of-genius.jpg
|   |   bold-nickel-gecko-of-reputation.jpg
|   |   marvellous-uber-boobook-of-lightning.jpg
|   |   bouncy-economic-agama-of-honeydew.jpg
│   
└───queries
|   |   magnetic-powerful-platypus-of-hail.jpeg
|   |   free-keen-mole-of-cookies.jpeg
|   |   optimal-uptight-ringtail-of-cleaning.jpeg
|   |   bold-nickel-gecko-of-reputation.jpg
|   |   marvellous-uber-boobook-of-lightning.jpg
|
└───train_imgs
|   |   1.jpg
|   |   10.jpg
|   |   100.jpg
|   |   1000.jpg
|
└───original_datasets
|   |   gallery.csv
|   |   queries.csv
|   |   train.csv
|   |   test.csv
|
└───generated_datasets
|   |   train_dataset.csv
|
```

In [None]:
# Load the DINOv2 "giant" checkpoint together with its image processor.
# The processor is configured to resize (shortest edge 384), center-crop to 384x384,
# rescale and normalize the images it receives.
model_ckpt = "facebook/dinov2-giant"
image_processor = AutoImageProcessor.from_pretrained(model_ckpt,
                                                     do_normalize=True,
                                                     do_center_crop=True,
                                                     do_rescale=True,
                                                     do_resize=True,
                                                     size={'shortest_edge': 384},
                                                     crop_size={'height':384, 'width':384})
model = Dinov2Model.from_pretrained(model_ckpt)
# Hidden size of the backbone, read from the model configuration.
hidden_dim = model.config.hidden_size

Downloading (…)rocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

In [None]:
# Display the module tree of the loaded DINOv2 backbone.
model

Dinov2Model(
  (embeddings): Dinov2Embeddings(
    (patch_embeddings): Dinov2PatchEmbeddings(
      (projection): Conv2d(3, 1536, kernel_size=(14, 14), stride=(14, 14))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): Dinov2Encoder(
    (layer): ModuleList(
      (0-39): 40 x Dinov2Layer(
        (norm1): LayerNorm((1536,), eps=1e-06, elementwise_affine=True)
        (attention): Dinov2Attention(
          (attention): Dinov2SelfAttention(
            (query): Linear(in_features=1536, out_features=1536, bias=True)
            (key): Linear(in_features=1536, out_features=1536, bias=True)
            (value): Linear(in_features=1536, out_features=1536, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): Dinov2SelfOutput(
            (dense): Linear(in_features=1536, out_features=1536, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (layer_scale1): Dinov2LayerScale()
        (drop_

In [None]:
# Display the effective image-processor configuration (resize/crop sizes, normalization statistics).
image_processor

BitImageProcessor {
  "crop_size": {
    "height": 384,
    "width": 384
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "BitImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 384
  }
}

In [None]:
# Move the backbone onto the GPU; this cell needs a CUDA-enabled runtime.
model = model.to('cuda')

In [None]:
class CustomDataset(Dataset):
    """Image dataset driven by an annotations CSV that is read by column position.

    ``indices`` lists the CSV column positions to use: ``indices[0]`` is the
    image path (joined onto ``img_dir``) and ``indices[1]`` the label. When
    ``crop_to_bbox`` is true, ``indices[2]`` .. ``indices[5]`` are read as
    ``x, y, w, h`` and handed to ``transform`` together with the image.

    Each item is the tuple ``(image, label, img_path)``.
    """

    def __init__(self, annotations_file, img_dir, indices, transform=None, target_transform=None, crop_to_bbox=False):
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.indices = indices
        self.transform = transform
        self.target_transform = target_transform
        self.crop_to_bbox = crop_to_bbox

    def __len__(self):
        """Number of rows in the annotations CSV."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Read the image of row ``idx`` and apply the configured transforms."""

        def column(position):
            # Value of row `idx` in the CSV column selected by indices[position]
            return self.img_labels.iloc[idx, self.indices[position]]

        img_path = os.path.join(self.img_dir, column(0))
        image = read_image(img_path)
        label = column(1)

        if self.transform:
            if self.crop_to_bbox:
                # The bounding box columns are only read when a crop is requested
                x = column(2)
                y = column(3)
                w = column(4)
                h = column(5)
                image = self.transform(image, x, y, w, h)
            else:
                image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label, img_path

def image_transformation(image):
    """Resize ``image`` to 384x384 and pass it through the notebook-level ``image_processor``.

    Returns the processor output requested with ``return_tensors="pt"``.
    """
    resized = F.resize(image, (384, 384), antialias=True)
    return image_processor(resized, return_tensors="pt")

def image_transform_with_bbox_crop(image, x, y, w, h):
    """Crop ``image`` to a bounding box, resize to 384x384 and run the notebook-level ``image_processor``.

    ``F.crop`` receives the box as ``(y, x, h, w)``, i.e. ``y``/``x`` are used as
    the top/left corner and ``h``/``w`` as the height/width of the crop.

    Returns the processor output requested with ``return_tensors="pt"``.
    """
    cropped = F.crop(image, y, x, h, w)
    resized = F.resize(cropped, (384, 384), antialias=True)
    return image_processor(resized, return_tensors="pt")

In [None]:
# gallery = pd.read_csv("original_datasets/gallery.csv")
# gallery.head()

In [None]:
# test image preprocessing
# proc = AutoImageProcessor.from_pretrained(model_ckpt,
#                                           do_normalize=True,
#                                           do_center_crop=True,
#                                           do_rescale=True,
#                                           do_resize=True,
#                                           size={'shortest_edge': 384},
#                                           crop_size={'height':384, 'width':384})
# img = read_image(gallery.iloc[0, 1])
# img_ = F.to_pil_image(F.resize(img, (384, 384), antialias=True))
# plt.imshow(np.asarray(img_))

In [None]:
# proc_img = proc(img_, return_tensors="pt")
# proc_img = torch.squeeze(proc_img['pixel_values'], dim=0)
# print(proc_img.shape)
# proc_img = F.to_pil_image(proc_img)
# plt.imshow(np.asarray(proc_img))

### Generate embeddings for gallery images and save

In [None]:
# gallery_dataset = CustomDataset("original_datasets/gallery.csv", "", [1, 2], transform=image_transformation)
# gallery_dataloader = DataLoader(gallery_dataset, batch_size=32, shuffle=True)

# df = pd.DataFrame(columns=['tensor_path', 'label', 'image_path'])

# if not os.path.exists("gallery_tensors"):
#     os.makedirs("gallery_tensors")

# for idx, data in enumerate(tqdm(gallery_dataloader)):
#     images, labels, img_paths = data
#     images = images['pixel_values'].to('cuda')
#     images = torch.squeeze(images, dim=1)

#     with torch.no_grad():
#         output = model(images)
#         embeddings = output.last_hidden_state[:, 0].cpu()

#     for i in range(embeddings.shape[0]):
#         file_path = f'gallery_tensors/{idx}_{i}.pt'
#         torch.save(embeddings[i], file_path)
#         df.loc[len(df)] = [file_path, labels[i].item(), img_paths[i]]

# df.reset_index(inplace = True)
# df.to_csv('gallery_tensors/gallery_tensors.csv', index=False)

In [None]:
class TensorDataset(Dataset):
    """Dataset of embeddings that were previously written to disk with ``torch.save``.

    The annotations CSV is read by column position: column 1 is the tensor
    file path (joined onto ``tensor_dir``), column 2 the label and column 3
    the path of the source image.

    Each item is the tuple ``(tensor, label, image_path)``.
    """

    def __init__(self, annotations_file, tensor_dir, transform=None, target_transform=None):
        self.tensor_labels = pd.read_csv(annotations_file)
        self.tensor_dir = tensor_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of rows in the annotations CSV."""
        return len(self.tensor_labels)

    def __getitem__(self, idx):
        """Load the embedding of row ``idx`` and apply the optional transforms."""
        table = self.tensor_labels
        tensor = torch.load(os.path.join(self.tensor_dir, table.iloc[idx, 1]))
        label, image_path = table.iloc[idx, 2], table.iloc[idx, 3]
        if self.transform:
            tensor = self.transform(tensor)
        if self.target_transform:
            label = self.target_transform(label)
        return tensor, label, image_path

In [None]:
# Peek at the index CSV that maps each saved gallery embedding file to its label and source image.
gallery_tensors = pd.read_csv("gallery_tensors/gallery_tensors.csv")
gallery_tensors.head()

Unnamed: 0,index,tensor_path,label,image_path
0,0,gallery_tensors/0_0.pt,3590,gallery/cautious-adamant-termite-of-diversity.jpg
1,1,gallery_tensors/0_1.pt,5250,gallery/fresh-crafty-aardwolf-of-devotion.jpg
2,2,gallery_tensors/0_2.pt,5061,gallery/determined-light-marmot-of-examination...
3,3,gallery_tensors/0_3.pt,5225,gallery/jumping-flat-vole-from-wonderland.jpg
4,4,gallery_tensors/0_4.pt,5006,gallery/logical-didactic-skylark-from-hyperbor...


In [None]:
# Read every saved gallery embedding back from disk in batches of 32.
# NOTE(review): shuffle=True only changes the order in which gallery items end up in the dictionaries below.
tensor_dataset = TensorDataset("gallery_tensors/gallery_tensors.csv", "")
tensor_dataloader = DataLoader(tensor_dataset, batch_size=32, shuffle=True)

# Batch index -> collated embeddings / labels / image paths of that batch.
# tensor_dict and label_dict are read as globals by zero_shot_model and validate.
tensor_dict = {}
label_dict = {}
image_path_dict = {}

for idx, data in enumerate(tqdm(tensor_dataloader)):
    tensors, labels, image_paths = data
    tensor_dict[idx] = tensors
    label_dict[idx] = labels
    image_path_dict[idx] = image_paths

100%|██████████| 34/34 [00:03<00:00, 10.30it/s]


In [None]:
# query = pd.read_csv("original_datasets/queries.csv")
# query.head()

### Generate embeddings for query images (without cropping to bounding box) and save

In [None]:
# query_dataset = CustomDataset("original_datasets/queries.csv", "", [1, 6], transform=image_transformation)
# query_dataloader = DataLoader(query_dataset, batch_size=32, shuffle=False)

# df = pd.DataFrame(columns=['tensor_path', 'label', 'image_path'])

# if not os.path.exists("query_tensors"):
#     os.makedirs("query_tensors")

# for idx, data in enumerate(tqdm(query_dataloader)):
#     images, labels, img_paths = data
#     images = images['pixel_values'].to('cuda')
#     images = torch.squeeze(images, dim=1)

#     with torch.no_grad():
#         output = model(images)
#         embeddings = output.last_hidden_state[:, 0].cpu()

#     for i in range(embeddings.shape[0]):
#         file_path = f'query_tensors/{idx}_{i}.pt'
#         torch.save(embeddings[i], file_path)
#         df.loc[len(df)] = [file_path, labels[i].item(), img_paths[i]]

# df.reset_index(inplace = True)
# df.to_csv('query_tensors/query_tensors.csv', index=False)

In [None]:
# Peek at the index CSV that maps each saved (uncropped) query embedding file to its label and source image.
query_tensors = pd.read_csv("query_tensors/query_tensors.csv")
query_tensors.head()

Unnamed: 0,index,tensor_path,label,image_path
0,0,query_tensors/0_0.pt,5013,queries/magnetic-powerful-platypus-of-hail.jpeg
1,1,query_tensors/0_1.pt,3550,queries/free-keen-mole-of-cookies.jpeg
2,2,query_tensors/0_2.pt,71,queries/hilarious-precious-parakeet-of-adverti...
3,3,query_tensors/0_3.pt,3566,queries/optimal-uptight-ringtail-of-cleaning.jpeg
4,4,query_tensors/0_4.pt,5329,queries/nonchalant-impala-of-fabulous-artistry...


### Generate embeddings for query images (with cropping to bounding box) and save

In [None]:
# query_dataset = CustomDataset("original_datasets/queries.csv", "", [1, 6, 2, 3, 4, 5], transform=image_transform_with_bbox_crop, crop_to_bbox=True)
# query_dataloader = DataLoader(query_dataset, batch_size=32, shuffle=False)

# df = pd.DataFrame(columns=['tensor_path', 'label', 'image_path'])

# if not os.path.exists("query_tensors_cropped"):
#     os.makedirs("query_tensors_cropped")

# for idx, data in enumerate(tqdm(query_dataloader)):
#     images, labels, img_paths = data
#     images = images['pixel_values'].to('cuda')
#     images = torch.squeeze(images, dim=1)

#     with torch.no_grad():
#         output = model(images)
#         embeddings = output.last_hidden_state[:, 0].cpu()

#     for i in range(embeddings.shape[0]):
#         file_path = f'query_tensors_cropped/{idx}_{i}.pt'
#         torch.save(embeddings[i], file_path)
#         df.loc[len(df)] = [file_path, labels[i].item(), img_paths[i]]

# df.reset_index(inplace = True)
# df.to_csv('query_tensors_cropped/query_tensors.csv', index=False)

In [None]:
# Peek at the index CSV that maps each saved bounding-box-cropped query embedding file to its label and source image.
cropped_query_tensors = pd.read_csv("query_tensors_cropped/query_tensors.csv")
cropped_query_tensors.head()

Unnamed: 0,index,tensor_path,label,image_path
0,0,query_tensors_cropped/0_0.pt,5013,queries/magnetic-powerful-platypus-of-hail.jpeg
1,1,query_tensors_cropped/0_1.pt,3550,queries/free-keen-mole-of-cookies.jpeg
2,2,query_tensors_cropped/0_2.pt,71,queries/hilarious-precious-parakeet-of-adverti...
3,3,query_tensors_cropped/0_3.pt,3566,queries/optimal-uptight-ringtail-of-cleaning.jpeg
4,4,query_tensors_cropped/0_4.pt,5329,queries/nonchalant-impala-of-fabulous-artistry...


### Load the saved query image embedding tensors (non-cropped)

In [None]:
# Read the saved (uncropped) query embeddings one at a time, in CSV order (batch_size=1, shuffle=False).
query_tensor_dataset = TensorDataset("query_tensors/query_tensors.csv", "")
query_tensor_dataloader = DataLoader(query_tensor_dataset, batch_size=1, shuffle=False)

# Query index -> collated single-item batch of embedding / label / image path.
query_tensor_dict = {}
query_label_dict = {}
query_image_path_dict = {}

for idx, data in enumerate(tqdm(query_tensor_dataloader)):
    tensors, labels, image_paths = data
    query_tensor_dict[idx] = tensors
    query_label_dict[idx] = labels
    query_image_path_dict[idx] = image_paths

100%|██████████| 1935/1935 [00:06<00:00, 313.48it/s]


### Load the saved query image embedding tensors (cropped)

In [None]:
# Read the saved bounding-box-cropped query embeddings one at a time, in CSV order (batch_size=1, shuffle=False).
crop_query_tensor_dataset = TensorDataset("query_tensors_cropped/query_tensors.csv", "")
crop_query_tensor_dataloader = DataLoader(crop_query_tensor_dataset, batch_size=1, shuffle=False)

# Query index -> collated single-item batch of embedding / label / image path.
crop_query_tensor_dict = {}
crop_query_label_dict = {}
crop_query_image_path_dict = {}

for idx, data in enumerate(tqdm(crop_query_tensor_dataloader)):
    tensors, labels, image_paths = data
    crop_query_tensor_dict[idx] = tensors
    crop_query_label_dict[idx] = labels
    crop_query_image_path_dict[idx] = image_paths

100%|██████████| 1935/1935 [00:06<00:00, 303.29it/s]


In [None]:
# Gallery batch size; same value as the batch_size passed to the gallery DataLoader.
batch_size = 32

def AP(actual_label, predicted_labels):
    """Average precision of one ranked list of labels for a single query.

    Args:
        actual_label: the label of the query.
        predicted_labels: list of gallery labels ordered from best to worst match.

    Returns:
        The mean of precision@k taken at every rank k where the label matches.
        NOTE(review): when ``actual_label`` does not occur in
        ``predicted_labels`` at all, 1 is returned (a perfect score) — confirm
        this is intended, because such queries raise the reported mAP.
    """
    total_relevant = predicted_labels.count(actual_label)
    if total_relevant < 1:
        return 1

    precision_sum = 0
    hits = 0
    for rank, predicted in enumerate(predicted_labels, start=1):
        if predicted == actual_label:
            hits += 1
            # precision at this rank = matches so far / items inspected so far
            precision_sum += hits / rank
    return precision_sum / total_relevant

def mAP(APs):
    """Mean average precision: the unweighted mean of the per-query AP values."""
    scores = np.asanyarray(APs)
    return scores.mean()

def zero_shot_model(query_tensor_dictionary, query_label_dictionary, device=None):
    """Evaluate retrieval with the raw embeddings (no training) and print the mAP.

    Every query embedding is compared against all gallery embeddings stored in
    the notebook-level ``tensor_dict`` / ``label_dict`` with three measures:
    Euclidean distance, Manhattan distance and cosine similarity. For each
    measure the gallery labels are ranked, scored with ``AP`` and averaged
    with ``mAP``.

    The gallery batches are stacked and moved to the device once, instead of
    once per query, and the query is broadcast to the real gallery size, so
    the function no longer depends on a global batch size or on the short
    batch being the last one.

    Args:
        query_tensor_dictionary: mapping ``0..n-1`` -> query embedding; each
            value must hold exactly one embedding (e.g. shape ``(1, D)``).
        query_label_dictionary: mapping ``0..n-1`` -> one-element label tensor.
        device: device used for the comparison. Defaults to CUDA when it is
            available and to the CPU otherwise.

    Returns:
        None. The three mAP values are printed.

    Raises:
        ValueError: if the gallery (``tensor_dict``) is empty.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if len(tensor_dict) == 0:
        raise ValueError("tensor_dict is empty: load the gallery embeddings before evaluating")

    # The gallery is the same for every query: build it once, in batch order.
    num_batches = len(tensor_dict)
    gallery_tensors = torch.cat([tensor_dict[k] for k in range(num_batches)], dim=0).to(device)
    gallery_labels = torch.cat([label_dict[k] for k in range(num_batches)], dim=0).to(device)

    APs_euclidean = []
    APs_manhattan = []
    APs_cosine = []
    for i in tqdm(range(len(query_tensor_dictionary))):
        # expand_as creates a broadcast view with one row per gallery item (no copy)
        query_tensor = query_tensor_dictionary[i].to(device).reshape(1, -1).expand_as(gallery_tensors)
        act_label = query_label_dictionary[i].item()

        euclidean = torch.nn.functional.pairwise_distance(gallery_tensors, query_tensor, p=2)
        manhattan = torch.nn.functional.pairwise_distance(gallery_tensors, query_tensor, p=1)
        cosine = torch.nn.functional.cosine_similarity(gallery_tensors, query_tensor)

        # Distances rank ascending (smaller = closer); cosine similarity ranks descending.
        # NOTE(review): ranking by |cosine| treats strongly negative similarity as a
        # match; kept as in the original so the reported numbers stay comparable.
        _, euclidean_indices = torch.sort(euclidean)
        _, manhattan_indices = torch.sort(manhattan)
        _, cosine_indices = torch.sort(torch.abs(cosine), descending=True)

        for ranking, bucket in ((euclidean_indices, APs_euclidean),
                                (manhattan_indices, APs_manhattan),
                                (cosine_indices, APs_cosine)):
            ranked_labels = gallery_labels[ranking].cpu().numpy().astype(int).tolist()
            bucket.append(AP(act_label, ranked_labels))

    print('\nEuclidean distance mAP:', mAP(APs_euclidean))
    print('Manhattan distance mAP:', mAP(APs_manhattan))
    print('Cosine distance mAP:', mAP(APs_cosine))

In [None]:
# Zero-shot retrieval using the embeddings of the full (uncropped) query images.
zero_shot_model(query_tensor_dict, query_label_dict)

100%|██████████| 1935/1935 [00:35<00:00, 54.88it/s]


Euclidean distance mAP: 0.17185104739425894
Manhattan distance mAP: 0.17276926946274335
Cosine distance mAP: 0.17374990262707413





In [None]:
# Zero-shot retrieval using the embeddings of the query images cropped to their bounding box.
zero_shot_model(crop_query_tensor_dict, crop_query_label_dict)

100%|██████████| 1935/1935 [00:32<00:00, 59.81it/s]


Euclidean distance mAP: 0.18045948404926768
Manhattan distance mAP: 0.18160515003059668
Cosine distance mAP: 0.18212984864232812





### Save the embeddings as zip archives for future use

In [None]:
# %%capture
# !zip -r gallery_tensors.zip '/content/gallery_tensors/'

In [None]:
# %%capture
# !zip -r query_tensors.zip '/content/query_tensors/'

In [None]:
# %%capture
# !zip -r query_tensors_cropped.zip '/content/query_tensors_cropped/'

In [None]:
# !cp '/content/gallery_tensors.zip' '/content/gdrive/MyDrive/gallery_tensors.zip'
# !cp '/content/query_tensors.zip' '/content/gdrive/MyDrive/query_tensors.zip'
# !cp '/content/query_tensors_cropped.zip' '/content/gdrive/MyDrive/query_tensors_cropped.zip'

# Now let's train a small model on top of the saved embeddings: a single linear layer applied to each image of a pair

In [None]:
# train_dataset = pd.read_csv('/content/generated_datasets/train_dataset.csv', low_memory=False)
# print(train_dataset.shape)
# display(train_dataset.head())

In [None]:
# list_of_images = train_dataset['img1'].unique().tolist() + train_dataset['img2'].unique().tolist()
# unique_images_list = list(set(list_of_images))
# print("Count of unique images:", len(unique_images_list))
# print(unique_images_list[0:10])

In [None]:
# import os
# from torch.utils.data import Dataset
# from torchvision.io import read_image
# import torchvision.transforms as T
# import torchvision.transforms.functional as F

# class CustomTrainDataset(Dataset):
#     def __init__(self, image_paths, img_dir, transform=None):
#         self.img_paths = image_paths
#         self.img_dir = img_dir
#         self.transform = transform

#     def __len__(self):
#         return len(self.img_paths)

#     def __getitem__(self, idx):
#         img_path = os.path.join(self.img_dir, self.img_paths[idx])
#         image = read_image(img_path)
#         if self.transform:
#             image = self.transform(image)
#         return image, self.img_paths[idx], img_path

# def image_transformation(image):
#     images = F.resize(image, (384, 384), antialias=True)
#     images = image_processor(images, return_tensors="pt")
#     return images

# from tqdm import tqdm
# from torch.utils.data import DataLoader

# train_image_dataset = CustomTrainDataset(unique_images_list, "/content/train_imgs", transform=image_transformation)
# train_image_dataloader = DataLoader(train_image_dataset, batch_size=32, shuffle=True)

# df = pd.DataFrame(columns=['tensor_path', 'img_name', 'image_path'])

# if not os.path.exists("train_imgs_tensors"):
#     os.makedirs("train_imgs_tensors")

# for idx, data in enumerate(tqdm(train_image_dataloader)):
#     images, img_names, img_paths = data
#     images = images['pixel_values'].to('cuda')
#     images = torch.squeeze(images, dim=1)

#     with torch.no_grad():
#         output = model(images)
#         embeddings = output.last_hidden_state[:, 0].cpu()

#     for i in range(embeddings.shape[0]):
#         file_path = f'train_imgs_tensors/{idx}_{i}.pt'
#         torch.save(embeddings[i], file_path)
#         df.loc[len(df)] = [file_path, img_names[i], img_paths[i]]

# df.reset_index(inplace = True)
# df.to_csv('train_imgs_tensors/train_imgs_tensors.csv', index=False)

In [None]:
# df = train_dataset.copy()
# df['tensor1'] = np.nan
# df['tensor2'] = np.nan

# train_imgs_tensors_dataset = pd.read_csv('train_imgs_tensors/train_imgs_tensors.csv', low_memory=False)

# for index, row in tqdm(train_imgs_tensors_dataset.iterrows()):
#   df.loc[df['img1'] == row['img_name'], 'tensor1'] = row['tensor_path']
#   df.loc[df['img2'] == row['img_name'], 'tensor2'] = row['tensor_path']

# list_of_tensors = df['tensor1'].unique().tolist() + df['tensor2'].unique().tolist()
# unique_tensor_list = list(set(list_of_tensors))
# print("Count of unique images:", len(unique_tensor_list))
# print(unique_tensor_list[0:10])

# null_count = df.isnull().sum().sum()
# print('Number of null values:', null_count)

# df.reset_index(inplace = True)
# df.to_csv('train_imgs_tensors/train_tensors.csv', index=False)

In [None]:
# train_tensors_dataset_ = pd.read_csv('train_imgs_tensors/train_tensors.csv', low_memory=False)
# train_tensors_dataset_.reset_index(drop=True, inplace = True)
# train_tensors_dataset_.drop(['index', 'Unnamed: 0'], inplace=True, axis=1)
# train_tensors_dataset_.to_csv('generated_datasets/train_tensors.csv', index=False)

In [None]:
# Load the pair-level training table: one row per image pair with its label and the paths of both saved embeddings.
train_tensors_dataset_ = pd.read_csv('generated_datasets/train_tensors.csv', low_memory=False)
train_tensors_dataset_.head()

Unnamed: 0,img1,img2,label,tensor1,tensor2
0,14346.jpg,7211589.jpg,0,train_imgs_tensors/3815_25.pt,train_imgs_tensors/3443_0.pt
1,77676.jpg,1671857.jpg,0,train_imgs_tensors/3400_25.pt,train_imgs_tensors/840_20.pt
2,53031.jpg,53037.jpg,1,train_imgs_tensors/3108_26.pt,train_imgs_tensors/1917_28.pt
3,84577.jpg,84579.jpg,1,train_imgs_tensors/28_17.pt,train_imgs_tensors/689_31.pt
4,5622713.jpg,8209261.jpg,0,train_imgs_tensors/2561_11.pt,train_imgs_tensors/2549_12.pt


In [None]:
# !cp '/content/generated_datasets/train_tensors.csv' '/content/gdrive/MyDrive/train_tensors.csv'

In [None]:
class TensorTrainDataset(Dataset):
    """Dataset of embedding pairs used to train the distance model.

    The annotations CSV is read by column position: columns 3 and 4 hold the
    paths of the two saved embeddings (joined onto ``tensor_dir``) and column
    2 holds the pair label.

    Each item is the tuple ``(tensor1, tensor2, label)``.
    """

    def __init__(self, annotations_file, tensor_dir, transform=None, target_transform=None):
        self.tensor_labels = pd.read_csv(annotations_file, low_memory=False)
        self.tensor_dir = tensor_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of pairs listed in the annotations CSV."""
        return len(self.tensor_labels)

    def __getitem__(self, idx):
        """Load both embeddings of pair ``idx`` and apply the optional transforms."""
        table = self.tensor_labels
        first_path = os.path.join(self.tensor_dir, table.iloc[idx, 3])
        second_path = os.path.join(self.tensor_dir, table.iloc[idx, 4])
        tensor1, tensor2 = torch.load(first_path), torch.load(second_path)
        label = table.iloc[idx, 2]
        if self.transform:
            # The same transform is applied to both members of the pair
            tensor1 = self.transform(tensor1)
            tensor2 = self.transform(tensor2)
        if self.target_transform:
            label = self.target_transform(label)
        return tensor1, tensor2, label

In [None]:
class ImageDistanceModel(nn.Module):
    """Scores a pair of embeddings with two learned linear projections.

    ``linear1`` projects the first embedding and ``linear2`` the second; each
    projection is compared with the *other*, unprojected embedding by cosine
    similarity, and the two absolute similarities are averaged.

    Both layers start as the identity (identity weight, zero bias), so an
    untrained model returns the absolute cosine similarity of its inputs.

    Args:
        embedding_dim: size of the input embeddings. Defaults to 1536, the
            value that was previously hard-coded.
    """

    def __init__(self, embedding_dim=1536):
        super().__init__()
        self.linear1 = nn.Linear(embedding_dim, embedding_dim)
        self.linear2 = nn.Linear(embedding_dim, embedding_dim)
        # Identity initialisation: each projection initially returns its input unchanged
        for layer in (self.linear1, self.linear2):
            nn.init.eye_(layer.weight)
            nn.init.zeros_(layer.bias)
        self.dist1 = nn.CosineSimilarity()
        self.dist2 = nn.CosineSimilarity()

    def forward(self, tensor1, tensor2):
        """Return one score per pair.

        Despite the variable name used by callers, the result is a similarity:
        it is an average of absolute cosine similarities, so larger values
        mean the two embeddings are closer.
        """
        emb1 = self.linear1(tensor1)
        emb2 = self.linear2(tensor2)
        score1 = torch.abs(self.dist1(emb1, tensor2))
        score2 = torch.abs(self.dist2(emb2, tensor1))
        distance = torch.div(torch.add(score1, score2), 2)
        return distance

In [None]:
class CosineContrastiveLoss(nn.Module):
    """Contrastive loss computed from cosine-similarity scores.

    The incoming score is first turned into a distance (``1 - score``).
    Pairs labelled 1 (similar) are penalised by that distance; pairs labelled
    0 (dissimilar) are penalised by how far the distance falls short of
    ``margin``. The result is the mean over all pairs.
    """

    def __init__(self):
        super().__init__()

    def forward(self, dist, label, margin: float = 1.0):
        # label = 1 -> similar pair, label = 0 -> dissimilar pair
        # A similarity of 1 means "close", so flip it into a distance first
        gap = 1 - dist
        similar_term = label * gap
        dissimilar_term = (1 - label) * torch.clamp(margin - gap, min=0.0)
        return (similar_term + dissimilar_term).mean()

In [None]:
# Build the trainable model and move it to the GPU.
ll_model = ImageDistanceModel()
ll_model = ll_model.to('cuda')

# Pairs of precomputed embeddings used for training.
# shuffle=True reshuffles the pairs every epoch; with shuffle=False the SGD updates
# would follow the CSV row order identically in every epoch.
train_tensor_dataset = TensorTrainDataset("generated_datasets/train_tensors.csv", "")
train_tensor_dataloader = DataLoader(train_tensor_dataset, batch_size=16, shuffle=True)

# Optimizers specified in the torch.optim package
optimizer = torch.optim.SGD(ll_model.parameters(), lr=0.01, momentum=0.9)

# Define loss function
loss_fn = CosineContrastiveLoss()

def train_one_epoch(model):
    """Run one optimisation pass over ``train_tensor_dataloader``.

    Relies on the notebook-level ``train_tensor_dataloader``, ``optimizer``
    and ``loss_fn``. Every batch is moved to CUDA before the forward pass.

    Args:
        model: the module to train; called as ``model(tensor1, tensor2)``.

    Returns:
        The mean of the per-batch loss values seen during the pass.
    """
    loss_total = 0
    batches_seen = 0

    for tensor1, tensor2, labels in tqdm(train_tensor_dataloader):
        # Each batch is (first embeddings, second embeddings, pair labels)
        tensor1 = tensor1.to('cuda')
        tensor2 = tensor2.to('cuda')
        labels = labels.to('cuda')

        # Gradients accumulate by default, so clear them before every step
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update
        batch_loss = loss_fn(model(tensor1, tensor2), labels)
        batch_loss.backward()
        optimizer.step()

        # Keep a running total to report the epoch's average loss
        loss_total += batch_loss.item()
        batches_seen += 1

    return loss_total / batches_seen

In [None]:
def validate(query_tensor_dictionary, query_label_dictionary, model, loss_fn, device=None):
    """Score ``model`` on the query set against the notebook-level gallery.

    For every query, the model scores the query against all gallery embeddings
    in ``tensor_dict`` / ``label_dict``. The contrastive loss is computed with
    targets "gallery label == query label", and the gallery is ranked by score
    to obtain the query's average precision.

    The returned loss is the mean over all queries. Previously the loss
    variable was overwritten in each iteration, so only the last query's loss
    was returned.

    Args:
        query_tensor_dictionary: mapping ``0..n-1`` -> query embedding; each
            value must hold exactly one embedding (e.g. shape ``(1, D)``).
        query_label_dictionary: mapping ``0..n-1`` -> one-element label tensor.
        model: callable as ``model(query_batch, gallery_batch)`` returning one
            score per row, where larger means more similar.
        loss_fn: callable as ``loss_fn(scores, targets)``.
        device: device used for the comparison. Defaults to CUDA when it is
            available and to the CPU otherwise.

    Returns:
        ``(mean_loss, mAP)`` where ``mean_loss`` is a detached 0-dim tensor.

    Raises:
        ValueError: if there are no queries or the gallery is empty.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    num_queries = len(query_tensor_dictionary)
    if num_queries == 0 or len(tensor_dict) == 0:
        raise ValueError("validate() needs at least one query and a non-empty gallery")

    # The gallery is the same for every query: stack it once, in batch order.
    num_batches = len(tensor_dict)
    gallery_tensors = torch.cat([tensor_dict[k] for k in range(num_batches)], dim=0).to(device)
    gallery_labels = torch.cat([label_dict[k] for k in range(num_batches)], dim=0).to(device)

    APs_cosine = []
    loss_sum = None
    for i in tqdm(range(num_queries)):
        # Broadcast view of the query with one row per gallery item (no copy)
        query_tensor = query_tensor_dictionary[i].to(device).reshape(1, -1).expand_as(gallery_tensors)
        act_label = query_label_dictionary[i].to(device)

        cosine = model(query_tensor, gallery_tensors)

        # calculate contrastive loss: target is 1 where the gallery item shares the query's label
        label = torch.eq(gallery_labels, act_label).int()
        loss = loss_fn(cosine, label).detach()
        loss_sum = loss if loss_sum is None else loss_sum + loss

        # calculate mAP value: rank the gallery from most to least similar
        _, cosine_indices = torch.sort(torch.abs(cosine), descending=True)
        pred_labels_cosine = gallery_labels[cosine_indices]
        APs_cosine.append(AP(act_label.item(), pred_labels_cosine.cpu().numpy().astype(int).tolist()))
    return loss_sum / num_queries, mAP(APs_cosine)

In [None]:
!mkdir "/content/saved_models"

mkdir: cannot create directory ‘/content/saved_models’: File exists


In [None]:
experiment_name = 'exp1'

epoch_number = 0
EPOCHS = 10

# Best validation loss seen so far, the epoch it occurred in and the weights at that point
best_vloss = 10000
best_vloss_epoch = 0
best_model_val_loss = None

# Best mAP seen so far, the epoch it occurred in and the weights at that point
best_mAP = 0
best_mAP_epoch = 0
best_model_mAP = None


def _snapshot_state_dict(module):
    """Return a copy of ``module.state_dict()`` whose tensors are detached clones.

    ``state_dict()`` returns references to the live parameters, so keeping it
    without cloning would make the "best" snapshot follow every later
    optimizer step and end up equal to the final weights.
    """
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


try:
    for epoch in range(EPOCHS):
        print('EPOCH {}: \nTraining...'.format(epoch_number + 1))

        # Make sure gradient tracking is on, and do a pass over the data
        ll_model.train(True)
        train_loss = train_one_epoch(ll_model)

        # Set the model to evaluation mode, disabling dropout and using population
        # statistics for batch normalization.
        ll_model.eval()

        # Disable gradient computation and reduce memory consumption.
        with torch.no_grad():
            print('Validating...')
            val_loss, mAP_score = validate(crop_query_tensor_dict, crop_query_label_dict, ll_model, loss_fn)

        print('LOSS train {} valid {} \nEVAL METRIC mAP: {}\n'.format(train_loss, val_loss, mAP_score))

        # Track best performance, and keep an independent copy of the model's state
        if val_loss < best_vloss:
            best_vloss = val_loss
            best_vloss_epoch = epoch_number + 1
            best_model_val_loss = _snapshot_state_dict(ll_model)

        if mAP_score > best_mAP:
            best_mAP = mAP_score
            best_mAP_epoch = epoch_number + 1
            best_model_mAP = _snapshot_state_dict(ll_model)

        epoch_number += 1
finally:
    # Write the best checkpoints even when training stops early (e.g. KeyboardInterrupt),
    # so completed epochs are not lost. Nothing is written if no epoch finished.
    if best_model_val_loss is not None:
        model_path = '/content/saved_models/model_with_best_val_loss_{}_{}'.format(experiment_name, best_vloss_epoch)
        torch.save(best_model_val_loss, model_path)

    if best_model_mAP is not None:
        model_path = '/content/saved_models/model_with_best_mAP_{}_{}'.format(experiment_name, best_mAP_epoch)
        torch.save(best_model_mAP, model_path)

EPOCH 1: 
Training...


100%|██████████| 12720/12720 [24:09<00:00,  8.77it/s]


Validating...


100%|██████████| 1935/1935 [00:52<00:00, 37.06it/s]


LOSS train 0.13323810024643845 valid 0.23063623905181885 
EVAL METRIC mAP: 0.17801866473841246

EPOCH 2: 
Training...


100%|██████████| 12720/12720 [24:07<00:00,  8.79it/s]


Validating...


100%|██████████| 1935/1935 [00:55<00:00, 34.62it/s]


LOSS train 0.12407697798719383 valid 0.23087017238140106 
EVAL METRIC mAP: 0.17739546583148247

EPOCH 3: 
Training...


100%|██████████| 12720/12720 [24:16<00:00,  8.73it/s]


Validating...


100%|██████████| 1935/1935 [00:55<00:00, 35.15it/s]


LOSS train 0.12125375998651597 valid 0.23293928802013397 
EVAL METRIC mAP: 0.180345153690817

EPOCH 4: 
Training...


100%|██████████| 12720/12720 [24:22<00:00,  8.70it/s]


Validating...


100%|██████████| 1935/1935 [00:55<00:00, 35.14it/s]


LOSS train 0.1196804084823871 valid 0.232998788356781 
EVAL METRIC mAP: 0.18151582636503663

EPOCH 5: 
Training...


 93%|█████████▎| 11885/12720 [22:44<01:35,  8.71it/s]


KeyboardInterrupt: ignored