In [1]:
# #Setup Gdrive file download extention
!pip install gdown open-clip-torch faiss-gpu

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Collecting open-clip-torch
  Downloading open_clip_torch-2.22.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-gpu, ftfy, gdown, open-clip-torch
Successfully installed faiss-gpu-1.7.2 ftfy-6.1.1 gdown-4.7.1 open-clip-torch-2.22.0


In [2]:
# !gdown --id 1BFAJfzzeaUGsPoYELS86HIutJ43D-vat

In [3]:
import os
import math

import numpy as np
 
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import open_clip

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torchvision.transforms as T

from tqdm import tqdm
import pandas as pd
import cv2
from PIL import Image
import time
import faiss
import copy
import argparse

In [4]:
def compute_precision_at_k(ranked_targets: np.ndarray,
                           k: int) -> float:

    """
    Computes the precision at k.
    Args:
        ranked_targets: A boolean array of retrieved targets, True if relevant and False otherwise.
        k: The number of examples to consider

    Returns: The precision at k
    """
    assert k >= 1
    assert ranked_targets.size >= k, ValueError('Relevance score length < k')
    return np.mean(ranked_targets[:k])

def compute_average_precision(ranked_targets: np.ndarray,
                              gtp: int) -> float:
    
        
    """
    Computes the average precision.
    Args:
        ranked_targets: A boolean array of retrieved targets, True if relevant and False otherwise.
        gtp: ground truth positives.

    Returns:
        The average precision.
    """
    assert gtp >= 1
    # compute precision at rank only for positive targets
    out = [compute_precision_at_k(ranked_targets, k + 1) for k in range(ranked_targets.size) if ranked_targets[k]]
    if len(out) == 0:
        # no relevant targets in top1000 results
        return 0.0
    else:
        return np.sum(out) / gtp


def calculate_map(ranked_retrieval_results: np.ndarray,
                  query_labels: np.ndarray,
                  gallery_labels: np.ndarray) -> float:
    
    global current_retrievals, gpt
    
    """
    Calculates the mean average precision.
    Args:
        ranked_retrieval_results: A 2D array of ranked retrieval results (shape: n_queries x 1000), because we use
                                top1000 retrieval results.
        query_labels: A 1D array of query class labels (shape: n_queries).
        gallery_labels: A 1D array of gallery class labels (shape: n_gallery_items).
    Returns:
        The mean average precision.
    """
    assert ranked_retrieval_results.ndim == 2
    assert ranked_retrieval_results.shape[1] == 1000

    class_average_precisions = []
    current_retrievals = []

    class_ids, class_counts = np.unique(gallery_labels, return_counts=True)
    class_id2quantity_dict = dict(zip(class_ids, class_counts))
    for gallery_indices, query_class_id in tqdm(
                            zip(ranked_retrieval_results, query_labels),
                            total=len(query_labels)):
        # Checking that no image is repeated in the retrival results
        assert len(np.unique(gallery_indices)) == len(gallery_indices), \
                    ValueError('Repeated images in retrieval results')

        current_retrieval = gallery_labels[gallery_indices] == query_class_id
        gpt = class_id2quantity_dict[query_class_id]
        
        current_retrievals.append(current_retrieval)

        class_average_precisions.append(
            compute_average_precision(current_retrieval, gpt)
        )

    mean_average_precision = np.mean(class_average_precisions)
    return mean_average_precision

In [5]:
import numpy as np
import time

def calculate_l2_distances(query, gallery):
    return np.linalg.norm(gallery - query, axis=1)

def get_k_nearest_neighbors(distances, k):
    indices = np.argsort(distances)[:k]
    return indices

def get_similiarity_l2(embeddings_gallery, embeddings_query, k):
    print('Processing indices...')

    s = time.time()

    scores = []
    indices = []

    for query in embeddings_query:
        distances = calculate_l2_distances(query, embeddings_gallery)
        nearest_indices = get_k_nearest_neighbors(distances, k)
        scores.append(distances[nearest_indices])
        indices.append(nearest_indices)

    e = time.time()

    print(f'Finished processing indices, took {e - s}s')
    return np.array(scores), np.array(indices)

In [6]:
def convert_indices_to_labels(indices, labels):
    indices_copy = copy.deepcopy(indices)
    for row in indices_copy:
        for j in range(len(row)):
            row[j] = labels[row[j]]
    return indices_copy

In [7]:
device = 'cuda' if th.cuda.is_available() else 'cpu';

# Reading Dataset

In [8]:
def read_image(image_file):
    img = cv2.imread(
        image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
    )
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if img is None:
        raise ValueError('Failed to read {}'.format(image_file))
    return img

class SubmissionDataset(Dataset):
    def __init__(self, root, annotation_file, transforms, with_bbox=False):
        self.root = root
        self.imlist = pd.read_csv(annotation_file)
        self.transforms = transforms
        self.with_bbox = with_bbox

    def __getitem__(self, index):
        cv2.setNumThreads(6)

        full_imname = os.path.join(self.root, self.imlist['img_path'][index])
        img = read_image(full_imname)

        if self.with_bbox:
            x, y, w, h = self.imlist.loc[index, 'bbox_x':'bbox_h']
            img = img[y:y+h, x:x+w, :]

        img = Image.fromarray(img)
        img = self.transforms(img)
        product_id = self.imlist['product_id'][index]
        return img, product_id

    def __len__(self):
        return len(self.imlist)

In [9]:
def get_transform():  
    transform = T.Compose([
            T.Resize(
                size=(224, 224), 
                interpolation=T.InterpolationMode.BICUBIC,
                antialias=True),
            T.ToTensor(), 
            T.Normalize(
                mean=(0.48145466, 0.4578275, 0.40821073), 
                std=(0.26862954, 0.26130258, 0.27577711)
            )
        ])
    return transform

@th.no_grad()
def get_feature_vector(model_vit_h_14, model_vit_l_14, model_convnext_320_focal, dataloader, weight, use_cuda=True):
    features = []
    product_id = []
    
    for imgs, p_id in tqdm(dataloader):
        if use_cuda:
            imgs = imgs.cuda()
            
#         print(th.squeeze(model_vit_h_14(imgs.half())).shape, th.squeeze(model_vit_l_14(imgs.half())).shape)
        feature_model_vit_h_14, feature_model_vit_l_14, feature_model_convnext_320_focal = th.squeeze(model_vit_h_14(imgs.half())).detach().cpu().numpy().astype(np.float32), th.squeeze(model_vit_l_14(imgs.half())).detach().cpu().numpy().astype(np.float32), th.squeeze(model_convnext_320_focal(imgs.half())).detach().cpu().numpy().astype(np.float32)
        
        feature_model_vit_h_14 = th.from_numpy(feature_model_vit_h_14)
        feature_model_vit_l_14 = th.from_numpy(weight[1]*feature_model_vit_l_14)
        feature_model_convnext_320_focal = th.from_numpy(weight[0]*feature_model_convnext_320_focal)
        
        feature = th.cat((feature_model_vit_h_14, feature_model_vit_l_14, feature_model_convnext_320_focal), dim=1)
#         print(feature.shape)
        features.append(feature)
        product_id.append(th.squeeze(p_id).detach().cpu().numpy())

    return np.concatenate(features, axis=0), np.concatenate(product_id)

# Getting the CLIP model's embedding

In [10]:
weights_path_large = '/kaggle/input/vit-l-14-final-weights/my_experiments/ViT-L-14-laion2b_s32b_b82k-image_net-v2-p10k-h&m-Arcface(k=3)-All-Epoch(4)-Reduce_LR_0.1/model_epoch_2_mAP3_0.52.pt'

vit_backbone_vit_l_14 = open_clip.create_model_and_transforms('ViT-L-14', None)[0].visual
vit_backbone_vit_l_14.load_state_dict(th.load(weights_path_large)['model_state_dict'])
vit_backbone_vit_l_14.half()   # Apply half precision to the backbone model
vit_backbone_vit_l_14.eval()   # Dropping unecessary layers
model_vit_l_14 = vit_backbone_vit_l_14
model_vit_l_14.cuda();

In [11]:
weights_path_huge = '/kaggle/input/vit-h-14-final-weights/model_weights.pt'

vit_backbone_vit_h_14 = open_clip.create_model_and_transforms('ViT-H-14', None)[0].visual
vit_backbone_vit_h_14.load_state_dict(th.load(weights_path_huge))
vit_backbone_vit_h_14.half()   # Apply half precision to the backbone model
vit_backbone_vit_h_14.eval()   # Dropping unecessary layers
model_vit_h_14 = vit_backbone_vit_h_14
model_vit_h_14.cuda();

In [12]:
weights_path_convnext_320_focal = '/kaggle/input/convnext-large-d-final-weights/my_experiments/convnext_large_d_320-laion2b_s29b_b131k_ft-image_net-v2-p10k-h&m-Arcface(k=3)-All-Epoch(2)-Reduce_LR_0.1/model_epoch_1_mAP3_0.55.pt'

vit_backbone_convnext_320_focal = open_clip.create_model_and_transforms('convnext_large_d_320', None)[0].visual
vit_backbone_convnext_320_focal.load_state_dict(th.load(weights_path_convnext_320_focal)['model_state_dict'])
vit_backbone_convnext_320_focal.half()   # Apply half precision to the backbone model
vit_backbone_convnext_320_focal.eval()   # Dropping unecessary layers
model_convnext_320_focal = vit_backbone_convnext_320_focal
model_convnext_320_focal.cuda();

In [13]:
# import torch
# # Specify the path for the binary file
# binary_file_path = "/kaggle/working/model.bin"

# # Save the model's state dictionary to the binary file
# torch.save(model, binary_file_path)

In [14]:
# import zipfile
# import os

# def zip_folder(folder_path, zip_filename):
#     with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
#         for root, _, files in os.walk(folder_path):
#             for file in files:
#                 file_path = os.path.join(root, file)
#                 arcname = os.path.relpath(file_path, folder_path)
#                 zipf.write(file_path, arcname)

# # Replace 'your_folder_path' with the actual path to the folder you want to zip
# folder_to_zip = '/kaggle/working'
# output_zip_path = 'VIT-H-14.zip'

# zip_folder(folder_to_zip, output_zip_path)

# from IPython.display import FileLink

# # Display a download link for the zip file
# FileLink(output_zip_path)

In [15]:
# !rm model1.pt

In [16]:
transform = get_transform()

img_dir = "/kaggle/input/vprtestdata/public_dataset/"

dataset_train = SubmissionDataset(img_dir, os.path.join(img_dir, "gallery.csv"), transform)
dataloader_train = DataLoader(dataset_train, batch_size=512, num_workers=4)
dataset_test = SubmissionDataset(img_dir, os.path.join(img_dir, "queries.csv"), transform, with_bbox=True)
dataloader_test = DataLoader(dataset_test, batch_size=512, num_workers=4)

In [17]:
def predict(model_vit_h_14, model_vit_l_14, model_convnext_320_focal, weight):
    global feature_vectors_gallery, labels_gallery
    
    feature_vectors_gallery, labels_gallery = get_feature_vector(model_vit_h_14, model_vit_l_14, model_convnext_320_focal, dataloader_train, weight, 1)
    feature_vectors_query, labels_query = get_feature_vector(model_vit_h_14, model_vit_l_14, model_convnext_320_focal, dataloader_test, weight, 1)
    
    scores, indices = get_similiarity_l2(feature_vectors_gallery, feature_vectors_query, 1000)

    indices = indices.tolist()
    labels_gallery = labels_gallery.tolist()
    labels_query = labels_query.tolist()

    return indices

In [18]:
seller_gt = pd.read_csv('/kaggle/input/vprtestdata/public_dataset/gallery.csv')
gallery_labels = seller_gt['product_id'].values
user_gt = pd.read_csv('/kaggle/input/vprtestdata/public_dataset/queries.csv')
query_labels = user_gt['product_id'].values

In [19]:
# weights = [[0,0],[0.2,0.2], [0.3,0.3], [0.5,0.5], [0.6,0.6], [0.7,0.7], [0.75, 0.75], [0.5,0.7], [0.6,0.7], [0.8,0.8], [0.9,0.9], [1,1]]
weights = [[0.7,0],[0.7,0.2], [0.7,0.3], [0.7,0.5], [0.7,0.6], [0.7,0.7], [0.75, 0.75], [0.7,0.7], [0.9,0.7], [0.7,0.8], [0.7,0.9], [1,1], [0.9,0],[1,0.2], [0.9,0.3], [1,0.5], [1,0.6]]

for weight in weights:
    
    preds = predict(model_vit_h_14, model_vit_h_14, model_convnext_320_focal, weight)

#     preds_df = pd.DataFrame(preds)
#     preds_df.to_csv('preds.csv', index=False)

    # Evalaute metrics
    print(f"Evaluation Results  of {weight}")
    results = {"mAP": calculate_map(np.array(preds), query_labels, gallery_labels)}
    print(results)

100%|██████████| 3/3 [01:15<00:00, 25.27s/it]
100%|██████████| 4/4 [02:04<00:00, 31.25s/it]


Processing indices...
Finished processing indices, took 9.38417387008667s
Evaluation Results  of [0.7, 0]


100%|██████████| 1935/1935 [00:00<00:00, 3777.72it/s]


{'mAP': 0.6074891405756979}


100%|██████████| 3/3 [01:10<00:00, 23.47s/it]
100%|██████████| 4/4 [02:08<00:00, 32.04s/it]


Processing indices...
Finished processing indices, took 9.484185457229614s
Evaluation Results  of [0.7, 0.2]


100%|██████████| 1935/1935 [00:00<00:00, 3985.50it/s]


{'mAP': 0.6067303264241279}


100%|██████████| 3/3 [01:12<00:00, 24.13s/it]
100%|██████████| 4/4 [02:08<00:00, 32.14s/it]


Processing indices...
Finished processing indices, took 9.122952699661255s
Evaluation Results  of [0.7, 0.3]


100%|██████████| 1935/1935 [00:00<00:00, 3983.83it/s]


{'mAP': 0.6066751177094398}


100%|██████████| 3/3 [01:12<00:00, 24.11s/it]
100%|██████████| 4/4 [02:08<00:00, 32.22s/it]


Processing indices...
Finished processing indices, took 8.885257482528687s
Evaluation Results  of [0.7, 0.5]


100%|██████████| 1935/1935 [00:00<00:00, 4048.57it/s]


{'mAP': 0.60708296114074}


100%|██████████| 3/3 [01:12<00:00, 24.24s/it]
100%|██████████| 4/4 [02:09<00:00, 32.30s/it]


Processing indices...
Finished processing indices, took 9.604161500930786s
Evaluation Results  of [0.7, 0.6]


100%|██████████| 1935/1935 [00:00<00:00, 3987.22it/s]


{'mAP': 0.6065924389008512}


100%|██████████| 3/3 [01:12<00:00, 24.01s/it]
100%|██████████| 4/4 [02:09<00:00, 32.30s/it]


Processing indices...
Finished processing indices, took 9.130436420440674s
Evaluation Results  of [0.7, 0.7]


100%|██████████| 1935/1935 [00:00<00:00, 3945.28it/s]


{'mAP': 0.6061729906908101}


100%|██████████| 3/3 [01:12<00:00, 24.17s/it]
100%|██████████| 4/4 [02:08<00:00, 32.21s/it]


Processing indices...
Finished processing indices, took 9.325860500335693s
Evaluation Results  of [0.75, 0.75]


100%|██████████| 1935/1935 [00:00<00:00, 3966.04it/s]


{'mAP': 0.6065924409556892}


100%|██████████| 3/3 [01:12<00:00, 24.13s/it]
100%|██████████| 4/4 [02:09<00:00, 32.30s/it]


Processing indices...
Finished processing indices, took 9.279486179351807s
Evaluation Results  of [0.7, 0.7]


100%|██████████| 1935/1935 [00:00<00:00, 3919.81it/s]


{'mAP': 0.6061729906908101}


100%|██████████| 3/3 [01:12<00:00, 24.27s/it]
100%|██████████| 4/4 [02:09<00:00, 32.34s/it]


Processing indices...
Finished processing indices, took 8.73986530303955s
Evaluation Results  of [0.9, 0.7]


100%|██████████| 1935/1935 [00:00<00:00, 4022.18it/s]


{'mAP': 0.6067767023008861}


100%|██████████| 3/3 [01:15<00:00, 25.00s/it]
100%|██████████| 4/4 [02:15<00:00, 33.77s/it]


Processing indices...
Finished processing indices, took 8.774376153945923s
Evaluation Results  of [0.7, 0.8]


100%|██████████| 1935/1935 [00:00<00:00, 4046.86it/s]


{'mAP': 0.6065345286753713}


100%|██████████| 3/3 [01:16<00:00, 25.58s/it]
100%|██████████| 4/4 [02:14<00:00, 33.63s/it]


Processing indices...
Finished processing indices, took 9.111866474151611s
Evaluation Results  of [0.7, 0.9]


100%|██████████| 1935/1935 [00:00<00:00, 3988.06it/s]


{'mAP': 0.6066156422862413}


100%|██████████| 3/3 [01:16<00:00, 25.51s/it]
100%|██████████| 4/4 [02:14<00:00, 33.54s/it]


Processing indices...
Finished processing indices, took 8.69758677482605s
Evaluation Results  of [1, 1]


100%|██████████| 1935/1935 [00:00<00:00, 4093.30it/s]


{'mAP': 0.6074216418848573}


100%|██████████| 3/3 [01:16<00:00, 25.44s/it]
100%|██████████| 4/4 [02:14<00:00, 33.54s/it]


Processing indices...
Finished processing indices, took 9.302047729492188s
Evaluation Results  of [0.9, 0]


100%|██████████| 1935/1935 [00:00<00:00, 3876.18it/s]


{'mAP': 0.6073537796545744}


100%|██████████| 3/3 [01:13<00:00, 24.51s/it]
100%|██████████| 4/4 [02:09<00:00, 32.40s/it]


Processing indices...
Finished processing indices, took 9.102653741836548s
Evaluation Results  of [1, 0.2]


100%|██████████| 1935/1935 [00:00<00:00, 4015.29it/s]


{'mAP': 0.6059358376719611}


100%|██████████| 3/3 [01:12<00:00, 24.22s/it]
100%|██████████| 4/4 [02:08<00:00, 32.24s/it]


Processing indices...
Finished processing indices, took 9.420027256011963s
Evaluation Results  of [0.9, 0.3]


100%|██████████| 1935/1935 [00:00<00:00, 3744.58it/s]


{'mAP': 0.6075447900051979}


100%|██████████| 3/3 [01:12<00:00, 24.11s/it]
100%|██████████| 4/4 [02:08<00:00, 32.19s/it]


Processing indices...
Finished processing indices, took 9.274156093597412s
Evaluation Results  of [1, 0.5]


100%|██████████| 1935/1935 [00:00<00:00, 3924.68it/s]


{'mAP': 0.6068466442156133}


100%|██████████| 3/3 [01:12<00:00, 24.15s/it]
100%|██████████| 4/4 [02:08<00:00, 32.09s/it]


Processing indices...
Finished processing indices, took 9.145113468170166s
Evaluation Results  of [1, 0.6]


100%|██████████| 1935/1935 [00:00<00:00, 3937.28it/s]

{'mAP': 0.6073293783295821}





In [20]:
preds_df.head()

NameError: name 'preds_df' is not defined

In [None]:
# Evalaute metrics
print("Evaluation Results")
results = {"mAP": calculate_map(np.array(preds), query_labels, gallery_labels)}
print(results)

In [None]:
# # Evaluation Results - 0.75
# # 100%|██████████| 1935/1935 [00:00<00:00, 4244.84it/s]
# # {'mAP': 0.6066268859082989}

# 100%|██████████| 3/3 [00:56<00:00, 18.86s/it]
# 100%|██████████| 4/4 [01:20<00:00, 20.24s/it]
# Processing indices...
# Finished processing indices, took 5.250295877456665s
# Evaluation Results  of 0
# 100%|██████████| 1935/1935 [00:00<00:00, 4115.37it/s]
# {'mAP': 0.6076191768389487}
# 100%|██████████| 3/3 [00:47<00:00, 15.72s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.87s/it]
# Processing indices...
# Finished processing indices, took 5.076934337615967s
# Evaluation Results  of 1
# 100%|██████████| 1935/1935 [00:00<00:00, 4180.41it/s]
# {'mAP': 0.6065552186488776}
# 100%|██████████| 3/3 [00:47<00:00, 15.89s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.97s/it]
# Processing indices...
# Finished processing indices, took 5.240142345428467s
# Evaluation Results  of 2
# 100%|██████████| 1935/1935 [00:00<00:00, 4246.23it/s]
# {'mAP': 0.607245341224875}
# 100%|██████████| 3/3 [00:47<00:00, 15.81s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.81s/it]
# Processing indices...
# Finished processing indices, took 5.046827077865601s
# Evaluation Results  of 3
# 100%|██████████| 1935/1935 [00:00<00:00, 4158.59it/s]
# {'mAP': 0.6064373525552245}
# 100%|██████████| 3/3 [00:47<00:00, 15.80s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.89s/it]
# Processing indices...
# Finished processing indices, took 5.576266527175903s
# Evaluation Results  of 4
# 100%|██████████| 1935/1935 [00:00<00:00, 4115.63it/s]
# {'mAP': 0.6058129854966234}
# 100%|██████████| 3/3 [00:48<00:00, 16.23s/it]
# 100%|██████████| 4/4 [01:24<00:00, 21.09s/it]
# Processing indices...
# Finished processing indices, took 5.483680963516235s
# Evaluation Results  of 5
# 100%|██████████| 1935/1935 [00:00<00:00, 3925.31it/s]
# {'mAP': 0.6054463695893347}
# 100%|██████████| 3/3 [00:48<00:00, 16.06s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.90s/it]
# Processing indices...
# Finished processing indices, took 5.089120626449585s
# Evaluation Results  of 6
# 100%|██████████| 1935/1935 [00:00<00:00, 4016.65it/s]
# {'mAP': 0.6065924409556892}
# 100%|██████████| 3/3 [00:47<00:00, 15.97s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.77s/it]
# Processing indices...
# Finished processing indices, took 5.112382411956787s
# Evaluation Results  of 7
# 100%|██████████| 1935/1935 [00:00<00:00, 4208.87it/s]
# {'mAP': 0.6074891405756979}
# 100%|██████████| 3/3 [00:47<00:00, 15.74s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.93s/it]
# Processing indices...
# Finished processing indices, took 5.110339879989624s
# Evaluation Results  of 8
# 100%|██████████| 1935/1935 [00:00<00:00, 4135.83it/s]
# {'mAP': 0.606011303904902}
# 100%|██████████| 3/3 [00:48<00:00, 16.06s/it]
# 100%|██████████| 4/4 [01:23<00:00, 20.90s/it]
# Processing indices...
# Finished processing indices, took 4.96556830406189s
# Evaluation Results  of 9
# 100%|██████████| 1935/1935 [00:00<00:00, 4257.99it/s]
# {'mAP': 0.6073537796545744}



# 100%|██████████| 3/3 [01:04<00:00, 21.59s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.41s/it]
# Processing indices...
# Finished processing indices, took 8.623145341873169s
# Evaluation Results  of [0, 0]
# 100%|██████████| 1935/1935 [00:00<00:00, 4128.46it/s]
# {'mAP': 0.6076191768389487}
# 100%|██████████| 3/3 [01:07<00:00, 22.44s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.48s/it]
# Processing indices...
# Finished processing indices, took 8.923219680786133s
# Evaluation Results  of [0.2, 0.2]
# 100%|██████████| 1935/1935 [00:00<00:00, 4122.74it/s]
# {'mAP': 0.6072454513689215}
# 100%|██████████| 3/3 [01:07<00:00, 22.51s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.45s/it]
# Processing indices...
# Finished processing indices, took 8.427476406097412s
# Evaluation Results  of [0.3, 0.3]
# 100%|██████████| 1935/1935 [00:00<00:00, 4019.73it/s]
# {'mAP': 0.606459867453295}
# 100%|██████████| 3/3 [01:07<00:00, 22.36s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.30s/it]
# Processing indices...
# Finished processing indices, took 8.832993984222412s
# Evaluation Results  of [0.5, 0.5]
# 100%|██████████| 1935/1935 [00:00<00:00, 3979.45it/s]
# {'mAP': 0.606632983911408}
# 100%|██████████| 3/3 [01:07<00:00, 22.34s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.31s/it]
# Processing indices...
# Finished processing indices, took 8.593319654464722s
# Evaluation Results  of [0.6, 0.6]
# 100%|██████████| 1935/1935 [00:00<00:00, 4169.21it/s]
# {'mAP': 0.6064449551494654}
# 100%|██████████| 3/3 [01:07<00:00, 22.35s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.39s/it]
# Processing indices...
# Finished processing indices, took 8.521270513534546s
# Evaluation Results  of [0.7, 0.7]
# 100%|██████████| 1935/1935 [00:00<00:00, 4123.58it/s]
# {'mAP': 0.6061729906908101}
# 100%|██████████| 3/3 [01:06<00:00, 22.32s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.41s/it]
# Processing indices...
# Finished processing indices, took 8.39121150970459s
# Evaluation Results  of [0.75, 0.75]
# 100%|██████████| 1935/1935 [00:00<00:00, 4234.19it/s]
# {'mAP': 0.6065924409556892}
# 100%|██████████| 3/3 [01:07<00:00, 22.35s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.29s/it]
# Processing indices...
# Finished processing indices, took 8.813293218612671s
# Evaluation Results  of [0.5, 0.7]
# 100%|██████████| 1935/1935 [00:00<00:00, 4196.53it/s]
# {'mAP': 0.60708296114074}
# 100%|██████████| 3/3 [01:06<00:00, 22.31s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.29s/it]
# Processing indices...
# Finished processing indices, took 8.486764430999756s
# Evaluation Results  of [0.6, 0.7]
# 100%|██████████| 1935/1935 [00:00<00:00, 4005.34it/s]
# {'mAP': 0.6065924389008512}
# 100%|██████████| 3/3 [01:06<00:00, 22.33s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.28s/it]
# Processing indices...
# Finished processing indices, took 8.34221339225769s
# Evaluation Results  of [0.8, 0.8]
# 100%|██████████| 1935/1935 [00:00<00:00, 4144.97it/s]
# {'mAP': 0.6070249322293987}
# 100%|██████████| 3/3 [01:07<00:00, 22.47s/it]
# 100%|██████████| 4/4 [01:57<00:00, 29.26s/it]
# Processing indices...
# Finished processing indices, took 8.509763479232788s
# Evaluation Results  of [0.9, 0.9]
# 100%|██████████| 1935/1935 [00:00<00:00, 3989.92it/s]
# {'mAP': 0.6067933927435037}
# 100%|██████████| 3/3 [01:07<00:00, 22.37s/it]
# 100%|██████████| 4/4 [01:56<00:00, 29.22s/it]
# Processing indices...
# Finished processing indices, took 8.44943618774414s
# Evaluation Results  of [1, 1]
# 100%|██████████| 1935/1935 [00:00<00:00, 4211.53it/s]
# {'mAP': 0.6074216418848573}

# sample image similarity search

In [None]:
len(current_retrievals)

In [None]:
def transform_img(image):
    img = image
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    if isinstance(img, np.ndarray):
        img =  Image.fromarray(img)
        
    img = transform(img)

    return img

In [None]:
@th.no_grad()
def get_feature_vector_img(model_vit_h_14, model_vit_l_14, imgs, use_cuda=True):
    features = []
    if use_cuda:
        imgs = imgs.cuda()
    x = (model(imgs.half())).detach().cpu().numpy().astype(np.float32)  # .half()
    print(model(imgs.half()).shape)
    features.append(x)

    return np.concatenate(features, axis=0)

In [None]:
img_path = '/kaggle/input/vprtestdata/public_dataset/queries/accelerated-glorious-fennec-of-reward.jpg'

def get_similar_prods(img_path):

    image = read_image(img_path)
    image = transform_img(image)
    image = image.unsqueeze(dim=0)
#     feature_vectors_gallery, labels_gallery = get_feature_vector(model, dataloader_train, 1)
    feature_vectors_query = get_feature_vector_img(model_vit_h_14, model_vit_l_14, image, 1)
    scores, indices = get_similiarity_l2(feature_vectors_gallery, feature_vectors_query, 1000)
    preds = convert_indices_to_labels(indices, labels_gallery)
    indices = indices.tolist()

    return [indices , preds]
    
[similar_images, labels] = get_similar_prods(img_path)

In [None]:
csv_path_q = '/kaggle/input/vprtestdata/public_dataset/queries.csv'  
data_q = pd.read_csv(csv_path_q)

x = data_q[data_q['img_path'] == 'queries/accelerated-glorious-fennec-of-reward.jpg']

x

In [None]:
preds_df1 = pd.DataFrame(similar_images)

preds_df1

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

img = Image.open(img_path)

img = img.resize((224, 224))

plt.figure(figsize=(4, 4))
plt.imshow(img)
plt.axis('off')
plt.savefig('query.png')  
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

csv_path = '/kaggle/input/vprtestdata/public_dataset/gallery.csv'  
data = pd.read_csv(csv_path)

prod_ids = similar_images[0][:100]  

num_images = len(prod_ids)
num_columns = 10
num_rows = (num_images + num_columns - 1) // num_columns

fig, axes = plt.subplots(num_rows, num_columns, figsize=(15, 15))

for i, prod_id in enumerate(prod_ids):
    row = data[data['seller_img_id'] == prod_id]
    
    if not row.empty:
        image_path = '/kaggle/input/vprtestdata/public_dataset/' + row.iloc[0]['img_path']
        img = Image.open(image_path)
        
        img = img.resize((224, 224))
        
        row_idx = i // num_columns
        col_idx = i % num_columns
        
        ax = axes[row_idx, col_idx]
        ax.imshow(img)
        ax.set_title(f"Image ID: {prod_id}")
        ax.axis('off')

for i in range(num_images, num_rows * num_columns):
    fig.delaxes(axes.flatten()[i])

plt.tight_layout()

plt.savefig('inference.png')  

plt.show()


In [None]:
# # Download custom image
# import requests

# # Setup custom image path
# custom_image_path = data_path / "04-pizza-dad.jpeg"

# # Download the image if it doesn't already exist
# if not custom_image_path.is_file():
#     with open(custom_image_path, "wb") as f:
#         # When downloading from GitHub, need to use the "raw" file link
#         request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/images/04-pizza-dad.jpeg")
#         print(f"Downloading {custom_image_path}...")
#         f.write(request.content)
# else:
#     print(f"{custom_image_path} already exists, skipping download.")

In [None]:
import json
import requests
import torch
from torch import nn
import numpy as np
from zipfile import ZipFile
from sklearn.decomposition import PCA
from torchvision.transforms import InterpolationMode
from torchvision.transforms.functional import resize, normalize
from open_clip import create_model_and_transforms, tokenize


class CLIP64(nn.Module):
    def __init__(self, model_name, pretrained, dimensionality_reduction='PCA'):
        """
        Load a CLIP model and append a PCA layer or a random choice among the head neurons to obtain a 64D vector 
        The PCA is obtained on embeddings of plausible labels generated by GPT-3
        """
        super().__init__()
        self.dimensionality_reduction = dimensionality_reduction

        # Load model and transforms
        model, transforms, _ = create_model_and_transforms(model_name, pretrained, jit=False, device='cuda')

        # Transforms parameters
        self.image_size = transforms.transforms[0].size[0]
        self.mean = transforms.transforms[-1].mean
        self.std = transforms.transforms[-1].std

        # PCA using GPT-3 captions
        if self.dimensionality_reduction == 'PCA':
            W_text = []
            url = 'https://raw.githubusercontent.com/IvanAer/G-Universal-CLIP/main/media/GPT3_words.json'
            gpt3_words = requests.get(url).json()
            for word in gpt3_words:
                w = tokenize(word)
                w = model.encode_text(w.to('cuda'))[0].detach().cpu()
                w /= w.norm()
                w = w.numpy()
                W_text.append(w)
            pca = PCA(64, whiten=True)
            pca.fit(W_text)
            self.pca_components = torch.Tensor(pca.components_).half().to('cuda')
            self.pca_mean = torch.Tensor(pca.mean_).half().to('cuda')
        elif self.dimensionality_reduction == 'random':
            W = model.visual.proj.detach().cpu().numpy()
            mask64 = np.random.permutation(W.shape[1])[:64]
            model.visual.proj = nn.Parameter(torch.tensor(W[:, mask64]).half().to('cuda'))
            # Needed for torchscript
            self.pca_components = torch.Tensor(np.zeros((W.shape[1], 1))).half().to('cuda')
            self.pca_mean = torch.Tensor(np.zeros(W.shape[1])).half().to('cuda')
            
        else:
            raise ValueError

        # Set encoder
        self.encoder = model.visual.half().eval()

    def forward(self, image):
        """
        The input image is padded and resized to the size required by the CLIP visual encoder
        The PCA layer is then applied to the output of the encode
        """

        # Pad
        h, w = image.size()[2:]
        p_left, p_top = [(max(h, w) - s) // 2 for s in [h, w]]
        p_right, p_bottom = [max(h, w) - (s + pad) for s, pad in zip([h, w], [p_left, p_top])]
        value = 255. * sum(self.mean) / 3
        image = nn.functional.pad(image, [p_top, p_bottom, p_left, p_right], 'constant', value)

        # Resize
        image = resize(image, size=(self.image_size, self.image_size), interpolation=InterpolationMode.BICUBIC)

        # Normalize
        image = image.half()
        image /= 255.
        image = normalize(image, mean=self.mean, std=self.std)

        # Run feature extractor
        features = self.encoder(image.to('cuda'))[0]

        # Apply dimensionality reduction
        features /= features.norm()
        if self.dimensionality_reduction == 'PCA':
            features = self.pca_components @ (features - self.pca_mean)
        features = features.unsqueeze(0)

        return features


# Load model
#model = CLIP64('ViT-H-14', 'laion2b_s32b_b79k', 'random')
model = CLIP64('ViT-H-14', 'laion2b_s32b_b79k', 'PCA')

# Save it
model = torch.jit.script(model)
model.save('saved_model.pt')

with ZipFile('submission.zip', 'w') as z:
    z.write('saved_model.pt', arcname='saved_model.pt')
    
    
# Sanity check using challenge code

from PIL import Image
from torchvision import transforms

# Model loading
model = torch.jit.load('saved_model.pt')
model.eval()
embedding_fn = model

# Load image and extract its embedding.
url = 'https://images-eu.ssl-images-amazon.com/images/I/712Qd71eiYL.__AC_SY300_SX300_QL70_ML2_.jpg'
input_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
convert_to_tensor = transforms.Compose([transforms.PILToTensor()])
input_tensor = convert_to_tensor(input_image)
input_batch = input_tensor.unsqueeze(0)
with torch.no_grad():
    embedding = torch.flatten(embedding_fn(input_batch)[0]).cpu().data.numpy()
    
print(input_image.size)
print(embedding.shape)
input_image