In [1]:
import sys
# Put the directory three levels above the working directory at the front of
# the import search path (presumably so the project packages imported below
# resolve -- confirm against the repository layout).
path = "../../.."
if sys.path.count(path) == 0:
    # Only added once, so re-running the cell does not pile up duplicates.
    sys.path.insert(0, path)

In [2]:
from data_retrieval import lipade_groundtruth
from data_retrieval.tools.data_loader import getDataLoader
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
from tqdm import tqdm
import numpy as np
import torch
import clip

# Run on the first CUDA device when one is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Corpus identifier; it is also the sub-directory name of both result folders.
corpus = "lipade_groundtruth"
rawPath = "../results/raw/" + corpus + "/"
distancePath = "../results/distance/" + corpus + "/"

In [4]:
# Fetch the ground-truth dataset; the second returned value is not used here.
x, _, y = lipade_groundtruth.getDataset(mode="similar")

# Replace every entry of x (whatever Image.open accepts, presumably a file
# path) by a fully decoded PIL image.
# Image.open is lazy: it keeps the underlying file open until the pixel data
# is first read. Copying inside the `with` block forces the decode and lets
# the file handle be closed right away, so the number of simultaneously open
# files no longer grows with the corpus size.
for i, imagePath in enumerate(x):
    with Image.open(imagePath) as img:
        x[i] = img.copy()

# 1- No Finetuning

In [3]:
# CLIP backbone used for this section, plus the short tag that is embedded in
# the names of the result files written further down.
modelName = 'ViT-L/14'
modelLowerName = "vitl14"

# Show which backbones the installed clip package offers.
print(clip.available_models())

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']


In [None]:
# Load the selected CLIP backbone together with its image preprocessing pipeline.
model, preprocess = clip.load(modelName, device)

# Encode the images one at a time; gradients are not needed for inference.
representations = []
with torch.no_grad():
    for pilImage in tqdm(x, desc=modelName):
        # unsqueeze(0) adds the leading batch dimension before the tensor is
        # moved to the compute device.
        batch = preprocess(pilImage).unsqueeze(0).to(device)
        embedding = model.encode_image(batch)
        # Keep the result on the CPU so it can be converted to NumPy later.
        representations.append(embedding.cpu())

RN50: 100%|██████████| 279/279 [00:06<00:00, 46.04it/s]
RN101: 100%|██████████| 279/279 [00:04<00:00, 63.00it/s]
RN50x4: 100%|██████████| 279/279 [00:04<00:00, 61.71it/s]
RN50x16: 100%|██████████| 279/279 [00:06<00:00, 42.02it/s]
RN50x64: 100%|██████████| 279/279 [00:13<00:00, 21.12it/s]
ViT-B/32: 100%|██████████| 279/279 [00:03<00:00, 77.78it/s]
ViT-B/16: 100%|██████████| 279/279 [00:03<00:00, 77.37it/s]
ViT-L/14: 100%|██████████| 279/279 [00:05<00:00, 51.26it/s]
ViT-L/14@336px: 100%|██████████| 279/279 [00:07<00:00, 35.62it/s]


In [None]:
# Stack the per-image embeddings into one (n_images, n_features) matrix.
representations = np.array(representations).reshape((len(x), -1))
sim = cosine_similarity(representations, representations)

# Map cosine similarity from [-1, 1] onto a distance in [0, 1].
distance = 1 - (sim + 1) / 2

# Force self-distances to exactly zero (they can differ from 0 by rounding).
# np.fill_diagonal only touches the diagonal; subtracting np.diag(distance)
# instead would broadcast the 1-D diagonal vector over every row, shifting all
# off-diagonal entries of column j by distance[j, j] and breaking symmetry.
np.fill_diagonal(distance, 0)

In [None]:
# Both artefacts share one file name and differ only in their target folder:
# the pairwise distance matrix and the raw embedding matrix.
fileName = "clip_transfer_" + modelLowerName + ".npy"
np.save(distancePath + fileName, distance)
np.save(rawPath + fileName, representations)

# 2- Finetuning (using generated captions)

In [13]:
# CLIP backbone for the fine-tuning comparison, and the short tag used in the
# result file names.
modelName = 'ViT-B/32'
modelLowerName = "vitb32"
# Identifier of the fine-tuning run; it selects which weight file gets loaded.
name_test = "finetune_3_layers"

# Show which backbones the installed clip package offers.
print(clip.available_models())

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']


In [7]:
# Load the pretrained backbone selected above and its preprocessing pipeline.
model, preprocess = clip.load(modelName, device)


def _embed(pilImage):
    """Return the CLIP image embedding of one PIL image as a CPU tensor."""
    # unsqueeze(0) adds the batch dimension expected by the encoder.
    batch = preprocess(pilImage).unsqueeze(0).to(device)
    return model.encode_image(batch).cpu()


# Inference only, so gradient tracking is switched off for the whole pass.
with torch.no_grad():
    representations = [_embed(pilImage) for pilImage in tqdm(x, desc=modelName)]

ViT-B/32: 100%|██████████| 279/279 [00:06<00:00, 46.46it/s]


In [8]:
# Stack the per-image embeddings into one (n_images, n_features) matrix.
representations = np.array(representations).reshape((len(x), -1))
sim = cosine_similarity(representations, representations)

# Map cosine similarity from [-1, 1] onto a distance in [0, 1].
distance = 1 - (sim + 1) / 2

# Force self-distances to exactly zero (they can differ from 0 by rounding).
# np.fill_diagonal only touches the diagonal; subtracting np.diag(distance)
# instead would broadcast the 1-D diagonal vector over every row, shifting all
# off-diagonal entries of column j by distance[j, j] and breaking symmetry.
np.fill_diagonal(distance, 0)

In [9]:
# Store the baseline (not fine-tuned) results: the distance matrix and the
# raw embeddings go to their respective folders under the same file name.
fileName = "clip_transfer_" + modelLowerName + ".npy"
np.save(distancePath + fileName, distance)
np.save(rawPath + fileName, representations)

In [15]:
# Overwrite the weights of the already loaded model with the checkpoint of the
# fine-tuning run named by name_test. weights_only=True restricts unpickling
# to tensor data, and map_location places the tensors on the active device.
weightsPath = f'../results/weights/clip/model_{name_test}.pth'
stateDict = torch.load(weightsPath, weights_only=True, map_location=device)
# Kept as the last expression so the key-matching report is shown as output.
model.load_state_dict(stateDict)

<All keys matched successfully>

In [16]:
# Re-encode every image with the model whose weights were just replaced.
representations = []
with torch.no_grad():
    for image in tqdm(x, desc=modelName):
        # unsqueeze(0) adds the batch dimension expected by the encoder.
        batch = preprocess(image).unsqueeze(0).to(device)
        representations.append(model.encode_image(batch).cpu())

# Stack the per-image embeddings into one (n_images, n_features) matrix.
representations = np.array(representations).reshape((len(x), -1))
sim = cosine_similarity(representations, representations)

# Map cosine similarity from [-1, 1] onto a distance in [0, 1].
distance = 1 - (sim + 1) / 2

# Force self-distances to exactly zero (they can differ from 0 by rounding).
# np.fill_diagonal only touches the diagonal; subtracting np.diag(distance)
# instead would broadcast the 1-D diagonal vector over every row, shifting all
# off-diagonal entries of column j by distance[j, j] and breaking symmetry.
np.fill_diagonal(distance, 0)

ViT-B/32: 100%|██████████| 279/279 [00:03<00:00, 80.48it/s]


In [17]:
# Store the fine-tuned results next to the baseline ones; the "_finetuned"
# suffix keeps the two sets of files apart.
fileName = "clip_transfer_" + modelLowerName + "_finetuned.npy"
np.save(distancePath + fileName, distance)
np.save(rawPath + fileName, representations)