In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

In [2]:
from metaclip_arch.metaclip_v4 import MetaCLIP
device = "cuda:0" if torch.cuda.is_available() else "cpu"

metaclip_model = MetaCLIP()
metaclip_model.eval()
path_weights = "./weights/metaclip_v4.pth"

metaclip_model.load_state_dict(torch.load(path_weights, map_location=torch.device('cpu')) )
metaclip_model.to(device)

MetaCLIP(
  (encode_image): ImageEncoder(
    (fc1): Linear(in_features=768, out_features=1380, bias=True)
    (fc2): Linear(in_features=1380, out_features=768, bias=True)
  )
  (encode_text): TextEncoder(
    (fc1): Linear(in_features=768, out_features=1380, bias=True)
    (fc2): Linear(in_features=1380, out_features=768, bias=True)
  )
)

In [3]:
df_train_clip = pd.DataFrame()

with open('/home/docker_current/py_files/MLCLIP_exp/image_feat_array.npy', 'rb') as f:
    image_feat_array = np.load(f)

with open('/home/docker_current/py_files/MLCLIP_exp/text_feat_array.npy', 'rb') as f:
    text_feat_array = np.load(f)

X_train, X_test, y_train, y_test = train_test_split(text_feat_array, image_feat_array, 
                                                        test_size=0.25, random_state=42)

# просто весь датасет
# df_train_clip['image_features'] = [i.reshape(1,-1) for i in image_feat_array]
# df_train_clip['text_features'] = [i.reshape(1,-1) for i in text_feat_array]

# только тестовый датасет с примененным метаклипом
df_train_clip['image_features'] = [metaclip_model.encode_image(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in X_test]
df_train_clip['text_features'] = [metaclip_model.encode_text(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in y_test]

# весь датасет с примененным метаклипом
# df_train_clip['image_features'] = [metaclip_model.encode_image(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in text_feat_array]
# df_train_clip['text_features'] = [metaclip_model.encode_text(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in image_feat_array]

In [4]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

def get_similarity(df, index_text, index_image):
    sim =cos(torch.Tensor(df_train_clip['text_features'][index_text]), 
                        torch.Tensor(df_train_clip['image_features'][index_image]))
    return sim

In [5]:
%%time
from tqdm.notebook import tqdm

count_true_map = 0
preds = []

for ind_text in tqdm(range(len(df_train_clip))):

    sims = []

    for ind_image in range(len(df_train_clip)):
        sim = get_similarity(df_train_clip, ind_text, ind_image)
        sims.append(sim)

    sims = np.array(sims)
    ind_max = np.argmax(sims)
    preds.append(ind_max)
    # print(ind_max, sims[ind_max])

    if ind_max == ind_text:
        count_true_map += 1

count_true_map

  0%|          | 0/525 [00:00<?, ?it/s]

CPU times: user 19.9 s, sys: 35.1 ms, total: 19.9 s
Wall time: 20 s


227

In [6]:
count_true_map, len(df_train_clip)

(227, 525)

In [None]:
1665