In [8]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

In [9]:
from metaclip_arch.metaclip_v5 import MetaCLIP
device = "cuda:0" if torch.cuda.is_available() else "cpu"

metaclip_model = MetaCLIP(ratio=0.5)
path_weights = "./weights/metaclip_v5_ratio_05.pth"

metaclip_model.load_state_dict(torch.load(path_weights, map_location=torch.device('cpu')) )
metaclip_model.to(device)
metaclip_model.eval()

MetaCLIP(
  (encode_image): ImageEncoder(
    (fc1): Linear(in_features=768, out_features=1380, bias=True)
    (fc2): Linear(in_features=1380, out_features=768, bias=True)
  )
  (encode_text): TextEncoder(
    (fc1): Linear(in_features=768, out_features=1380, bias=True)
    (fc2): Linear(in_features=1380, out_features=768, bias=True)
  )
)

In [13]:
df_train_clip = pd.DataFrame()

with open('/home/docker_current/py_files/MLCLIP_exp/image_feat_array.npy', 'rb') as f:
    image_feat_array = np.load(f)

with open('/home/docker_current/py_files/MLCLIP_exp/text_feat_array.npy', 'rb') as f:
    text_feat_array = np.load(f)

X_train, X_test, y_train, y_test = train_test_split(text_feat_array, image_feat_array, 
                                                        test_size=0.25, random_state=42)

# просто весь датасет
# df_train_clip['image_features'] = [i.reshape(1,-1) for i in image_feat_array]
# df_train_clip['text_features'] = [i.reshape(1,-1) for i in text_feat_array]

# только тестовый датасет с примененным метаклипом
# df_train_clip['image_features'] = [metaclip_model.encode_image(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in X_test]
# df_train_clip['text_features'] = [metaclip_model.encode_text(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in y_test]

# весь датасет с примененным метаклипом
df_train_clip['image_features'] = [metaclip_model.encode_image(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in text_feat_array]
df_train_clip['text_features'] = [metaclip_model.encode_text(torch.Tensor(i.reshape(1,-1)).to(device)).cpu().detach().numpy() for i in image_feat_array]

In [14]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

def get_similarity(df, index_text, index_image):
    sim =cos(torch.Tensor(df_train_clip['text_features'][index_text]), 
                        torch.Tensor(df_train_clip['image_features'][index_image]))
    return sim

In [15]:
%%time
from tqdm.notebook import tqdm

count_true_map = 0
preds = []

for ind_text in tqdm(range(len(df_train_clip))):

    sims = []

    for ind_image in range(len(df_train_clip)):
        sim = get_similarity(df_train_clip, ind_text, ind_image)
        sims.append(sim)

    sims = np.array(sims)
    ind_max = np.argmax(sims)
    preds.append(ind_max)
    # print(ind_max, sims[ind_max])

    if ind_max == ind_text:
        count_true_map += 1

count_true_map

  0%|          | 0/2098 [00:00<?, ?it/s]

CPU times: user 4min 53s, sys: 907 ms, total: 4min 54s
Wall time: 4min 54s


1208

In [16]:
count_true_map, len(df_train_clip)

(1208, 2098)

#### 1208 - v5 ratio 0.5 not full
#### 1062 - v4 full ratio 0.2
#### 1665 - v3 symmetr full

#### 237 из 525 +++  v5 ratio 0.5
#### 227 из 525 +++ v4 960 из 2098 - residual
#### 224 из 525 +++ v4 876 из 2098 - residual + sym_dataset