In [1]:
import pandas as pd
import numpy as np
import torch
import os
import sys
sys.path.append("/home/docker_current/py_files/MLCLIP_exp")
from MLCLIP_utils import get_text_encode_model, get_image_encode_model

device = "cuda:1" if torch.cuda.is_available() else "cpu"

In [2]:
from metaclip_arch.metaclip_v5 import MetaCLIP

metaclip_model = MetaCLIP(ratio=0.5)
path_weights = "./weights/metaclip_v5_ratio_05.pth"

metaclip_model.load_state_dict(torch.load(path_weights, map_location=torch.device('cpu')) )
metaclip_model.to(device)
metaclip_model.eval()
metaclip_model.ratio

0.5

In [4]:
from PIL import Image

# model_name='M-CLIP/XLM-Roberta-Large-Vit-L-14' # самый первый, он же дефолтный
# model_name = 'M-CLIP/XLM-Roberta-Large-Vit-B-32' #2.24 Gb
# model_name = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus' # Судя по метрикам самый лучший из данного зоопарка

image_model, image_preproc = get_image_encode_model()
text_model, text_tokenizer = get_text_encode_model()

def get_image_features(key):
    sample_image_path = "/home/docker_current/datasets/test/" + str(key) + ".png"
    image = Image.open(sample_image_path)
    image = image_preproc(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = image_model.encode_image(image)
        image_features = image_features.type(torch.cuda.FloatTensor)
        # image_features = metaclip_model.encode_image(image_features.to(device)).cpu().detach().numpy()
        image_features = metaclip_model.encode_text(image_features.to(device)).cpu().detach().numpy()

    return image_features

def get_text_features(df, index_text):
    sample_text = df['description'][index_text]
    text_features = text_model.forward(sample_text, text_tokenizer)
    with torch.no_grad():
        # text_features = metaclip_model.encode_text(text_features.to(device)).cpu().detach().numpy()
        text_features = metaclip_model.encode_image(text_features.to(device)).cpu().detach().numpy()
    return text_features 

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
#считываем датафрейм, добавляем столбик для эмбеддингов
# test_images_path = "/home/docker_current/datasets/test"
df_test = pd.read_csv("/home/docker_current/datasets/test.csv")
df_test['text_features'] = None
df_test['object_img'] = None

# считываем тестовые изображения, делаем словарь с эмбеддингами
test_images = [i.split('.png')[0] for i in os.listdir("/home/docker_current/datasets/test")]
test_embed = {i:None for i in test_images}

df_test.head(3)

Unnamed: 0,id,description,text_features,object_img
0,486,Фотография. Елизавета Алексеевна Юманова. ПКМ...,,
1,813,Фотография. Заседание комитета комсомола мотор...,,
2,2980,"Фотография. День ""Саланга"". ПОКМ-18530/638 фо...",,


In [6]:
%%time
from tqdm.notebook import tqdm

for ind_text in tqdm(range(len(df_test))):
    df_test['text_features'][ind_text] = get_text_features(df_test, ind_text)

  0%|          | 0/900 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


CPU times: user 1h 14min 48s, sys: 1min 26s, total: 1h 16min 14s
Wall time: 10min 49s


In [7]:
for name_image in tqdm(test_embed.keys()):
    test_embed[name_image] = get_image_features(name_image)

  0%|          | 0/900 [00:00<?, ?it/s]

In [8]:
from copy import deepcopy
copy_test_embed = deepcopy(test_embed)

In [9]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

def get_similarity(image_emb, text_emb):
    sim =cos(torch.Tensor(image_emb), 
                        torch.Tensor(text_emb))
    return sim

In [10]:
preds = []

# ind_text = 0
for ind_text in tqdm(range(len(df_test))):

    sims = []
    for image_name in copy_test_embed.keys():
        sim = get_similarity(copy_test_embed[image_name], 
                                    df_test['text_features'][ind_text])
        sims.append(sim)

    sims = np.array(sims)
    ind_max = np.argmax(sims)
    match_image = list(copy_test_embed.keys())[ind_max]
    preds.append(match_image)
    del copy_test_embed[match_image]

  0%|          | 0/900 [00:00<?, ?it/s]

In [11]:
submit = pd.read_csv('/home/docker_current/py_files/sample_solution.csv')
submit['object_img'] = preds
submit['object_img'] = submit['object_img'].astype(np.int64)
submit.to_csv('./submits/metaclip_v5_ratio_05.csv', index=False)