In [None]:
from torchmetrics.multimodal import CLIPScore

# Build the torchmetrics CLIPScore metric on top of the OpenAI CLIP ViT-L/14 checkpoint.
# NOTE(review): from_pretrained-style names are usually fetched on first use — confirm network/cache access.
metric = CLIPScore(model_name_or_path="openai/clip-vit-large-patch14")

In [None]:
import cv2
import torchvision.transforms as transforms
from PIL import Image


def read_image(path):
    """Load an image file as a uint8 ``(3, 224, 224)`` tensor for ``CLIPScore``.

    The file is decoded with PIL, forced to RGB and resized to exactly
    224x224 (the aspect ratio is not preserved).  Pixel values are kept as
    integers in ``[0, 255]``: the CLIP image processor used by the
    torchmetrics metric rescales its input by 1/255 on its own, so a
    ``[0, 1]`` float tensor (what ``ToTensor`` produces) would be scaled
    down twice and scored as a nearly black image.

    Args:
        path: Path to an image file that PIL can open.

    Returns:
        torch.Tensor: channels-first uint8 tensor of shape ``(3, 224, 224)``.
    """
    # Close the underlying file as soon as the pixels have been decoded;
    # convert() returns an independent in-memory copy.
    with Image.open(path) as src:
        img = src.convert('RGB')

    img = transforms.Resize([224, 224])(img)

    # PILToTensor keeps the original 0-255 integer range (no 1/255 scaling).
    return transforms.PILToTensor()(img)


def show_image(path, size=(400, 400)):
    """Return a PIL preview of the image at ``path`` for inline display.

    The file is read with OpenCV, converted from BGR to RGB channel order
    and shrunk in place with ``Image.thumbnail`` so that it fits inside
    ``size``.

    Args:
        path: Path to the image file.
        size: ``(max_width, max_height)`` bounding box of the preview.
            Defaults to ``(400, 400)``.

    Returns:
        PIL.Image.Image: the RGB preview image.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``path``.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # rather than raising, which would otherwise surface as an opaque
        # cvtColor assertion error.
        raise FileNotFoundError(f"Could not read image: {path!r}")

    image = Image.fromarray(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    image.thumbnail(size, Image.Resampling.LANCZOS)

    return image

In [None]:
# First test sample: image path and a Russian caption
# (the caption reads roughly "Bread socket universe").
name = "clipscore_test/1.jpg"
text = "Хлеб розетка вселенная"
# Last expression of the cell: the preview image is rendered as the cell output.
show_image(name)

In [None]:
# Score the first test image against its caption with the torchmetrics CLIPScore metric.
# NOTE(review): the caption is Russian while the checkpoint is the OpenAI CLIP model —
# presumably this is the baseline the multilingual model is compared with; confirm.
cocoScore = metric(read_image(name), text)
# Display the score without its autograd history.
cocoScore.detach()

In [None]:
from multilingual_clip import pt_multilingual_clip
import transformers
import torch
import open_clip
from PIL import Image

# Multilingual text encoder (M-CLIP) and its matching tokenizer.
# model_caption is never moved to `device`, so text is encoded on the CPU.
model_name = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'

model_caption = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Image encoder: open_clip ViT-B/16+ (240px) with LAION-400M weights, plus the
# preprocessing transform that goes with it.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_img, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16-plus-240', pretrained="laion400m_e32")
# open_clip hands the model back in train mode; this notebook only runs
# inference, so switch to eval mode before any image is encoded.
model_img.eval()
model_img.to(device)

In [None]:
def compare_embeddings(logit_scale, img_embs, txt_embs):
    """Compute scaled cosine-similarity logits between image and text embeddings.

    Both embedding sets are L2-normalised along their last dimension, then
    every image is compared with every text.

    Args:
        logit_scale: Multiplier applied to the cosine similarities.
        img_embs: Image embeddings, one row per image.
        txt_embs: Text embeddings, one row per text.

    Returns:
        tuple: ``(logits_per_image, logits_per_text)`` where the first has one
        row per image and one column per text, and the second is laid out the
        other way round.
    """
    def _unit_rows(vectors):
        # Divide every row by its own Euclidean norm.
        return vectors / vectors.norm(dim=-1, keepdim=True)

    img_unit = _unit_rows(img_embs)
    txt_unit = _unit_rows(txt_embs)

    # Scale first, then take all pairwise dot products.
    per_image = (logit_scale * img_unit).matmul(txt_unit.t())
    per_text = (logit_scale * txt_unit).matmul(img_unit.t())

    return per_image, per_text

In [None]:
# The image model keeps its logit scale in log space: exponentiate it once and cast to
# float32 so it can be passed to compare_embeddings as the similarity multiplier.
logit_scale = model_img.logit_scale.exp().float()

In [None]:
def give_logits(text, name):
    """Score how well a caption matches an image with the multilingual CLIP pair.

    The caption is embedded with ``model_caption`` (M-CLIP text encoder) and
    the image with ``model_img`` (open_clip vision encoder); the two
    embeddings are then compared with ``compare_embeddings`` using the
    module-level ``logit_scale``.

    Args:
        text: Caption to score, passed to ``model_caption.forward`` together
            with ``tokenizer``.
        name: Path to the image file.

    Returns:
        torch.Tensor: image-to-text logits on ``device``; presumably of shape
        ``(1, 1)`` for a single caption string — TODO confirm.
    """
    # Decode and preprocess while the file is open; unsqueeze adds the batch axis.
    with Image.open(name) as curimg:
        images = preprocess(curimg).unsqueeze(0)

    # Pure inference: do not build an autograd graph for either encoder or
    # for the similarity computation.
    with torch.no_grad():
        text_features = model_caption.forward(text, tokenizer).to(device)
        image_features = model_img.encode_image(images.to(device)).float()
        img_logits, _ = compare_embeddings(logit_scale, image_features, text_features)

    return img_logits

In [None]:
# Second test image; preview it as the cell output.
name1 = "clipscore_test/2.jpg"
show_image(name1)

In [None]:
#«»  (NOTE(review): presumably guillemets kept handy for pasting into captions — confirm or remove)
# Russian caption, roughly "Theatre in the city of Nizhny Novgorod".
text1 = "Театр в городе Нижний Новгород"
# Display the multilingual CLIP image-to-text logit for the second test image.
give_logits(text1, name1)

In [None]:
# Third test image; preview it as the cell output.
name2 = "clipscore_test/3.jpg"
show_image(name2)

In [None]:
# Russian caption: the name "Alexander Sergeyevich Pushkin".
text2 = "Александр Сергеевич Пушкин"
# Display the multilingual CLIP image-to-text logit for the third test image.
give_logits(text2, name2)