In [1]:
import torch
import numpy as np
from PIL import Image
import io
from transformers import CLIPModel, CLIPProcessor


class ClipHFEmbedder:

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32", device: str | None = None):
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        self.model = CLIPModel.from_pretrained(model_name).to(self.device).eval()
        self.processor = CLIPProcessor.from_pretrained(model_name)


    @property
    def dim(self) -> int:
        return self.model.config.projection_dim
    

    @torch.inference_mode()
    def embed_text(self, text: str) -> list[float]:
        inputs = self.processor(text=[text], return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        feat = self.model.get_text_features(**inputs)  # (1, 512)
        feat = torch.nn.functional.normalize(feat, p=2, dim=-1)
        return feat[0].detach().cpu().float().tolist()
    

    @torch.inference_mode()
    def embed_image_bytes(self, image_bytes: bytes) -> list[float]:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        inputs = self.processor(images=[img], return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        feat = self.model.get_image_features(**inputs)  # (1, 512)
        feat = torch.nn.functional.normalize(feat, p=2, dim=-1)
        return feat[0].detach().cpu().float().tolist()

In [9]:
# Display the installed transformers version so the outputs can be reproduced.
# NOTE(review): this cell's execution count (In [9]) is higher than that of the
# cells after it, so it was run out of order; re-run the notebook top to bottom
# on a fresh kernel before sharing.
import transformers

transformers.__version__

'4.53.0'

In [2]:
# Build the embedder with its defaults (model "openai/clip-vit-base-patch32",
# device chosen automatically). Construction calls from_pretrained for both the
# model and the processor, which produces the download progress output below.
cf = ClipHFEmbedder()

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [3]:
# Smoke test: embed a multi-line Russian-language string and display the whole
# returned vector.
# NOTE(review): confirm the default CLIP checkpoint gives meaningful embeddings
# for Russian text — it was presumably trained mostly on English captions.
# Consider displaying only a slice plus the length to keep saved output short.
cf.embed_text("Мотор 8 клоп кондей рабочий \nПолных ход \nМашина в одних руках больше 2-х лет")

[-0.0366520993411541,
 0.007337946444749832,
 0.004607334733009338,
 -0.005023053847253323,
 -0.013062392361462116,
 -0.02014089561998844,
 0.011622598394751549,
 0.038894206285476685,
 -0.0546308308839798,
 -0.02306338958442211,
 -0.002907936926931143,
 0.013788428157567978,
 -0.002336578443646431,
 -0.018096555024385452,
 -0.0043523856438696384,
 0.022962048649787903,
 -0.001895398716442287,
 0.011784632690250874,
 -0.0028276783414185047,
 0.024638555943965912,
 -0.026723913848400116,
 0.0066513544879853725,
 -0.012667447328567505,
 0.008441303856670856,
 0.0046138153411448,
 -0.010045012459158897,
 0.019371362403035164,
 0.04582398384809494,
 -0.01932503469288349,
 -0.000808657321613282,
 -0.010095618665218353,
 -0.010092661716043949,
 0.002713638823479414,
 0.028187021613121033,
 -0.0009844944579526782,
 -0.021058451384305954,
 -0.03119523636996746,
 -0.01756504364311695,
 -0.02746935747563839,
 0.03764566779136658,
 -0.023113101720809937,
 0.016237985342741013,
 0.0164499413222074