In [1]:
import torch
import requests
import pandas as pd
from PIL import Image
from io import BytesIO
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from tqdm import tqdm

# Enable tqdm progress-bar integration for pandas
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_model_info(model_id, device):
  """Load the CLIP model, processor and tokenizer for one checkpoint.

  Args:
    model_id: Checkpoint identifier, passed unchanged to every
      ``from_pretrained`` call.
    device: Target handed to the model's ``.to()``; only the model is moved,
      the processor and tokenizer are returned as loaded.

  Returns:
    A ``(model, processor, tokenizer)`` tuple.
  """
  # Load the weights first and place them on the requested device.
  clip_model = CLIPModel.from_pretrained(model_id).to(device)

  # Tuple elements are evaluated left to right, so the processor is still
  # loaded before the tokenizer.
  return (
      clip_model,
      CLIPProcessor.from_pretrained(model_id),
      CLIPTokenizer.from_pretrained(model_id),
  )

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint identifier handed to get_model_info, which returns the model
# (moved to `device`), the processor and the tokenizer.
model_ID = "openai/clip-vit-base-patch32"
model, processor, tokenizer = get_model_info(model_ID, device)



In [4]:
def get_single_text_embedding(text):
    """Embed one piece of text with the CLIP text encoder.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to embed.

    Returns:
        The first row of the model's text features as a flat list of
        Python floats.
    """
    # The tokenizer builds its tensors on the CPU. Move them to the device that
    # holds the model weights, otherwise the forward pass fails with a device
    # mismatch whenever the model was loaded onto the GPU.
    # truncation=True keeps over-long text within the tokenizer's maximum
    # length instead of overflowing the text encoder's position table.
    inputs = tokenizer(text, return_tensors='pt', truncation=True).to(model.device)

    # Inference only: no autograd graph is needed for an embedding lookup.
    with torch.no_grad():
        text_embeddings = model.get_text_features(**inputs)

    # Bring the result back to the CPU and convert the single row to a list.
    return text_embeddings[0].detach().cpu().tolist()


# Quick check of the helper on a sample sentence; `embedding` is reassigned by
# the query cell further down.
embedding = get_single_text_embedding('a very typical bus station')

In [5]:
import chromadb
# Location of the Chroma database directory, relative to the working directory.
db_path = r"./server/database/database_multimodal"

# Client bound to the database stored at `db_path`.
client = chromadb.PersistentClient(path=db_path)

In [7]:
# Open the collection if it already exists, otherwise create it; the metadata
# asks for cosine as the HNSW distance space.
collection = client.get_or_create_collection(
    name='text_to_image_collection',
    metadata={"hnsw:space": "cosine"},
)

In [11]:
# Embed the query text, then ask the collection for its 5 closest entries,
# requesting their metadata and distances.
embedding = get_single_text_embedding(
    'functions of government : 1 . form a more perfect union')
results = collection.query(
    query_embeddings=embedding,
    n_results=5,
    include=['metadatas', 'distances']
)

In [12]:
# Display the raw query response.
results

{'ids': [['id_9', 'id_945', 'id_166', 'id_373', 'id_1331']],
 'distances': [[0.5971044301986694,
   0.7362351417541504,
   0.7395954132080078,
   0.7414260506629944,
   0.7416597008705139]],
 'metadatas': [[{'caption': 'functions of government : 1 . form a more perfect union'},
   {'caption': 'large group of people in the shape of circle .'},
   {'caption': 'peaking through the blossoms to catch a glimpse .'},
   {'caption': 'parts of the fish and the function'},
   {'caption': 'a flag waves at the building as filming location prepares for second inauguration .'}]],
 'embeddings': None,
 'documents': None,
 'uris': None,
 'data': None}