In [None]:
!pip install chromadb transformers torch


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import chromadb
from chromadb.config import Settings
from sklearn.metrics.pairwise import cosine_similarity

# Function to load the model and tokenizer
def load_model_and_tokenizer(model_name):
    """Load the pretrained tokenizer and model for one checkpoint name.

    Args:
        model_name: Identifier handed unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    # The tuple is evaluated left to right, so the tokenizer is loaded first.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModel.from_pretrained(model_name),
    )

# Function to generate embeddings
def generate_embeddings(texts, tokenizer, model):
    """Embed each text by mean-pooling the model's last hidden state.

    Every text is tokenized and encoded on its own (batch size 1), so no
    padding tokens take part in the mean.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True)``; its
            result is unpacked as keyword arguments for ``model``.
        model: Callable whose output exposes ``last_hidden_state``
            (assumed shape ``(1, seq_len, hidden)`` -- TODO confirm for
            non-Hugging-Face models).

    Returns:
        ``np.ndarray`` built from one pooled vector per text.
    """
    embeddings = []
    # Inference only: a single no_grad scope covers the whole loop.
    with torch.no_grad():
        for text in texts:
            # truncation=True asks the tokenizer to cut over-long inputs to
            # its configured maximum length instead of passing them through
            # to the model, where they would fail.
            inputs = tokenizer(text, return_tensors="pt", truncation=True)
            hidden_states = model(**inputs).last_hidden_state
            pooled = hidden_states.mean(dim=1)
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse any other axis that happens to have size 1.
            embeddings.append(pooled.squeeze(0).numpy())
    return np.array(embeddings)

# Function to store embeddings in ChromaDB
def store_embeddings_in_chromadb(embeddings, texts, collection_name="text_embeddings"):
    """Store one embedding per text in a ChromaDB collection.

    Each text is used both as the record ID and as the stored document.

    Args:
        embeddings: Iterable of vectors exposing ``.tolist()`` (e.g. rows of
            a NumPy array), paired positionally with ``texts``.
        texts: Iterable of strings. If a text occurs more than once, the
            embedding paired with its last occurrence is the one stored.
        collection_name: Name of the collection to write to.

    Returns:
        A ``(client, collection)`` tuple.
    """
    client = chromadb.Client(Settings())
    # get_or_create_collection keeps a re-run of this cell from failing when
    # the collection already exists on the client.
    collection = client.get_or_create_collection(collection_name)

    # Keyed by text so every ID in the batch is unique; zip() stops at the
    # shorter input, as the original per-item loop did.
    vectors_by_text = {
        text: embedding.tolist() for text, embedding in zip(texts, embeddings)
    }
    # Skip the write for empty input rather than sending an empty batch.
    if vectors_by_text:
        ids = list(vectors_by_text)
        # The Collection API takes parallel lists under plural keyword names
        # (ids / embeddings / documents). upsert makes re-runs idempotent.
        collection.upsert(
            ids=ids,
            embeddings=list(vectors_by_text.values()),
            documents=ids,
        )
    return client, collection

# Function to query embeddings from ChromaDB
def query_embeddings(collection, query_text, tokenizer, model, top_k=5):
    """Return the stored records nearest to ``query_text``.

    Args:
        collection: ChromaDB collection to search.
        query_text: Text to embed and use as the query.
        tokenizer: Tokenizer forwarded to ``generate_embeddings``.
        model: Model forwarded to ``generate_embeddings``.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of dicts, in the order ChromaDB returned them, each with:
        ``embedding_id`` (the record ID), ``similarity`` (cosine similarity
        between the query vector and the stored vector, computed here) and
        ``distance`` (the raw distance reported by ChromaDB). The list is
        empty when the collection is empty or ``top_k`` is not positive.
    """
    # Never request more neighbours than the collection holds.
    limit = min(top_k, collection.count())
    if limit <= 0:
        return []

    query_vector = np.asarray(
        generate_embeddings([query_text], tokenizer, model)[0], dtype=float
    )
    response = collection.query(
        query_embeddings=[query_vector.tolist()],
        n_results=limit,
        include=["embeddings", "distances"],
    )

    # Each field holds one inner list per query vector; only one was sent.
    ids = response["ids"][0]
    distances = response["distances"][0]
    neighbours = np.asarray(response["embeddings"][0], dtype=float)

    # ChromaDB reports a distance whose meaning depends on how the collection
    # was configured, so the cosine similarity is computed explicitly from
    # the returned vectors.
    norms = np.linalg.norm(neighbours, axis=1) * np.linalg.norm(query_vector)
    safe_norms = np.where(norms == 0, 1.0, norms)  # avoid division by zero
    similarities = (neighbours @ query_vector) / safe_norms

    return [
        {
            "embedding_id": record_id,
            "similarity": float(similarity),
            "distance": float(distance),
        }
        for record_id, similarity, distance in zip(ids, similarities, distances)
    ]

# Function to evaluate embeddings
def evaluate_embeddings(embeddings):
    """Compute the cosine-similarity matrix of ``embeddings`` with itself.

    Args:
        embeddings: Passed unchanged as the only argument to
            ``sklearn.metrics.pairwise.cosine_similarity``.

    Returns:
        The value returned by ``cosine_similarity(embeddings)``.
    """
    return cosine_similarity(embeddings)

# Example texts
texts = ["This is an example sentence.", "This is another sentence."]

# Load model and tokenizer
# "intfloat/e5-base" is the Hugging Face Hub id of the E5 base checkpoint;
# the previous value "huggingface-e5-base" is not a resolvable repo id.
# NOTE(review): the E5 model card recommends prefixing inputs with
# "query: " / "passage: " -- confirm whether that should be applied here.
model_name = "intfloat/e5-base"
tokenizer, model = load_model_and_tokenizer(model_name)

# Generate embeddings
embeddings = generate_embeddings(texts, tokenizer, model)
print("Embeddings generated successfully.")

# Store embeddings in ChromaDB
client, collection = store_embeddings_in_chromadb(embeddings, texts)
print("Embeddings stored in ChromaDB successfully.")

# Query embeddings from ChromaDB
query_text = "This is an example sentence."
results = query_embeddings(collection, query_text, tokenizer, model)

# Display results
for result in results:
    print(f"Text: {result['embedding_id']}, Similarity: {result['similarity']}")

# Evaluate embeddings
cosine_sim_matrix = evaluate_embeddings(embeddings)
print("Cosine similarity matrix calculated successfully.")
print(cosine_sim_matrix)
