In [1]:
from transformers import BertTokenizer, BertModel
import torch
import chromadb

  from .autonotebook import tqdm as notebook_tqdm
2025-10-16 15:03:48.528879: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 1️⃣ Open a Chroma client backed by local persistent storage
CHROMA_DB_PATH = "./chroma_db"  # directory handed to the persistent client
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

In [3]:
# 2️⃣ Fetch the collection if it already exists, otherwise create it
COLLECTION_NAME = "text_embeddings"
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)


In [4]:
# 3️⃣ Load the pretrained BERT encoder and its matching tokenizer.
# Both are resolved from the same checkpoint name so they stay compatible.
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
# 4️⃣ Sample sentences that will be embedded and indexed
texts = [
    "A luminous red plastic bag sits alone on a dark beach.",
    "The ocean waves crash softly under the twilight sky.",
    "A glowing lantern floats over the calm river.",
]

In [6]:
# 5️⃣ Generate embeddings
def get_embedding(text: str) -> list[float]:
    """Embed ``text`` with the module-level BERT model and return a plain list.

    The text is tokenized with truncation enabled, moved to whatever device
    the model lives on, and passed through the model with gradient tracking
    disabled. The first row of ``pooler_output`` is returned.

    Note: according to the Hugging Face ``BertModel`` documentation,
    ``pooler_output`` is the [CLS] hidden state after an extra Linear + Tanh
    layer, not the raw [CLS] token vector.

    Args:
        text: The string to embed.

    Returns:
        The pooled embedding as a list of Python floats.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Keep the inputs on the same device as the model so the function also
    # works when the model has been moved off the CPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    pooled = outputs.pooler_output[0]
    # .cpu() is a no-op for CPU tensors and makes the conversion safe otherwise.
    return pooled.cpu().tolist()

# One embedding per entry in `texts`, in the same order.
embeddings = list(map(get_embedding, texts))


In [7]:
# 6️⃣ Store the texts and their vectors in Chroma
# IDs are derived from each text's position ("text1", "text2", ...) so the ID
# list can never drift out of sync with `texts`.
text_ids = [f"text{i}" for i in range(1, len(texts) + 1)]

# upsert (rather than add) keeps this cell re-runnable against the persistent
# store: records whose IDs already exist are overwritten instead of being
# rejected or silently skipped (which one depends on the Chroma version).
collection.upsert(
    ids=text_ids,          # unique IDs
    documents=texts,       # original text
    embeddings=embeddings  # numerical vectors
)

print("✅ Text embeddings successfully stored in ChromaDB!")

✅ Text embeddings successfully stored in ChromaDB!


In [8]:
# 7️⃣ Semantic search: embed the query, then ask Chroma for its nearest entries
query_text = "A glowing red bag by the sea."
query_emb = get_embedding(query_text)

TOP_K = 3  # how many of the most similar texts to return
results = collection.query(query_embeddings=[query_emb], n_results=TOP_K)

In [9]:
# 8️⃣ Print each matched text alongside its distance to the query
print("\n🔍 Query Results:")
# Only one query embedding was submitted, so read the first (index 0) entry
# of each result column.
matched_docs = results["documents"][0]
matched_dists = results["distances"][0]
for doc, dist in zip(matched_docs, matched_dists):
    print(f"Text: {doc}\nDistance: {dist:.4f}\n")


🔍 Query Results:
Text: A luminous red plastic bag sits alone on a dark beach.
Distance: 7.4502

Text: A glowing lantern floats over the calm river.
Distance: 11.2442

Text: The ocean waves crash softly under the twilight sky.
Distance: 15.9738

