In [1]:
!pip install chromadb sentence-transformers

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.35.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.35.0-py3-none-any.whl.metadata (1.5 k

In [2]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# Step 1: Initialize ChromaDB Client
# Removed the 'chroma_db_impl' setting from Settings
client = chromadb.Client(chromadb.config.Settings(
    persist_directory="./chromadb_data"  # Directory to store the database
))

# Step 2: Set up a ChromaDB Collection
collection_name = "semantic_search"
if collection_name not in client.list_collections():
    collection = client.create_collection(name=collection_name)
else:
    collection = client.get_collection(name=collection_name)

# Step 3: Embed and Insert Documents
model = SentenceTransformer("all-MiniLM-L6-v2")  # Pre-trained model for embeddings
documents = [
    "Milvus is a vector database.",
    "Semantic search is a powerful technique.",
    "Machine learning models can create embeddings.",
    "Vector search finds relevant documents."
]

# Generate embeddings for the documents
embeddings = model.encode(documents).tolist()

# Add documents to the collection
ids = [f"doc_{i}" for i in range(len(documents))]
metadata = [{"source": f"Document {i+1}"} for i in range(len(documents))]
collection.add(
    documents=documents,
    embeddings=embeddings,
    metadatas=metadata,
    ids=ids
)

# Step 4: Perform Semantic Search
query = "How does vector search work?"
query_embedding = model.encode([query]).tolist()[0]

# Search the collection for top 3 similar results
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

# Step 5: Display Results
print("Search Results:")
for i, (doc, score, meta) in enumerate(zip(results["documents"][0], results["distances"][0], results["metadatas"][0])):
    print(f"{i+1}. Text: {doc} | Score: {1 - score:.4f} | Source: {meta['source']}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Search Results:
1. Text: Vector search finds relevant documents. | Score: 0.3185 | Source: Document 4
2. Text: Semantic search is a powerful technique. | Score: -0.0947 | Source: Document 2
3. Text: Milvus is a vector database. | Score: -0.1120 | Source: Document 1


In [None]:
client.delete_collection(name=collection_name)