In [1]:
# Step 1: Load the previously saved text chunks from disk
import pickle

# NOTE(review): unpickling can execute arbitrary code. "chunks.pkl" is presumably
# produced by an earlier step of this pipeline -- confirm it comes from a trusted source.
with open("chunks.pkl", mode="rb") as chunk_file:
    chunks = pickle.load(chunk_file)

# Report how many chunks were read so the later cells can be sanity-checked against it.
chunk_total = len(chunks)
print(f"Loaded {chunk_total} chunks.")

Loaded 481 chunks.


In [2]:
# Step 2: Embed every chunk with a HuggingFace sentence-transformers model
from sentence_transformers import SentenceTransformer

# The query cell must load the same model name, otherwise query and chunk
# vectors would not be comparable.
model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(model_name)

# Encode all chunks in one call; the progress bar shows batch-level progress.
embeddings = embedding_model.encode(chunks, show_progress_bar=True)

embedding_total = len(embeddings)
print(f"Generated {embedding_total} embeddings.")

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Generated 481 embeddings.


In [4]:
import chromadb

# Connect to the running ChromaDB server
chroma_client = chromadb.HttpClient(host="localhost", port=8000)

# Create or load collection
collection = chroma_client.get_or_create_collection(name="ndr_chunks")

# Guard against stale kernel state: every chunk needs exactly one embedding,
# otherwise documents and vectors would be paired incorrectly (or rejected).
if len(embeddings) != len(chunks):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(chunks)} chunks; "
        "re-run the embedding cell before storing."
    )

# IDs must be strings and unique. They are derived from the chunk position,
# so re-running this cell produces the same IDs for the same chunk order.
ids = [f"chunk-{i}" for i in range(len(chunks))]

# Use upsert rather than add so the cell is safe to re-run against the
# get-or-create collection: depending on the ChromaDB version, add() either
# rejects or silently skips IDs that already exist, which would leave stale
# documents/embeddings in the store after the chunks are regenerated.
# NOTE: IDs left over from an earlier, larger run are not removed by this.
collection.upsert(
    documents=chunks,
    embeddings=embeddings.tolist(),  # Convert from numpy to list of lists of floats
    ids=ids,
)

print("✅ Chunks + embeddings stored in persistent ChromaDB.")

✅ Chunks + embeddings stored in persistent ChromaDB.


In [9]:
from sentence_transformers import SentenceTransformer
import chromadb

# 1. Load the same embedding model that was used to embed the stored chunks,
#    so the query vector is comparable with the stored vectors
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 2. The text to search for
query_text = "TABLE OF BENEFITS FOR DOMESTIC COVER"

# 3. Embed the query; .tolist() is called on the result below before sending it
query_embedding = embedding_model.encode(query_text)

# 4. Connect to the ChromaDB server
chroma_client = chromadb.HttpClient(host="localhost", port=8000)

# 5. Open the existing collection (get_collection does not create it)
collection = chroma_client.get_collection(name="ndr_chunks")

# 6. Run the semantic search; the API takes a list of query vectors,
#    so the single query is wrapped in a list
query_vector = query_embedding.tolist()
results = collection.query(
    query_embeddings=[query_vector],
    n_results=5,
    include=["documents", "distances"],
)

# 7. Show the hits. Each result field holds one list per query; index 0
#    selects the hits for the only query that was sent.
hit_documents = results["documents"][0]
hit_ids = results["ids"][0]
hit_distances = results["distances"][0]

for rank, (hit_id, hit_distance, hit_text) in enumerate(
    zip(hit_ids, hit_distances, hit_documents), start=1
):
    print(f"\n🔍 Result {rank}")
    print(f"🆔 ID: {hit_id}")
    print(f"📏 Distance: {hit_distance:.4f}")
    print(f"📄 Document:\n{hit_text}")
    print("-" * 60)


🔍 Result 1
🆔 ID: chunk-120
📏 Distance: 0.8198
📄 Document:
Issuing Office: 
 
GLOBAL HEALTH CARE 
 
 
 
TABLE OF BENEFITS FOR DOMESTIC COVER 
COVER IMPERIAL PLAN IMPERIAL PLUS PLAN 
In-patient Hospitalization 
Treatment Limits 
INR 
3,750,000   
INR 
5,600,000  
INR 
7,500,000  
INR  
11,200,000  
INR 
18,750,000  
INR 
37,500,000  
In-patient Hospitalization 
Treatment Up to Sum Insured 
Hospital accommodation  
(Room rent and ICU) At Actual 
Pre-hospitalisation 60 days 
Post-hospitalisation 180 days 
Local (Road) Ambulance Up to Sum Insured
------------------------------------------------------------

🔍 Result 2
🆔 ID: chunk-338
📏 Distance: 0.8393
📄 Document:
Benefits (Optional cover) and Our liability, if any, shall only be in excess of that sum.  
b. If opted, an aggregate Deductible as specified in the Policy Schedule will apply for expenses under Inpatient 
plan benefits outside India.  
 
29. Cumulative Bonus (For Domestic Cover only):  
If You renew Your Global Health Care Polic