In [1]:
!pip install chromadb
!pip install sentence-transformers
!pip install transformers torch



You should consider upgrading via the 'C:\Users\prash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\prash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\prash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
# 1️⃣ Install required packages (run once in your environment)
# pip install pandas sentence-transformers transformers torch chromadb

import json
import os
import re
from getpass import getpass

import chromadb
import numpy as np
import pandas as pd
import torch
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm





In [5]:
# --------------------------
# 2️⃣ Load and clean ipc_qa.json
# --------------------------
# Read the raw IPC question/answer records from disk (decoded as UTF-8).
with open("ipc_qa.json", "r", encoding="utf-8") as f:
    ipc_data = json.load(f)

# Tabular view of the records; the cell below reads its "question" and
# "answer" columns, so the JSON is expected to provide both fields.
df_ipc = pd.DataFrame(ipc_data)

def clean_text(t):
    """Return `t` as a single-line string with whitespace normalised.

    Missing values (anything `pd.isna` reports as missing) become "".
    Everything else is converted with `str()`, newlines and tabs are turned
    into spaces, runs of whitespace are collapsed to one space, and leading /
    trailing whitespace is removed.
    """
    if not pd.isna(t):
        flattened = str(t).replace("\n", " ").replace("\t", " ")
        return re.sub(r"\s+", " ", flattened).strip()
    return ""

# Build one retrieval passage per record: cleaned question, a space, cleaned answer.
questions_clean = df_ipc["question"].apply(clean_text)
answers_clean = df_ipc["answer"].apply(clean_text)
df_ipc["context"] = questions_clean + " " + answers_clean

# Keep only passages longer than 30 characters.
is_long_enough = df_ipc["context"].str.len() > 30
df_ipc = df_ipc[is_long_enough]

print(f"✅ IPC QA loaded. Number of rows: {len(df_ipc)}")

✅ IPC QA loaded. Number of rows: 2267


In [6]:
# --------------------------
# 3️⃣ Chunk the text
# --------------------------
def chunk_text(text, max_chars=1000, overlap=200):
    """Split `text` into overlapping character windows.

    Each chunk holds at most `max_chars` characters and starts
    `max_chars - overlap` characters after the previous one, so consecutive
    chunks share `overlap` characters. The final chunk may be shorter.

    Args:
        text: String to split.
        max_chars: Maximum chunk length in characters; must be > 0.
        overlap: Characters shared by neighbouring chunks; must satisfy
            0 <= overlap < max_chars.

    Returns:
        List of chunk strings in order; an empty list for empty `text`.

    Raises:
        ValueError: If `max_chars` or `overlap` is out of range. Without this
            check an overlap >= max_chars never advances the window (infinite
            loop) and a negative overlap silently skips text between chunks.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if not 0 <= overlap < max_chars:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chars")

    step = max_chars - overlap  # distance between consecutive chunk starts
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + max_chars])
        # Stop once a window reaches the end of the text; a further window
        # would only repeat the tail that is already covered.
        if start + max_chars >= len(text):
            break
    return chunks

# Flatten the per-passage chunk lists into one list, preserving row order.
all_chunks = [
    piece
    for passage in df_ipc["context"]
    for piece in chunk_text(passage)
]

print("Total IPC chunks:", len(all_chunks))

Total IPC chunks: 2268


In [7]:
# --------------------------
# 4️⃣ Embed chunks
# --------------------------
# Encode every chunk with the MiniLM sentence-embedding model.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedder.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)

# L2-normalise each row. A zero-norm row would otherwise divide by zero and
# produce NaN vectors that get saved and uploaded; such rows are left as-is.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / np.where(norms == 0, 1.0, norms)

# Optional: save locally
with open("ipc_chunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)
np.save("ipc_embeddings.npy", embeddings)
print("✅ IPC chunks and embeddings saved!")

Batches: 100%|██████████| 71/71 [00:15<00:00,  4.72it/s]

✅ IPC chunks and embeddings saved!





In [8]:
# --------------------------
# 5️⃣ Upload to the same ChromaDB collection
# --------------------------
# SECURITY: never commit the Chroma Cloud API key. It is read from the
# CHROMA_API_KEY environment variable, falling back to an interactive prompt.
# The key that was previously hard-coded in this cell should be treated as
# compromised and rotated in the Chroma dashboard.
chroma_api_key = os.environ.get("CHROMA_API_KEY") or getpass("Chroma Cloud API key: ")

client = chromadb.CloudClient(
    api_key=chroma_api_key,
    # Tenant and database may be overridden through the environment.
    tenant=os.environ.get("CHROMA_TENANT", '341f973c-899b-4201-b5f0-87312864f14a'),
    database=os.environ.get("CHROMA_DATABASE", 'VECDB')
)

# Fetch the collection, creating it on first use. get_or_create_collection
# avoids depending on the element type returned by list_collections(), which
# has differed between chromadb releases.
collection_name = "constitution_articles"
collection = client.get_or_create_collection(name=collection_name)

# Free-tier safe: only upload up to 300 records at once
MAX_RECORDS = 300
for i in range(0, len(all_chunks), MAX_RECORDS):
    batch_chunks = all_chunks[i:i+MAX_RECORDS]
    batch_embeddings = embeddings[i:i+MAX_RECORDS].tolist()
    # IDs and metadata use the chunk's global position (i + j), so they stay
    # unique across batches.
    batch_ids = [f"ipc_{i+j}" for j in range(len(batch_chunks))]
    batch_metadatas = [{"source": f"ipc_chunk_{i+j}"} for j in range(len(batch_chunks))]

    collection.add(
        documents=batch_chunks,
        embeddings=batch_embeddings,
        ids=batch_ids,
        metadatas=batch_metadatas
    )
    print(f"✅ Added IPC batch {i//MAX_RECORDS + 1}, size: {len(batch_chunks)}")

print("✅ All IPC chunks added to ChromaDB!")

✅ Added IPC batch 1, size: 300
✅ Added IPC batch 2, size: 300
✅ Added IPC batch 3, size: 300
✅ Added IPC batch 4, size: 300
✅ Added IPC batch 5, size: 300
✅ Added IPC batch 6, size: 300
✅ Added IPC batch 7, size: 300
✅ Added IPC batch 8, size: 168
✅ All IPC chunks added to ChromaDB!


In [9]:
# --------------------------
# 6️⃣ Load FLAN-T5 for QA
# --------------------------
# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Pipeline device index: 0 when CUDA is available, otherwise -1
# (the recorded run of this cell reported "Device set to use cpu").
device = 0 if torch.cuda.is_available() else -1

# Text-to-text generation pipeline used by answer_question(); wraps the model
# and tokenizer loaded above and is pinned to the PyTorch framework.
qa_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    framework="pt"
)

Device set to use cpu


In [10]:
# --------------------------
# 7️⃣ Query functions
# --------------------------
def retrieve(query, top_k=5):
    """Return the documents the Chroma collection ranks closest to `query`.

    Args:
        query: Natural-language query text, passed to `collection.query`.
        top_k: Number of results requested from the collection.

    Returns:
        List of document strings for the single query. When the response has
        no documents, the one-element list ["No relevant context found."] is
        returned, so callers always receive a non-empty list of strings.
    """
    results = collection.query(query_texts=[query], n_results=top_k)
    # "documents" can be missing, None or an empty list; `or [[]]` covers all
    # three, whereas .get(key, default) only covers the missing-key case.
    docs = (results.get('documents') or [[]])[0]
    if not docs:
        return ["No relevant context found."]
    return docs

def answer_question(question, top_k=5, max_input_tokens=512):
    """Answer `question` with the QA pipeline, grounded on retrieved context.

    The prompt is "Context: ... Question: ... Answer:". Because the question
    comes after the context, an over-long prompt would lose the question if it
    were simply cut at the end. Instead, the lowest-ranked context chunks are
    dropped until the tokenized prompt fits `max_input_tokens` (at least one
    chunk is always kept).

    Args:
        question: User question text.
        top_k: Number of context chunks to retrieve.
        max_input_tokens: Token budget for the whole prompt. The default of
            512 matches the limit reported in this notebook's recorded
            tokenizer warning ("538 > 512").

    Returns:
        Tuple `(answer_text, retrieved)` where `retrieved` is the full list
        returned by `retrieve`, including any chunks left out of the prompt
        to respect the token budget.
    """
    retrieved = retrieve(question, top_k)

    prompt_chunks = list(retrieved)
    while True:
        context = "\n\n".join(prompt_chunks)
        prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
        n_tokens = len(qa_pipeline.tokenizer(prompt)["input_ids"])
        if n_tokens <= max_input_tokens or len(prompt_chunks) <= 1:
            break
        prompt_chunks.pop()  # drop the lowest-ranked remaining chunk

    # truncation=True is a last-resort guard for the case where a single chunk
    # plus the question still exceeds the model's input limit.
    result = qa_pipeline(prompt, max_length=256, do_sample=False, truncation=True)
    return result[0]["generated_text"], retrieved

In [11]:
# --------------------------
# 8️⃣ Interactive QA loop
# --------------------------
print("\n✅ QA system ready! Type your question (type 'exit' to quit).")
while True:
    try:
        # Strip surrounding whitespace so inputs like " exit " still quit.
        q = input("\nQuestion: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel or closing stdin ends the session cleanly
        # instead of leaving a traceback in the notebook.
        break
    if not q:
        # Nothing was typed; ask again rather than querying with an empty question.
        continue
    if q.lower() in ["exit", "quit"]:
        break
    ans, ctx = answer_question(q)
    print("\nAnswer:", ans)
    print("\n--- Retrieved Context ---")
    for c in ctx:
        # Show only the first 300 characters of each retrieved chunk.
        print(c[:300], "...\n")


✅ QA system ready! Type your question (type 'exit' to quit).

Answer: The Indian Penal Code

--- Retrieved Context ---
What is the act referred to in the text that has the full title of 'The Indian Penal Code'? The Act mentioned in the text ...

What is the full title of the Act mentioned in the text? The Indian Penal Code ...

What is the extent of operation for the Indian Penal Code? The whole of India except the State of Jammu and Kashmir ...

What is the title and extent of operation of the Indian Penal Code? The title is 'The Indian Penal Code' and its operation extends to the punishment of offences committed within India, and beyond but which by law may be tried within India. It also includes extension of the Code to extra-territorial  ...

When was the Indian Penal Code enacted? 6th October, 1860 ...



Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors



Answer: Which of the following is true of the person who abetted the offence defined in section 161?

--- Retrieved Context ---
What are some offences under this section related to obscene objects according to the law? Selling, letting to hire, distributing, publicly exhibiting, or any manner of putting into circulation any obscene object, as well as importing, exporting, or conveying any obscene object for the mentioned pur ...

What does the illustration under section 161 imply? The illustration implies that if a person offers a bribe to a public servant for a favour in their official functions and the public servant accepts the bribe, the person has abetted the offence defined in section 161. ...

akes, rivers and wild life, and to have compassion for living creatures; (h) to develop the scientific temper, humanism and the spirit of inquiry and reform; (i) to safeguard public property and to abjure violence; (j) to strive towards excellence in all spheres of individual and collecti