In [None]:
!pip install pandas numpy regex
!pip install chromadb
!pip install sentence-transformers
!pip install transformers torch

You should consider upgrading via the 'C:\Users\prash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\prash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\prash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\prash\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
# --------------------------
# 1️⃣ Import packages
# --------------------------
# Install them once in your environment before running this cell:
# pip install pandas numpy sentence-transformers transformers torch chromadb

import json
import os
import re

import chromadb
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# --------------------------
# 2️⃣ Load and clean your dataset
# --------------------------
# Read the whole JSON corpus as UTF-8 text and decode it in a single step.
with open("ipc_constitution.json", mode="r", encoding="utf-8") as corpus_file:
    law_data = json.loads(corpus_file.read())

# Tabulate the decoded payload so the later cells can work column-wise.
df = pd.DataFrame(law_data)

def clean_text(t):
    """Return ``t`` as a single-line string with normalised whitespace.

    Args:
        t: Any scalar value. Values that ``pd.isna`` reports as missing
            (``None``, ``NaN``, ...) are treated as absent text.

    Returns:
        ``""`` for missing values; otherwise ``str(t)`` with newlines and tabs
        turned into spaces, every run of whitespace collapsed to one space,
        and leading/trailing whitespace removed.
    """
    if pd.isna(t):
        return ""

    flattened = str(t)
    # Line breaks and tabs become plain spaces first ...
    for control_char in ("\n", "\t"):
        flattened = flattened.replace(control_char, " ")
    # ... then every whitespace run is squeezed down to a single space.
    squeezed = re.sub(r"\s+", " ", flattened)
    return squeezed.strip()

# Derive the whitespace-normalised "context" column from the raw "document" column.
cleaned_documents = df["document"].map(clean_text)
df["context"] = cleaned_documents
print(f"✅ Laws dataset loaded. Number of rows: {len(df)}")

✅ Laws dataset loaded. Number of rows: 35


In [4]:
# --------------------------
# 3️⃣ Chunk the text
# --------------------------
def chunk_text(text, max_chars=800, overlap=150):
    """Split ``text`` into overlapping, whitespace-stripped character windows.

    Each window starts ``max_chars - overlap`` characters after the previous
    one, so consecutive chunks share ``overlap`` characters of raw text.

    Args:
        text: The string to split.
        max_chars: Maximum number of characters per window; must be positive.
        overlap: Number of characters shared between consecutive windows;
            must be smaller than ``max_chars``.

    Returns:
        A list of chunks in document order. An empty ``text`` yields ``[]``;
        a text no longer than ``max_chars`` yields a single chunk.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is not
            smaller than ``max_chars``. Such settings would keep the window
            from ever advancing, i.e. loop forever.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if overlap >= max_chars:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_chars ({max_chars})"
        )

    chunks = []
    step = max_chars - overlap  # distance between consecutive window starts
    for start in range(0, len(text), step):
        end = start + max_chars
        chunks.append(text[start:end].strip())
        # The window that reaches the end of the text is the last one; without
        # this break a further, fully-overlapped tail chunk would be emitted.
        if end >= len(text):
            break
    return chunks

# Parallel accumulators: position i in each list describes the same chunk.
all_chunks, all_ids, all_metadata = [], [], []

for _, record in df.iterrows():
    context = record["context"]
    # Only non-blank string contexts are chunked; every other row is skipped.
    if isinstance(context, str) and context.strip():
        for position, piece in enumerate(chunk_text(context)):
            all_chunks.append(piece)
            all_ids.append(f"{record['id']}_chunk{position}")
            # Copy the row's own metadata, then add the chunk index and the
            # row's "type" value ("" when the row has no "type" label).
            all_metadata.append(
                {**record["metadata"], "chunk": position, "type": record.get("type", "")}
            )

print("✅ Total text chunks prepared:", len(all_chunks))

✅ Total text chunks prepared: 35


In [5]:
# --------------------------
# 4️⃣ Create embeddings
# --------------------------
# Encode every chunk with the MiniLM sentence encoder ...
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedder.encode(
    all_chunks,
    show_progress_bar=True,
    convert_to_numpy=True,
)
# ... then rescale each row to unit L2 length.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.75it/s]


In [6]:
# --------------------------
# 5️⃣ Upload to ChromaDB
# --------------------------
# Flatten list-valued metadata fields into comma-separated strings, in place.
# NOTE(review): presumably needed because the vector store only accepts scalar
# metadata values — confirm against the ChromaDB version in use.
for meta in all_metadata:
    flattened = {
        key: ", ".join(str(item) for item in value)
        for key, value in meta.items()
        if isinstance(value, list)
    }
    meta.update(flattened)

# Initialize ChromaDB client.
# The API key is read from the environment so the secret never lives in the
# notebook, where it would leak through version control or shared copies.
# NOTE(review): the key that used to be hardcoded in this cell should be
# treated as compromised and rotated.
chroma_api_key = os.environ.get("CHROMA_API_KEY")
if not chroma_api_key:
    raise RuntimeError(
        "Set the CHROMA_API_KEY environment variable before running this cell."
    )

# Tenant and database may be overridden through the environment; the defaults
# are the values this notebook has always used.
client = chromadb.CloudClient(
    api_key=chroma_api_key,
    tenant=os.environ.get("CHROMA_TENANT", "f9ac33d4-f162-4b85-b38d-1c8b719366e6"),
    database=os.environ.get("CHROMA_DATABASE", "VECDB"),
)

collection_name = "indian_laws"
# get_or_create_collection does the lookup and the creation in one call. This
# avoids the list-then-create race and does not depend on what
# list_collections() returns (Collection objects or bare names, depending on
# the ChromaDB release), which the previous `c.name` comparison relied on.
collection = client.get_or_create_collection(name=collection_name)

# Batch upload
# NOTE(review): 300 is presumably chosen to stay under the service's
# per-request record limit — confirm against the current ChromaDB quotas.
MAX_RECORDS = 300
for i in range(0, len(all_chunks), MAX_RECORDS):
    batch = slice(i, i + MAX_RECORDS)
    batch_chunks = all_chunks[batch]

    # upsert (rather than add) keeps this cell idempotent: re-running it after
    # the source data changed refreshes the records stored under existing ids
    # instead of leaving stale documents/embeddings in the collection.
    collection.upsert(
        ids=all_ids[batch],
        documents=batch_chunks,
        embeddings=embeddings[batch].tolist(),
        metadatas=all_metadata[batch],
    )
    print(f"✅ Added batch {i//MAX_RECORDS + 1}, size: {len(batch_chunks)}")

print("✅ All law chunks added to ChromaDB!")

✅ Added batch 1, size: 35
✅ All law chunks added to ChromaDB!


In [7]:
# --------------------------
# 6️⃣ Load FLAN-T5 for QA
# --------------------------
# Load the tokenizer and seq2seq weights for the FLAN-T5 checkpoint.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Pipeline device convention: 0 selects the first CUDA GPU, -1 selects the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

qa_pipeline = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    device=device,
)

Device set to use cpu


In [9]:
# --------------------------
# 7️⃣ Retrieval + Answering
# --------------------------
def retrieve(query, top_k=5):
    """Return the ``top_k`` stored chunks closest to ``query``.

    The query is embedded with the same ``embedder`` model and the same
    unit-length normalisation that were used for the stored chunks. Passing
    ``query_texts`` instead would delegate to the collection's own embedding
    function, which is not guaranteed to produce vectors comparable to the
    ones uploaded by this notebook.

    Args:
        query: The natural-language question to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of ``(document_text, metadata_dict)`` tuples in the order the
        collection returned them; empty when nothing was found.
    """
    query_vec = embedder.encode([query], convert_to_numpy=True)
    query_vec = query_vec / np.linalg.norm(query_vec, axis=1, keepdims=True)

    results = collection.query(query_embeddings=query_vec.tolist(), n_results=top_k)
    # Results are nested per query; only one query was sent, so take entry 0.
    # `or [[]]` also covers a key that is present but set to None.
    docs = (results.get("documents") or [[]])[0]
    metas = (results.get("metadatas") or [[]])[0]
    # Always hand back a dict per hit so callers can use meta.get(...) safely.
    return [(doc, meta or {}) for doc, meta in zip(docs, metas)]

def answer_question(question, top_k=5):
    """Answer ``question`` from the chunks retrieved for it.

    Args:
        question: The user's question.
        top_k: Number of chunks to request from ``retrieve``.

    Returns:
        A ``(answer_text, retrieved)`` tuple, where ``retrieved`` is the list
        of ``(document, metadata)`` pairs used as context. When nothing is
        retrieved the answer is ``"No relevant context found."`` and the list
        is empty.
    """
    hits = retrieve(question, top_k)
    if not hits:
        return "No relevant context found.", []

    # Blank-line-separated passages, in retrieval order, form the context.
    passages = [doc for doc, _ in hits]
    context = "\n\n".join(passages)
    prompt = (
        "Answer the following legal question using the provided context.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        "Answer clearly and concisely:"
    )

    # do_sample=False: deterministic decoding, no sampling.
    generations = qa_pipeline(prompt, max_length=300, do_sample=False)
    return generations[0]["generated_text"], hits

In [10]:
# --------------------------
# 8️⃣ Interactive QA loop
# --------------------------
print("\n✅ Legal QA system ready! Type your question (type 'exit' to quit).")
while True:
    try:
        # Strip surrounding whitespace so inputs like "exit " still quit.
        q = input("\nQuestion: ").strip()
    except (EOFError, KeyboardInterrupt, NotImplementedError):
        # Input stream closed, the user interrupted, or the frontend cannot
        # prompt for input: end the session instead of crashing the cell.
        # NOTE(review): NotImplementedError is meant to cover IPython's
        # StdinNotImplementedError during non-interactive runs — confirm.
        break

    if q.lower() in ("exit", "quit"):
        break
    if not q:
        # Nothing was typed; ask again rather than querying with an empty string.
        continue

    ans, ctx = answer_question(q)

    print("\n🧑‍⚖️ Answer:\n", ans)
    print("\n📚 Sources:")
    for doc, meta in ctx:
        meta = meta or {}  # tolerate hits that carry no metadata
        ref = meta.get("source", "Unknown Source")
        act = meta.get("act", "")
        # Prefer "section"; fall back to "article" when no section is present.
        sec = meta.get("section", meta.get("article", ""))
        print(f"- {ref} ({act} {sec})")


✅ Legal QA system ready! Type your question (type 'exit' to quit).

🧑‍⚖️ Answer:
 IPC Section 307

📚 Sources:
- IPC Bare Act (Indian Penal Code 302)
- IPC Bare Act (Indian Penal Code 307)
- IPC Section 94 (Indian Penal Code 94)
- IPC Bare Act (Indian Penal Code 120B)
- IPC Section 100 (Indian Penal Code 100)
