# Database Updater for BNS Data

This notebook updates the ChromaDB vector store with new legal documents from the `data/bns_data` directory.

## Files to be processed:
- `bns_2024.pdf` - Bharatiya Nyaya Sanhita 2024
- `bnss_2024.pdf` - Bharatiya Nagarik Suraksha Sanhita 2024
- `bsa_2024.pdf` - Bharatiya Sakshya Adhiniyam 2024

## Step 1: Import Required Libraries

In [1]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Load environment variables
load_dotenv()

print("Libraries imported successfully!")

Libraries imported successfully!


## Step 2: Set Paths and Initialize Embeddings

In [2]:
# Set paths
BNS_DATA_DIR = "data/bns_data"
CHROMA_DIR = "chroma_db"

# Initialize embeddings model
def get_embeddings_model():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    return embeddings

embeddings = get_embeddings_model()
print(f"Embeddings model initialized: {embeddings.model_name}")
print(f"BNS Data directory: {BNS_DATA_DIR}")
print(f"ChromaDB directory: {CHROMA_DIR}")

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Embeddings model initialized: sentence-transformers/all-MiniLM-L6-v2
BNS Data directory: data/bns_data
ChromaDB directory: chroma_db


## Step 3: Check Available Files

In [3]:
# Check what files are available in bns_data directory
if os.path.exists(BNS_DATA_DIR):
    bns_files = [f for f in os.listdir(BNS_DATA_DIR) if f.endswith('.pdf')]
    print(f"Found {len(bns_files)} PDF files in {BNS_DATA_DIR}:")
    for file in bns_files:
        file_path = os.path.join(BNS_DATA_DIR, file)
        file_size = os.path.getsize(file_path) / (1024 * 1024)  # Size in MB
        print(f"  - {file} ({file_size:.2f} MB)")
else:
    print(f"Directory {BNS_DATA_DIR} not found!")
    bns_files = []

Found 4 PDF files in data/bns_data:
  - bnss_2024.pdf (1.94 MB)
  - bns_2024.pdf (1.26 MB)
  - bsa_2024.pdf (0.64 MB)
  - penal_code_India.pdf (1.05 MB)


## Step 4: Load and Process BNS Documents

In [4]:
def load_bns_documents():
    """Load PDF documents from BNS data directory"""
    documents = []
    
    for file in bns_files:
        file_path = os.path.join(BNS_DATA_DIR, file)
        try:
            print(f"Loading {file}...")
            loader = PyPDFLoader(file_path)
            loaded_docs = loader.load()
            
            # Add metadata to identify BNS documents
            for doc in loaded_docs:
                if not doc.metadata:
                    doc.metadata = {}
                doc.metadata["source_type"] = "bns_2024"
                doc.metadata["priority"] = "high"
                doc.metadata["document_category"] = "new_criminal_laws"
            
            documents.extend(loaded_docs)
            print(f"  Loaded {len(loaded_docs)} pages from {file}")
            
        except Exception as e:
            print(f"  Error loading {file}: {e}")
    
    return documents

# Load BNS documents
bns_documents = load_bns_documents()
print(f"\nTotal BNS document pages loaded: {len(bns_documents)}")

Loading bnss_2024.pdf...
  Loaded 249 pages from bnss_2024.pdf
Loading bns_2024.pdf...
  Loaded 102 pages from bns_2024.pdf
Loading bsa_2024.pdf...
  Loaded 47 pages from bsa_2024.pdf
Loading penal_code_India.pdf...
  Loaded 119 pages from penal_code_India.pdf

Total BNS document pages loaded: 517


## Step 5: Split Documents into Chunks

In [5]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into chunks for embedding"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")
    
    return chunks

# Split BNS documents with smaller chunks for better retrieval
bns_chunks = split_documents(bns_documents, chunk_size=800, chunk_overlap=150)

# Give BNS documents high priority by duplicating chunks (2x representation)
weighted_bns_chunks = bns_chunks * 2
print(f"Created {len(weighted_bns_chunks)} weighted BNS chunks (2x representation)")

Split 517 documents into 3070 chunks
Created 6140 weighted BNS chunks (2x representation)


## Step 6: Load Existing Vector Store

In [6]:
# Load existing vector store
if os.path.exists(CHROMA_DIR):
    print("Loading existing vector store...")
    vector_store = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings
    )
    
    # Check current collection size
    collection = vector_store._collection
    current_count = collection.count()
    print(f"Current vector store contains: {current_count} documents")
else:
    print(f"Vector store directory {CHROMA_DIR} not found!")
    print("Please run the main ingestion process first.")
    vector_store = None

Loading existing vector store...
Current vector store contains: 21004 documents


## Step 7: Add BNS Documents to Vector Store

In [7]:
if vector_store and weighted_bns_chunks:
    print("Adding BNS documents to existing vector store...")
    
    try:
        # Add the new BNS chunks to existing vector store
        vector_store.add_documents(weighted_bns_chunks)
        
        # Check updated collection size
        updated_count = vector_store._collection.count()
        added_count = updated_count - current_count
        
        print(f"Successfully added {added_count} BNS document chunks!")
        print(f"Updated vector store now contains: {updated_count} documents")
        
    except Exception as e:
        print(f"Error adding documents to vector store: {e}")
else:
    print("Cannot update vector store - either vector store not found or no BNS documents to add.")

Adding BNS documents to existing vector store...
Error adding documents to vector store: ValueError: Batch size of 6140 is greater than max batch size of 5461


## Step 8: Test Retrieval with BNS Content

In [8]:
if vector_store:
    # Test queries related to new criminal laws
    test_queries = [
        "What is Bharatiya Nyaya Sanhita?",
        "BNS 2024 provisions",
        "Bharatiya Nagarik Suraksha Sanhita",
        "New criminal laws India 2024"
    ]
    
    # Create retriever
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    
    print("Testing retrieval with BNS-related queries:\n")
    
    for query in test_queries:
        print(f"Query: {query}")
        docs = retriever.invoke(query)
        
        # Count BNS documents in results
        bns_count = sum(1 for doc in docs if doc.metadata.get("source_type") == "bns_2024")
        
        print(f"  Retrieved {len(docs)} documents")
        print(f"  BNS 2024 documents: {bns_count}")
        print(f"  Other documents: {len(docs) - bns_count}")
        
        if docs and docs[0].metadata.get("source_type") == "bns_2024":
            print(f"  ✓ BNS document found in top result")
        print()
else:
    print("Cannot test retrieval - vector store not available.")

Testing retrieval with BNS-related queries:

Query: What is Bharatiya Nyaya Sanhita?
  Retrieved 5 documents
  BNS 2024 documents: 0
  Other documents: 5

Query: BNS 2024 provisions
  Retrieved 5 documents
  BNS 2024 documents: 0
  Other documents: 5

Query: Bharatiya Nagarik Suraksha Sanhita
  Retrieved 5 documents
  BNS 2024 documents: 0
  Other documents: 5

Query: New criminal laws India 2024
  Retrieved 5 documents
  BNS 2024 documents: 0
  Other documents: 5



## Step 9: Summary and Completion

In [9]:
print("=" * 60)
print("DATABASE UPDATE SUMMARY")
print("=" * 60)

if bns_files:
    print(f"✓ Processed {len(bns_files)} BNS PDF files:")
    for file in bns_files:
        print(f"  - {file}")
    
    print(f"\n✓ Created {len(bns_chunks)} document chunks")
    print(f"✓ Applied 2x weighting: {len(weighted_bns_chunks)} total chunks")
    
    if vector_store:
        print(f"✓ Successfully updated ChromaDB vector store")
        print(f"✓ Database now contains enhanced BNS 2024 content")
        print("\n🎉 Update completed successfully!")
        print("\nThe legal assistant chatbot now has access to:")
        print("  • Bharatiya Nyaya Sanhita 2024")
        print("  • Bharatiya Nagarik Suraksha Sanhita 2024")
        print("  • Bharatiya Sakshya Adhiniyam 2024")
    else:
        print("⚠️  Vector store not found - please run main ingestion first")
else:
    print("⚠️  No BNS files found in data/bns_data directory")

print("\n" + "=" * 60)

DATABASE UPDATE SUMMARY
✓ Processed 4 BNS PDF files:
  - bnss_2024.pdf
  - bns_2024.pdf
  - bsa_2024.pdf
  - penal_code_India.pdf

✓ Created 3070 document chunks
✓ Applied 2x weighting: 6140 total chunks
✓ Successfully updated ChromaDB vector store
✓ Database now contains enhanced BNS 2024 content

🎉 Update completed successfully!

The legal assistant chatbot now has access to:
  • Bharatiya Nyaya Sanhita 2024
  • Bharatiya Nagarik Suraksha Sanhita 2024
  • Bharatiya Sakshya Adhiniyam 2024

