In [1]:
# ================================================================
# Bangladeshi Law RAG System - Step by Step Implementation
# ================================================================
# A comprehensive RAG system for querying Bangladeshi legal documents
# Using: GROQ Llama, Multilingual E5 Embeddings, Pinecone Vector DB
# ================================================================

# ================================================================
# STEP 1: Install Required Packages
# ================================================================
print("🚀 Installing required packages...")

%pip install -q langchain langchain-community
%pip install -q sentence-transformers
%pip install -q groq
%pip install -q pinecone-client
%pip install -q langchain-pinecone
%pip install -q langchain-groq
%pip install -q unstructured
%pip install -q numpy
%pip install -q torch

print("✅ All packages installed successfully!")

🚀 Installing required packages...
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.




Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
✅ All packages installed successfully!


In [None]:
# ================================================================
# STEP 2: Import Required Libraries
# ================================================================
print("\n📚 Importing required libraries...")

import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import numpy as np
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer

# LangChain components
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.embeddings.base import Embeddings
from langchain_groq import ChatGroq

# Vector database
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

print("✅ All libraries imported successfully!")


📚 Importing required libraries...
✅ All libraries imported successfully!


In [None]:
# pinecone_api=pcsk_34YrMp_DL17YVb7mMpx5GFLixVXGhyrXTorB5FsYTN5AwAWDaKAKkormw7DYMYdrXn8PjA
# GROQ_API_KEY=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

In [None]:
# ================================================================
# STEP 3: Set API Keys (Replace with your actual keys)
# ================================================================
print("\n🔑 Setting up API keys...")

# # Method 1: Direct assignment (replace with your actual keys)
# GROQ_API_KEY = "your_groq_api_key_here"
# PINECONE_API_KEY = "your_pinecone_api_key_here"

#Method 2: Using Colab secrets (recommended)
from google.colab import userdata
GROQ_API_KEY = userdata.get('GROQ_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

# Set environment variables
os.environ['GROQ_API_KEY'] = GROQ_API_KEY
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

print("✅ API keys configured!")


🔑 Setting up API keys...
✅ API keys configured!


In [None]:
# ================================================================
# STEP 4: Define Custom Embedding Class for Multilingual E5
# ================================================================
print("\n🧠 Setting up Multilingual E5 Embeddings...")

class MultilingualE5Embeddings(Embeddings):
    """
    Custom embedding class using multilingual-e5-large model
    Optimized for Bangla and English legal documents
    """

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        print("✅ Embedding model loaded successfully!")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents with passage prefix"""
        # Add passage prefix for better retrieval performance
        prefixed_texts = [f"passage: {text}" for text in texts]
        embeddings = self.model.encode(prefixed_texts, normalize_embeddings=True)
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query with query prefix"""
        # Add query prefix for better retrieval performance
        prefixed_text = f"query: {text}"
        embedding = self.model.encode(prefixed_text, normalize_embeddings=True)
        return embedding.tolist()

# Initialize embeddings
embeddings = MultilingualE5Embeddings()


🧠 Setting up Multilingual E5 Embeddings...
Loading embedding model: intfloat/multilingual-e5-large
✅ Embedding model loaded successfully!


In [None]:
# ================================================================
# STEP 5: Initialize GROQ LLM
# ================================================================
print("\n🤖 Setting up GROQ LLM...")

# Initialize the GROQ LLM with Llama model
llm = ChatGroq(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0.1,  # Low temperature for legal accuracy
    max_tokens=1024,
    timeout=30,
    max_retries=2
)

print("✅ GROQ LLM initialized successfully!")


🤖 Setting up GROQ LLM...
✅ GROQ LLM initialized successfully!


In [None]:
# # ================================================================
# # STEP 6: Setup Pinecone Vector Database
# # ================================================================
# print("\n🗄️ Setting up Pinecone Vector Database...")

# # Initialize Pinecone client
# pc = Pinecone(api_key=PINECONE_API_KEY)

# # Define index name
# index_name = "bangladeshi-law-rag"

# # Check if index exists, create if not
# existing_indexes = pc.list_indexes().names()
# if index_name not in existing_indexes:
#     print(f"Creating new Pinecone index: {index_name}")
#     pc.create_index(
#         name=index_name,
#         dimension=1024,  # multilingual-e5-large dimension
#         metric="cosine",
#         spec=ServerlessSpec(
#             cloud="aws",
#             region="us-east-1"
#         )
#     )
#     print("✅ New Pinecone index created!")
# else:
#     print(f"✅ Using existing Pinecone index: {index_name}")


🗄️ Setting up Pinecone Vector Database...
✅ Using existing Pinecone index: bangladeshi-law-rag


In [None]:
# ================================================================
# STEP 7: Define URLs for Bangladeshi Legal Documents
# ================================================================
print("\n📋 Defining URLs for legal documents...")

# Example URLs - Replace with actual Bangladeshi legal document URLs
BANGLADESHI_LAW_URLS = [

"http://bdlaws.minlaw.gov.bd//act-1429/part-536.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51793.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51794.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51795.html",
"http://bdlaws.minlaw.gov.bd//act-1429/part-537.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51796.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51797.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51798.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51799.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51800.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51801.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51802.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51803.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51804.html",
"http://bdlaws.minlaw.gov.bd//act-1429/part-538.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51805.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51806.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51807.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51808.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51809.html",
"http://bdlaws.minlaw.gov.bd//act-1429/part-539.html",
"http://bdlaws.minlaw.gov.bd//act-1429/chapter-2333.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51810.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51811.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51812.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51813.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51814.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51815.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51816.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51817.html",
"http://bdlaws.minlaw.gov.bd//act-1429/chapter-2334.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51818.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51819.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51820.html",
"http://bdlaws.minlaw.gov.bd//act-1429/part-540.html",
"http://bdlaws.minlaw.gov.bd//act-1429/chapter-2335.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51821.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51822.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51823.html",
"http://bdlaws.minlaw.gov.bd//act-1429/chapter-2336.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51824.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51825.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51826.html",
"http://bdlaws.minlaw.gov.bd//act-1429/chapter-2337.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51827.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51828.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51829.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51830.html",
"http://bdlaws.minlaw.gov.bd//act-1429/section-51831.html",
"http://bdlaws.minlaw.gov.bd//act-1429/chapter-2338.html",
]


print(f"📄 Configured {len(BANGLADESHI_LAW_URLS)} URLs for document scraping")


📋 Defining URLs for legal documents...
📄 Configured 50 URLs for document scraping


In [None]:
# # ================================================================
# # STEP 8: Load and Process Legal Documents
# # ================================================================
# print("\n📥 Loading and processing legal documents...")

# # Load documents from URLs
# print("Loading documents from URLs...")
# loader = UnstructuredURLLoader(urls=BANGLADESHI_LAW_URLS)
# documents = loader.load()

# print(f"✅ Loaded {len(documents)} documents")

# # Split documents into chunks
# print("Splitting documents into chunks...")


📥 Loading and processing legal documents...
Loading documents from URLs...




✅ Loaded 50 documents
Splitting documents into chunks...


In [None]:
# text_splitter = CharacterTextSplitter(
#     separator='\n',
#     chunk_size=1000,  # Smaller chunks for legal documents
#     chunk_overlap=200,  # Overlap to maintain context
#     length_function=len
# )

# text_chunks = text_splitter.split_documents(documents)
# print(f"✅ Split into {len(text_chunks)} chunks")

# # Display sample chunk
# if text_chunks:
#     print(f"\n📄 Sample chunk preview:")
#     print("=" * 50)
#     print(text_chunks[0].page_content[:300] + "...")
#     print("=" * 50)

In [None]:
# !pip install langchain unstructured[all-docs] requests retrying
# !pip install unstructured

In [None]:
# import logging
# import urllib.parse
# import requests
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# # from unstructured.documents.html import HTMLDocument
# from unstructured.cleaners.core import clean
# from concurrent.futures import ThreadPoolExecutor
# from retrying import retry

# # Set up logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

# print("\n📥 Loading and processing legal documents...")

# # Fix URL formatting and validate URLs
# def normalize_url(url):
#     # Remove double slashes and normalize URL
#     parsed = urllib.parse.urlparse(url)
#     path = urllib.parse.urljoin(parsed.path, '')
#     normalized = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, path, '', '', ''))
#     return normalized

# def validate_urls(urls):
#     valid_urls = []
#     failed_urls = []
#     for url in urls:
#         normalized_url = normalize_url(url)
#         try:
#             response = requests.head(normalized_url, timeout=10, allow_redirects=True)
#             if response.status_code == 200:
#                 valid_urls.append(normalized_url)
#                 logger.info(f"Valid URL: {normalized_url}")
#             else:
#                 logger.warning(f"Invalid URL: {normalized_url} (Status: {response.status_code})")
#                 failed_urls.append(normalized_url)
#         except requests.RequestException as e:
#             logger.error(f"Failed to validate URL: {normalized_url} ({e})")
#             failed_urls.append(normalized_url)
#     return valid_urls, failed_urls

# # Normalize and validate URLs
# # Placeholder: Replace with your actual BANGLADESHI_LAW_URLS list
# BANGLADESHI_LAW_URLS = [
#     "http://bdlaws.minlaw.gov.bd//",
#     "http://bdlaws.minlaw.gov.bd//laws-of-bangladesh-chronological-index.html",
#     "http://bdlaws.minlaw.gov.bd//laws-of-bangladesh-alphabetical-index.html",
#     "http://bdlaws.minlaw.gov.bd//search.html",
#     "http://bdlaws.minlaw.gov.bd//related-links.html",
#     "http://bdlaws.minlaw.gov.bd//contact-us.html",
#     "http://old.bdlaws.minlaw.gov.bd/",
#     "http://bdlaws.minlaw.gov.bd//how-to-search.html",
#     "http://bdlaws.minlaw.gov.bd//how-to-print.html",
#     "http://bdlaws.minlaw.gov.bd//glossary.html",
#     "http://bdlaws.minlaw.gov.bd//roman-number.html",
#     "http://bdlaws.minlaw.gov.bd//feedback-suggestion.html",
#     "http://bdlaws.minlaw.gov.bd/?lang=bn",
#     "http://bdlaws.minlaw.gov.bd/?lang=en",
#     "http://bdlaws.minlaw.gov.bd//volume-53.html",
#     "http://bdlaws.minlaw.gov.bd//chronological-index-acts.html",
#     "http://bdlaws.minlaw.gov.bd//volume-1.html",
#     "http://bdlaws.minlaw.gov.bd//volume-2.html",
#     "http://bdlaws.minlaw.gov.bd//volume-3.html",
#     "http://bdlaws.minlaw.gov.bd//volume-4.html",
#     "http://bdlaws.minlaw.gov.bd//volume-5.html",
#     "http://bdlaws.minlaw.gov.bd//volume-6.html",
#     "http://bdlaws.minlaw.gov.bd//volume-7.html",
#     "http://bdlaws.minlaw.gov.bd//volume-8.html",
#     "http://bdlaws.minlaw.gov.bd//volume-9.html",
#     "http://bdlaws.minlaw.gov.bd//volume-10.html",
#     "http://bdlaws.minlaw.gov.bd//volume-11.html",
#     "http://bdlaws.minlaw.gov.bd//volume-12.html",
#     "http://bdlaws.minlaw.gov.bd//volume-13.html",
#     "http://bdlaws.minlaw.gov.bd//volume-14.html",
#     "http://bdlaws.minlaw.gov.bd//volume-15.html",
#     "http://bdlaws.minlaw.gov.bd//volume-16.html",
#     "http://bdlaws.minlaw.gov.bd//volume-17.html",
#     "http://bdlaws.minlaw.gov.bd//volume-18.html",
#     "http://bdlaws.minlaw.gov.bd//volume-19.html",
#     "http://bdlaws.minlaw.gov.bd//volume-20.html",
#     "http://bdlaws.minlaw.gov.bd//volume-21.html",
#     "http://bdlaws.minlaw.gov.bd//volume-22.html",
#     "http://bdlaws.minlaw.gov.bd//volume-23.html",
#     "http://bdlaws.minlaw.gov.bd//volume-24.html",
#     "http://bdlaws.minlaw.gov.bd//volume-25.html",
#     "http://bdlaws.minlaw.gov.bd//volume-26.html",
#     "http://bdlaws.minlaw.gov.bd//volume-27.html",
#     "http://bdlaws.minlaw.gov.bd//volume-28.html",
#     "http://bdlaws.minlaw.gov.bd//volume-29.html",
#     "http://bdlaws.minlaw.gov.bd//volume-30.html",
#     "http://bdlaws.minlaw.gov.bd//volume-31.html",
#     "http://bdlaws.minlaw.gov.bd//volume-32.html",
#     "http://bdlaws.minlaw.gov.bd//volume-33.html",
#     "http://bdlaws.minlaw.gov.bd//volume-34.html",
#     "http://bdlaws.minlaw.gov.bd//volume-35.html",
#     "http://bdlaws.minlaw.gov.bd//volume-36.html",
#     "http://bdlaws.minlaw.gov.bd//volume-37.html",
#     "http://bdlaws.minlaw.gov.bd//volume-38.html",
#     "http://bdlaws.minlaw.gov.bd//volume-39.html",
#     "http://bdlaws.minlaw.gov.bd//volume-40.html",
#     "http://bdlaws.minlaw.gov.bd//volume-41.html",
#     "http://bdlaws.minlaw.gov.bd//volume-42.html",
#     "http://bdlaws.minlaw.gov.bd//volume-43.html",
#     "http://bdlaws.minlaw.gov.bd//volume-44.html",
#     "http://bdlaws.minlaw.gov.bd//volume-45.html",
#     "http://bdlaws.minlaw.gov.bd//volume-46.html",
#     "http://bdlaws.minlaw.gov.bd//volume-47.html",
#     "http://bdlaws.minlaw.gov.bd//volume-48.html",
#     "http://bdlaws.minlaw.gov.bd//volume-49.html",
#     "http://bdlaws.minlaw.gov.bd//volume-50.html",
#     "http://bdlaws.minlaw.gov.bd//volume-51.html",
#     "http://bdlaws.minlaw.gov.bd//volume-52.html",
#     "http://bdlaws.minlaw.gov.bd//volume-54.html",
#     "http://bdlaws.minlaw.gov.bd//volume-55.html",
#     "http://bdlaws.minlaw.gov.bd//volume-56.html",
#     "http://bdlaws.minlaw.gov.bd//act-details-1429.html",
#     "http://bdlaws.minlaw.gov.bd/1",
#     "http://bdlaws.minlaw.gov.bd/2",
#     "http://bdlaws.minlaw.gov.bd//act-672.html",
#     "http://bdlaws.minlaw.gov.bd//act-957.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/part-536.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51793.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51794.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51795.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/part-537.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51796.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51797.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51798.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51799.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51800.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51801.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51802.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51803.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51804.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/part-538.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51805.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51806.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51807.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51808.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51809.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/part-539.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/chapter-2333.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51810.html",
#     "http://bdlaws.minlaw.gov.bd//act-1429/section-51811.html"
# ]
# BANGLADESHI_LAW_URLS = [normalize_url(url) for url in BANGLADESHI_LAW_URLS]
# valid_urls, failed_urls = validate_urls(BANGLADESHI_LAW_URLS)
# logger.info(f"Valid URLs: {len(valid_urls)}, Failed URLs: {len(failed_urls)}")

In [None]:
# # Load documents with retries
# @retry(stop_max_attempt_number=5, wait_fixed=10000)
# def load_single_url(url):
#     logger.info(f"Attempting to load URL: {url}")
#     loader = UnstructuredURLLoader(
#         urls=[url],
#         timeout=60,
#         languages=["bn", "en"],
#         headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
#     )
#     docs = loader.load()
#     logger.info(f"Loaded {len(docs)} documents from {url}")
#     return docs

# def load_documents(urls):
#     documents = []
#     newly_failed_urls = []
#     with ThreadPoolExecutor(max_workers=5) as executor:
#         results = executor.map(load_single_url, urls)
#         for url, result in zip(urls, results):
#             if result:
#                 documents.extend(result)
#             else:
#                 logger.error(f"No documents loaded from {url}")
#                 newly_failed_urls.append(url)
#     return documents, newly_failed_urls

# print("Loading documents from URLs...")
# documents, newly_failed_urls = load_documents(valid_urls)
# failed_urls.extend(newly_failed_urls)
# logger.info(f"Total documents loaded: {len(documents)}")
# print(f"✅ Loaded {len(documents)} documents")

# # Save failed URLs
# if failed_urls:
#     with open("failed_urls.txt", "w", encoding="utf-8") as f:
#         f.write("\n".join(failed_urls))
#     logger.info(f"Saved {len(failed_urls)} failed URLs to failed_urls.txt")

# # Process documents, preserving all content
# def process_documents(documents):
#     processed_docs = []
#     for doc in documents:
#         cleaned_text = clean(
#             doc.page_content,
#             extra_whitespace=True,
#             dashes=True,
#             bullets=True,
#             trailing_punctuation=True
#         )
#         doc.page_content = cleaned_text
#         doc.metadata["is_short_text"] = len(cleaned_text) < 50
#         if doc.metadata["is_short_text"]:
#             logger.info(f"Short text detected (length {len(cleaned_text)}): {cleaned_text[:30]}...")
#         processed_docs.append(doc)
#     return processed_docs

# documents = process_documents(documents)
# print(f"✅ Processed {len(documents)} documents")

# # Split documents into chunks with strict size enforcement
# def force_split_chunk(chunk, max_size=1000):
#     if len(chunk.page_content) <= max_size:
#         return [chunk]
#     new_chunks = []
#     text = chunk.page_content
#     start = 0
#     while start < len(text):
#         end = min(start + max_size, len(text))
#         separators = ["\n\n", "\n", ".", "!", "?", ",", " "]
#         for sep in separators:
#             if end < len(text):
#                 last_sep = text.rfind(sep, start, end)
#                 if last_sep > start:
#                     end = last_sep + len(sep)
#                     break
#         new_chunk = chunk.copy()
#         new_chunk.page_content = text[start:end]
#         new_chunks.append(new_chunk)
#         start = end
#     return new_chunks

# print("Splitting documents into chunks...")
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
#     length_function=len,
#     separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
#     keep_separator=True
# )

# text_chunks = []
# for doc in documents:
#     chunks = text_splitter.split_documents([doc])
#     for chunk in chunks:
#         text_chunks.extend(force_split_chunk(chunk, max_size=1000))

# logger.info(f"Total chunks created: {len(text_chunks)}")
# print(f"✅ Split into {len(text_chunks)} chunks")

# # Verify chunk sizes
# oversized_chunks = [chunk for chunk in text_chunks if len(chunk.page_content) > 1000]
# if oversized_chunks:
#     logger.warning(f"Found {len(oversized_chunks)} chunks larger than 1000 characters")
#     for i, chunk in enumerate(oversized_chunks[:3]):
#         logger.warning(f"Oversized chunk {i+1}: {len(chunk.page_content)} characters")
#         logger.warning(f"Content preview: {chunk.page_content[:300]}...")

# # Display sample chunk
# if text_chunks:
#     print(f"\n📄 Sample chunk preview:")
#     print("=" * 50)
#     print(text_chunks[0].page_content[:300] + "...")
#     print("=" * 50)

# # Save all chunks for verification
# with open("document_chunks.txt", "w", encoding="utf-8") as f:
#     for i, chunk in enumerate(text_chunks):
#         f.write(f"Chunk {i+1} (Length: {len(chunk.page_content)}, Short: {chunk.metadata.get('is_short_text', False)}):\n")
#         f.write(chunk.page_content + "\n")
#         f.write("=" * 50 + "\n")
# logger.info("Saved all chunks to document_chunks.txt")

In [None]:
# ================================================================
# STEP 9: Create Vector Store and Retriever
# ================================================================
print("\n🔍 Creating vector store and retriever...")

# Create vector store using Pinecone
print("Embedding documents and storing in Pinecone...")
vectorstore = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

# Setup retriever with similarity search
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}  # Return top 5 most relevant chunks
)

print("✅ Vector store and retriever created successfully!")


🔍 Creating vector store and retriever...
Embedding documents and storing in Pinecone...


NameError: name 'text_chunks' is not defined

In [None]:
# print("\n🔍 Creating vector store and retriever...")
# index_name="bangladeshi-law-rag"
# # Check if index already has documents
# stats = index_name.describe_index_stats()
# total_vectors = stats.get('total_vector_count', 0)

# print(f"Current vectors in index: {total_vectors}")

# # Option 1: Simple approach - only upload if index is empty
# if total_vectors == 0:
#     print("Index is empty. Embedding documents and storing in Pinecone...")
#     vectorstore = PineconeVectorStore.from_documents(
#         documents=text_chunks,
#         index_name=index_name,
#         embedding=embeddings
#     )
#     print("✅ Documents uploaded to Pinecone!")
# else:
#     print("Index already contains documents. Skipping upload...")
#     # Just create vectorstore connection to existing index
#     vectorstore = PineconeVectorStore(
#         index_name=index_name,
#         embedding=embeddings
#     )

# # Setup retriever with similarity search
# retriever = vectorstore.as_retriever(
#     search_type="similarity",
#     search_kwargs={"k": 5}  # Return top 5 most relevant chunks
# )

# print("✅ Vector store and retriever created successfully!")

In [None]:
# ================================================================
# STEP 6: Setup Pinecone Vector Database (Efficient Version)
# ================================================================
print("\n🗄️ Setting up Pinecone Vector Database...")

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define index name
index_name = "bangladeshi-law-rag"

# Check if index exists, create if not
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    print(f"Creating new Pinecone index: {index_name}")
    pc.create_index(
        name=index_name,
        dimension=1024,  # multilingual-e5-large dimension
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print("✅ New Pinecone index created!")
    index_is_new = True
else:
    print(f"✅ Using existing Pinecone index: {index_name}")
    index_is_new = False

# Get index reference
index = pc.Index(index_name)

# ================================================================
# STEP 9: Create Vector Store and Retriever (Efficient Version)
# ================================================================
print("\n🔍 Creating vector store and retriever...")

# Check if index already has documents
stats = index.describe_index_stats()
total_vectors = stats.get('total_vector_count', 0)

print(f"Current vectors in index: {total_vectors}")

# Option 1: Simple approach - only upload if index is empty
if total_vectors == 0:
    print("Index is empty. Embedding documents and storing in Pinecone...")
    vectorstore = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings
    )
    print("✅ Documents uploaded to Pinecone!")
else:
    print("Index already contains documents. Skipping upload...")
    # Just create vectorstore connection to existing index
    vectorstore = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings
    )

# Setup retriever with similarity search
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}  # Return top 5 most relevant chunks
)

print("✅ Vector store and retriever created successfully!")

In [None]:
# ================================================================
# STEP 10: Define Legal-Specific Prompt Template
# ================================================================
print("\n📝 Setting up legal-specific prompt template...")

# Define system prompt specialized for Bangladeshi legal queries
system_prompt = """
You are a highly specialized legal assistant for Bangladeshi law with deep expertise in the Bangladesh legal system.
Your role is to provide accurate, precise, and helpful responses to legal queries based strictly on the provided context.

Guidelines for responses:
1. **Accuracy**: Only provide information that is explicitly stated in the provided context
2. **Legal Precision**: Use proper legal terminology and cite relevant sections when available
3. **Bangladeshi Law Focus**: Prioritize information specific to Bangladesh legal system
4. **Clarity**: Explain complex legal concepts in clear, understandable language
5. **Structure**: Organize responses with clear headings and bullet points when appropriate
6. **Citations**: Reference specific legal provisions, sections, or acts when mentioned in context

When answering legal questions:
- Start with a direct answer if possible
- Provide relevant legal provisions or sections
- Explain the practical implications
- Distinguish between different types of legal documents (Constitution, Acts, Rules, etc.)
- Mention applicable courts or jurisdictions when relevant

Language Support:
- You must have to respond in Bangla as appropriate
- Maintain legal terminology accuracy Bangla language

If you cannot find specific information in the provided context, respond with:
"I don't have sufficient information in the provided legal documents to answer this specific question. Please consult official legal sources or a qualified legal professional for accurate information."

Context: {context}
"""

# Create prompt template
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

print("✅ Legal prompt template configured!")


📝 Setting up legal-specific prompt template...
✅ Legal prompt template configured!


In [None]:
# ================================================================
# STEP 11: Create RAG Chain
# ================================================================
print("\n🔗 Creating RAG chain...")

# Create document chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Create retrieval chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

print("✅ RAG chain created successfully!")


🔗 Creating RAG chain...
✅ RAG chain created successfully!


In [None]:
# ================================================================
# STEP 12: Test the System with Sample Queries
# ================================================================
print("\n🧪 Testing the system with sample queries...")

# Define test queries for Bangladeshi law
test_queries = [
    "What are the fundamental rights guaranteed by the Bangladesh Constitution?",
    "What is the procedure for filing a civil suit in Bangladesh?",
    "What are the penalties for theft under Bangladesh Penal Code?",
    "How to register a company in Bangladesh?",
    "What are the requirements for bail in criminal cases in Bangladesh?"
]

# Function to query the system
def query_legal_system(question: str) -> Dict[str, Any]:
    """
    Query the RAG system with a legal question

    Args:
        question: Legal question to ask

    Returns:
        Dictionary containing answer and source documents
    """
    print(f"\n🔍 Processing query: {question}")
    response = rag_chain.invoke({"input": question})

    return {
        "answer": response.get("answer", ""),
        "source_documents": response.get("context", []),
        "question": question
    }

# Function to get relevant documents without generating answer
def get_relevant_documents(query: str, k: int = 5) -> List[Any]:
    """
    Retrieve relevant documents for a query

    Args:
        query: Search query
        k: Number of documents to retrieve

    Returns:
        List of relevant documents
    """
    docs = retriever.invoke(query)
    return docs[:k]

# Test with sample queries
print("\n" + "="*80)
print("🎯 TESTING BANGLADESHI LAW RAG SYSTEM")
print("="*80)

# Test query function
sample_query = "What are the fundamental rights in Bangladesh Constitution?"
print(f"\n📋 Test Query: {sample_query}")

# Test document retrieval
relevant_docs = get_relevant_documents(sample_query, k=3)
print(f"✅ Retrieved {len(relevant_docs)} relevant documents")

# Test full RAG query
response = query_legal_system(sample_query)
print(f"\n📝 Generated Answer:")
print("-" * 50)
print(response['answer'])
print("-" * 50)


🧪 Testing the system with sample queries...

🎯 TESTING BANGLADESHI LAW RAG SYSTEM

📋 Test Query: What are the fundamental rights in Bangladesh Constitution?
✅ Retrieved 3 relevant documents

🔍 Processing query: What are the fundamental rights in Bangladesh Constitution?

📝 Generated Answer:
--------------------------------------------------
## মৌলিক অধিকার

বাংলাদেশের সংবিধানের তৃতীয় অংশে মৌলিক অধিকারের কথা বলা হয়েছে। 

### মৌলিক অধিকারের তালিকা

*   জীবনের অধিকার (ধারা ৩২): প্রত্যেক নাগরিকের জীবনের অধিকার রয়েছে।
*   সমতার অধিকার (ধারা ৩৩): সকল নাগরিক আইনের কাছে সমান।
*   স্বাধীনতা ও ব্যক্তি স্বাধীনতার অধিকার (ধারা ৩৪): প্রত্যেক নাগরিকের স্বাধীনতা ও ব্যক্তি স্বাধীনতার অধিকার রয়েছে।
*   সম্পত্তির অধিকার (ধারা ৩৫): প্রত্যেক নাগরিকের সম্পত্তির অধিকার রয়েছে।
*   নারীর অধিকার (ধারা ৩৬): নারীদের সমান অধিকার রয়েছে।
*   শিশুদের অধিকার (ধারা ৩৭): শিশুদের অধিকার রক্ষা করা হবে।
*   শিক্ষার অধিকার (ধারা ৩৮): প্রত্যেক নাগরিকের শিক্ষার অধিকার রয়েছে।
*   সংবাদপত্র, রেডিও, টেলিভিশন ও অন্যান্য 

In [None]:
print("\n🧪 Testing the system with sample queries...")



# Function to query the system
def query_legal_system(question: str) -> Dict[str, Any]:
    """
    Query the RAG system with a legal question

    Args:
        question: Legal question to ask

    Returns:
        Dictionary containing answer and source documents
    """
    print(f"\n🔍 Processing query: {question}")
    response = rag_chain.invoke({"input": question})

    return {
        "answer": response.get("answer", ""),
        "source_documents": response.get("context", []),
        "question": question
    }

# Function to get relevant documents without generating answer
def get_relevant_documents(query: str, k: int = 5) -> List[Any]:
    """
    Retrieve relevant documents for a query

    Args:
        query: Search query
        k: Number of documents to retrieve

    Returns:
        List of relevant documents
    """
    docs = retriever.invoke(query)
    return docs[:k]

# Test with sample queries
print("\n" + "="*80)
print("🎯 TESTING BANGLADESHI LAW RAG SYSTEM")
print("="*80)

# Get user input for the query
sample_query = input("Please enter a legal query related to Bangladeshi law: ")
print(f"\n📋 Test Query: {sample_query}")

# Test document retrieval
relevant_docs = get_relevant_documents(sample_query, k=3)
print(f"✅ Retrieved {len(relevant_docs)} relevant documents")

# Test full RAG query
response = query_legal_system(sample_query)
print(f"\n📝 Generated Answer:")
print("-" * 50)
print(response['answer'])
print("-" * 50)


🧪 Testing the system with sample queries...

🎯 TESTING BANGLADESHI LAW RAG SYSTEM
Please enter a legal query related to Bangladeshi law: bangladesh er law niye jante chai 

📋 Test Query: bangladesh er law niye jante chai 
✅ Retrieved 3 relevant documents

🔍 Processing query: bangladesh er law niye jante chai 

📝 Generated Answer:
--------------------------------------------------
বাংলাদেশের আইন সম্পর্কে জানতে চাইলে আমি আপনাকে বাংলাদেশের আইনী কাঠামো সম্পর্কে একটি সংক্ষিপ্ত ধারণা দিতে পারি।

বাংলাদেশের আইনী কাঠামো মূলত নিম্নলিখিত আইন ও বিধি দ্বারা গঠিত:

* **সংবিধান**: বাংলাদেশের সংবিধান হলো দেশের সর্বোচ্চ আইন। এটি ১৯৭২ সালে গৃহীত হয় এবং এটি বাংলাদেশের রাজনৈতিক, সামাজিক ও অর্থনৈতিক কাঠামোর রূপরেখা প্রদান করে।
* **আইন**: বাংলাদেশের আইনগুলি সংসদ দ্বারা গৃহীত হয়। এগুলি বিভিন্ন বিষয় যেমন ফৌজদারি আইন, দেওয়ানী আইন, বাণিজ্যিক আইন ইত্যাদি নিয়ন্ত্রণ করে।
* **বিধি**: বিধিগুলি আইনের বিধানগুলি কার্যকর করার জন্য তৈরি করা হয়। এগুলি মন্ত্রণালয়, বিভাগ বা কর্তৃপক্ষ দ্বারা তৈরি করা হয়।
* **প্রেসি

In [None]:
# ================================================================
# STEP 13: Interactive Query Function
# ================================================================
print("\n🎮 Setting up interactive query function...")

def interactive_legal_query():
    """
    Interactive function to query the legal system
    """
    print("\n" + "="*60)
    print("🏛️ BANGLADESHI LAW RAG SYSTEM - INTERACTIVE MODE")
    print("="*60)
    print("Ask any question about Bangladeshi law. Type 'exit' to quit.")

    while True:
        user_question = input("\n💬 Your legal question: ").strip()

        if user_question.lower() in ['exit', 'quit', 'bye']:
            print("👋 Thank you for using the Bangladeshi Law RAG System!")
            break

        if not user_question:
            print("❌ Please enter a valid question.")
            continue

        try:
            # Get response
            response = query_legal_system(user_question)

            print(f"\n📋 Question: {user_question}")
            print(f"📝 Answer:")
            print("-" * 60)
            print(response['answer'])
            print("-" * 60)
            print(f"📚 Based on {len(response['source_documents'])} legal documents")

        except Exception as e:
            print(f"❌ Error processing query: {e}")

In [None]:
# ================================================================
# STEP 14: Additional Helper Functions
# ================================================================
print("\n🛠️ Setting up additional helper functions...")

def search_legal_documents(query: str, num_results: int = 3):
    """
    Search for legal documents related to a query

    Args:
        query: Search query
        num_results: Number of results to return
    """
    print(f"\n🔍 Searching for: '{query}'")
    docs = get_relevant_documents(query, k=num_results)

    for i, doc in enumerate(docs, 1):
        print(f"\n📄 Result {i}:")
        print("-" * 40)
        print(doc.page_content[:300] + "...")
        print("-" * 40)
        if hasattr(doc, 'metadata') and doc.metadata:
            print(f"📍 Source: {doc.metadata}")

def get_system_stats():
    """
    Display system statistics
    """
    print("\n📊 SYSTEM STATISTICS")
    print("=" * 50)
    print(f"📚 Total documents loaded: {len(documents)}")
    print(f"📄 Total text chunks: {len(text_chunks)}")
    print(f"🗄️ Vector database: Pinecone ({index_name})")
    print(f"🧠 Embedding model: multilingual-e5-large")
    print(f"🤖 LLM: GROQ Llama-3.1-70b-versatile")
    print(f"🔍 Retrieval: Top-5 similarity search")
    print("=" * 50)

In [None]:
# ================================================================
# STEP 15: Final System Summary
# ================================================================
print("\n🎉 SYSTEM SETUP COMPLETE!")
print("=" * 80)
print("✅ Bangladeshi Law RAG System is ready to use!")
print("=" * 80)

# Display system statistics
get_system_stats()

print("\n🚀 USAGE INSTRUCTIONS:")
print("1. Use query_legal_system('your question') for full RAG queries")
print("2. Use get_relevant_documents('query') for document search only")
print("3. Use search_legal_documents('query') for formatted document search")
print("4. Use interactive_legal_query() for interactive mode")
print("5. Use get_system_stats() to view system information")

print("\n💡 Example usage:")
print("response = query_legal_system('What are the fundamental rights in Bangladesh?')")
print("print(response['answer'])")

print("\n🎯 Ready to answer your Bangladeshi law questions!")