In [None]:
#### STEP 1 :- Pdf related dependencies and imports ####
!pip install pymupdf pypdf pdfplumber --quiet
from google.colab import drive
import os
import glob
import fitz  # PyMuPDF
import pdfplumber
from pypdf import PdfReader
import traceback
from typing import Dict, Tuple

# Mount Google Drive
drive.mount('/content/drive')

def extract_text_pymupdf(pdf_path: str) -> Tuple[str, str]:
    """Extract text using PyMuPDF (fitz) - fastest but can fail on corrupted files"""
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text, "pymupdf"
    except Exception as e:
        raise Exception(f"PyMuPDF failed: {str(e)}")

def extract_text_pdfplumber(pdf_path: str) -> Tuple[str, str]:
    """Extract text using pdfplumber - good for complex layouts"""
    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return text, "pdfplumber"
    except Exception as e:
        raise Exception(f"pdfplumber failed: {str(e)}")

def extract_text_pypdf(pdf_path: str) -> Tuple[str, str]:
    """Extract text using pypdf - most robust for corrupted files"""
    try:
        text = ""
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
        return text, "pypdf"
    except Exception as e:
        raise Exception(f"pypdf failed: {str(e)}")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m111.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [None]:
# Install remaining required packages
!pip install -q sentence-transformers faiss-cpu huggingface_hub llama-cpp-python gradio

print("✅ All packages installed")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
✅ All packages installed


In [None]:
#### STEP 2: Extract All PDFs ####

# Set your PDF folder path
pdf_folder = "/content/drive/MyDrive/lawing/lawing"

# Get all PDF files
pdf_files = glob.glob(os.path.join(pdf_folder, "*.pdf"))
print(f"Found {len(pdf_files)} PDF files\n")

# Extract text from all PDFs with fallback methods
pdf_texts = {}
extraction_methods = [extract_text_pymupdf, extract_text_pdfplumber, extract_text_pypdf]

for pdf_path in pdf_files:
    filename = os.path.basename(pdf_path)
    print(f"Processing: {filename}")

    extracted = False
    for method in extraction_methods:
        try:
            text, method_name = method(pdf_path)
            if text and len(text.strip()) > 100:  # Valid extraction
                pdf_texts[filename] = text
                print(f"  ✅ Extracted using {method_name} ({len(text)} chars)")
                extracted = True
                break
        except Exception as e:
            continue

    if not extracted:
        print(f"  ❌ Failed to extract: {filename}")

print(f"\n{'='*60}")
print(f"✅ Successfully extracted {len(pdf_texts)} PDFs")
print(f"{'='*60}\n")

# Show extracted files
for filename in pdf_texts.keys():
    print(f"  📄 {filename}")

Found 44 PDF files

Processing: consumer.pdf
  ✅ Extracted using pymupdf (134487 chars)
Processing: the_information_technology_act,_2008.pdf
  ✅ Extracted using pymupdf (123947 chars)
Processing: tica.pdf
  ✅ Extracted using pymupdf (177280 chars)
Processing: ss.pdf
  ✅ Extracted using pymupdf (385547 chars)
Processing: LEGISLATIVE_DRAFTING.pdf
  ✅ Extracted using pymupdf (28290 chars)
Processing: tpa.pdf
  ✅ Extracted using pymupdf (161724 chars)
Processing: musdiv.pdf
  ✅ Extracted using pymupdf (10527 chars)
Processing: narco2.pdf
  ✅ Extracted using pymupdf (20972 chars)
Processing: bnns.pdf
  ✅ Extracted using pymupdf (815755 chars)
Processing: it_amendment_act2008.pdf
  ✅ Extracted using pymupdf (61603 chars)
Processing: bns1.pdf
  ✅ Extracted using pymupdf (396674 chars)
Processing: pa.pdf
  ✅ Extracted using pymupdf (26919 chars)
Processing: dt.pdf
  ✅ Extracted using pymupdf (5281 chars)
Processing: protection.pdf
  ✅ Extracted using pymupdf (55405 chars)
Processing: criminal_

In [None]:
#### STEP 3: Structure-Aware Chunking ####

import re
import numpy as np
from typing import List, Dict, Tuple

class LegalDocumentChunker:
    """Specialized chunker for Indian legal documents"""

    def __init__(self, chunk_size: int = 300, overlap: int = 50):
        self.chunk_size = chunk_size
        self.overlap = overlap

        # Patterns for legal structure
        self.section_pattern = re.compile(r'(?:Section|SECTION|Sec\.|SEC\.)\s*(\d+[A-Z]?)', re.IGNORECASE)
        self.article_pattern = re.compile(r'(?:Article|ARTICLE|Art\.|ART\.)\s*(\d+[A-Z]?)', re.IGNORECASE)
        self.chapter_pattern = re.compile(r'(?:Chapter|CHAPTER|CHAP\.)\s*([IVXLCDM]+|\d+)', re.IGNORECASE)

    def extract_act_name(self, text: str, filename: str) -> str:
        """Extract the name of the Act from document"""
        first_part = text[:500].upper()

        act_patterns = [
            r'(.*?ACT,?\s*\d{4})',
            r'THE\s+(.*?)\s+ACT',
            r'(.*?)\s+CODE'
        ]

        for pattern in act_patterns:
            match = re.search(pattern, first_part)
            if match:
                return match.group(1).strip()

        return filename.replace('.pdf', '').replace('_', ' ').title()

    def detect_section_boundaries(self, text: str) -> List[Tuple[int, str, str]]:
        """Detect section/article boundaries"""
        boundaries = []

        for match in self.section_pattern.finditer(text):
            boundaries.append((match.start(), 'Section', match.group(1)))

        for match in self.article_pattern.finditer(text):
            boundaries.append((match.start(), 'Article', match.group(1)))

        boundaries.sort(key=lambda x: x[0])
        return boundaries

    def clean_text(self, text: str) -> str:
        """Clean legal text while preserving structure"""
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        text = re.sub(r'\n\s*\d+\s*\n', '\n', text)
        text = re.sub(r'https?://\S+', '', text)
        return text.strip()

    def create_semantic_chunks(self, text: str, act_name: str, filename: str) -> List[Dict]:
        """Create chunks that respect legal document structure"""
        cleaned_text = self.clean_text(text)
        boundaries = self.detect_section_boundaries(cleaned_text)

        chunks = []

        if not boundaries:
            return self._fallback_chunking(cleaned_text, act_name, filename)

        # Chunk by section/article
        for i, (pos, marker_type, marker_num) in enumerate(boundaries):
            if i < len(boundaries) - 1:
                end_pos = boundaries[i + 1][0]
            else:
                end_pos = len(cleaned_text)

            section_text = cleaned_text[pos:end_pos].strip()

            # If section is too long, split it
            if len(section_text.split()) > self.chunk_size:
                sub_chunks = self._split_long_section(section_text, self.chunk_size, self.overlap)

                for j, sub_chunk in enumerate(sub_chunks):
                    chunks.append({
                        'text': sub_chunk,
                        'metadata': {
                            'source_file': filename,
                            'act_name': act_name,
                            'section_type': marker_type,
                            'section_number': marker_num,
                            'sub_chunk_id': j,
                            'chunk_type': 'section_split'
                        }
                    })
            else:
                chunks.append({
                    'text': section_text,
                    'metadata': {
                        'source_file': filename,
                        'act_name': act_name,
                        'section_type': marker_type,
                        'section_number': marker_num,
                        'chunk_type': 'full_section'
                    }
                })

        return chunks

    def _split_long_section(self, text: str, chunk_size: int, overlap: int) -> List[str]:
        """Split long section into smaller chunks"""
        words = text.split()
        chunks = []

        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i + chunk_size])
            if chunk.strip():
                chunks.append(chunk.strip())

            if i + chunk_size >= len(words):
                break

        return chunks

    def _fallback_chunking(self, text: str, act_name: str, filename: str) -> List[Dict]:
        """Fallback when no structure detected"""
        words = text.split()
        chunks = []

        for i in range(0, len(words), self.chunk_size - self.overlap):
            chunk_text = ' '.join(words[i:i + self.chunk_size])
            if chunk_text.strip():
                chunks.append({
                    'text': chunk_text.strip(),
                    'metadata': {
                        'source_file': filename,
                        'act_name': act_name,
                        'chunk_id': i // (self.chunk_size - self.overlap),
                        'chunk_type': 'sliding_window'
                    }
                })

            if i + self.chunk_size >= len(words):
                break

        return chunks

    def add_context_to_chunks(self, chunks: List[Dict]) -> List[Dict]:
        """Add metadata prefix to chunks for better retrieval"""
        for chunk in chunks:
            metadata = chunk['metadata']
            context_prefix = f"[{metadata['act_name']}] "

            if 'section_number' in metadata:
                context_prefix += f"{metadata['section_type']} {metadata['section_number']}: "

            chunk['enhanced_text'] = context_prefix + chunk['text']

        return chunks


# Process all PDFs with improved chunking
print("🔪 Chunking all PDFs with structure-aware method...")
print("="*60)

chunker = LegalDocumentChunker(chunk_size=300, overlap=50)

all_chunks = []
chunk_metadata = []

for filename, text in pdf_texts.items():
    print(f"\n📄 Processing: {filename}")

    act_name = chunker.extract_act_name(text, filename)
    print(f"  Act detected: {act_name}")

    doc_chunks = chunker.create_semantic_chunks(text, act_name, filename)
    doc_chunks = chunker.add_context_to_chunks(doc_chunks)

    for chunk in doc_chunks:
        all_chunks.append(chunk['enhanced_text'])
        chunk_metadata.append(chunk['metadata'])

    print(f"  Created {len(doc_chunks)} chunks")

print(f"\n{'='*60}")
print(f"✅ Total chunks created: {len(all_chunks)}")
print(f"📊 Average chunk length: {np.mean([len(c.split()) for c in all_chunks]):.1f} words")
print(f"{'='*60}")

🔪 Chunking all PDFs with structure-aware method...

📄 Processing: consumer.pdf
  Act detected: THE CONSUMER PROTECTION ACT, 2019
  Created 159 chunks

📄 Processing: the_information_technology_act,_2008.pdf
  Act detected: THE INFORMATION TECHNOLOGY ACT, 2008
  Created 135 chunks

📄 Processing: tica.pdf
  Act detected: THE INDIAN CONTRACT ACT, 1872
  Created 139 chunks

📄 Processing: ss.pdf
  Act detected: THE CODE ON SOCIAL SECURITY ACT, 2020
  Created 406 chunks

📄 Processing: LEGISLATIVE_DRAFTING.pdf
  Act detected: Legislative Drafting
  Created 18 chunks

📄 Processing: tpa.pdf
  Act detected: THE TRANSFER OF PROPERTY ACT, 1882
  Created 121 chunks

📄 Processing: musdiv.pdf
  Act detected: THE MUSLIM WOMEN (PROTECTION OF RIGHTS ON DIVORCE) ACT, 1986
  Created 6 chunks

📄 Processing: narco2.pdf
  Act detected: SUBSTANCES ACT, 1988
  Created 24 chunks

📄 Processing: bnns.pdf
  Act detected: Bnns
  Created 1113 chunks

📄 Processing: it_amendment_act2008.pdf
  Act detected: It Amendment

In [None]:
#### STEP 4: Create Embeddings ####

from sentence_transformers import SentenceTransformer
import faiss

print("🤖 Loading BAAI/bge-large-en-v1.5 embedding model...")
embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
print(f"✅ Model loaded (dimension: {embedding_model.get_sentence_embedding_dimension()})")

print(f"\n📊 Creating embeddings for {len(all_chunks)} chunks...")

# Create embeddings in batches
batch_size = 32
embeddings = []

for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i + batch_size]
    batch_embeddings = embedding_model.encode(batch, show_progress_bar=True)
    embeddings.extend(batch_embeddings)

    if (i // batch_size + 1) % 10 == 0:
        print(f"  Processed {i + len(batch)}/{len(all_chunks)} chunks")

embeddings = np.array(embeddings)
print(f"\n✅ Embeddings created: {embeddings.shape}")

🤖 Loading BAAI/bge-large-en-v1.5 embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

✅ Model loaded (dimension: 1024)

📊 Creating embeddings for 13433 chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 320/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 640/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 960/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 1280/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 1600/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 1920/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 2240/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 2560/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 2880/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 3200/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 3520/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 3840/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 4160/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 4480/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 4800/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 5120/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 5440/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 5760/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 6080/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 6400/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 6720/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 7040/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 7360/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 7680/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 8000/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 8320/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 8640/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 8960/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 9280/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 9600/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 9920/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 10240/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 10560/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 10880/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 11200/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 11520/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 11840/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 12160/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 12480/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 12800/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 13120/13433 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Processed 13433/13433 chunks

✅ Embeddings created: (13433, 1024)


In [None]:
#### STEP 5: Build FAISS Index ####
import pickle
print("🔨 Building FAISS index...")
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(embeddings)
index.add(embeddings)

print(f"✅ FAISS index built with {index.ntotal} vectors")

# Save everything
print("\n💾 Saving data...")
with open('/content/chunks_data.pkl', 'wb') as f:
    pickle.dump({
        'all_chunks': all_chunks,
        'chunk_metadata': chunk_metadata,
        'embeddings': embeddings
    }, f)

faiss.write_index(index, '/content/legal_faiss_index.index')
print("✅ Data saved successfully!")

🔨 Building FAISS index...
✅ FAISS index built with 13433 vectors

💾 Saving data...
✅ Data saved successfully!


In [None]:
#### STEP 6: Download & Load LLM (Optimized) ####

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import torch
import shutil

# Create directory in Drive
drive_model_dir = "/content/drive/MyDrive/downloaded_models"
os.makedirs(drive_model_dir, exist_ok=True)

# Model settings
repo_id = "unsloth/gemma-3n-E4B-it-GGUF"
filename = "gemma-3n-E4B-it-Q4_K_M.gguf"
drive_model_path = os.path.join(drive_model_dir, filename)

# Check if already downloaded
if os.path.exists(drive_model_path):
    print(f"✅ Model found in Drive: {drive_model_path}")
    model_path = drive_model_path
else:
    print(f"📥 Downloading {filename}... (this will take several minutes)")
    try:
        temp_model_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            cache_dir="/content/temp_models"
        )

        print(f"✅ Download complete!")
        print(f"📁 Copying to Drive...")
        shutil.copy2(temp_model_path, drive_model_path)
        print(f"✅ Saved to Drive: {drive_model_path}")
        model_path = drive_model_path

    except Exception as e:
        print(f"❌ Error: {e}")
        model_path = None

# Load LLM with OPTIMIZATIONS for RAG
if model_path and os.path.exists(model_path):
    print(f"\n🚀 Loading Gemma with RAG-optimized settings...")

    llm = Llama(
        model_path=model_path,
        n_ctx=2048,              # Reduced from 4096 - FASTER!
        n_gpu_layers=-1 if torch.cuda.is_available() else 0,
        n_threads=4,
        n_batch=256,             # Faster first token
        use_mlock=True,
        use_mmap=True,
        f16_kv=True,
        verbose=False,
        seed=42
    )

    print("✅ Gemma loaded successfully!")

    if torch.cuda.is_available():
        print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
        print(f"📊 VRAM: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    else:
        print("⚠️ Running on CPU (slower)")

    # Quick test
    print("\n🧪 Testing model...")
    test_response = llm("What is law?", max_tokens=10, temperature=0.1)
    print(f"✅ Test passed: {test_response['choices'][0]['text'][:50]}...")

else:
    print("❌ Model not available")

✅ Model found in Drive: /content/drive/MyDrive/downloaded_models/gemma-3n-E4B-it-Q4_K_M.gguf

🚀 Loading Gemma with RAG-optimized settings...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
llama_kv_cache_unified_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)


✅ Gemma loaded successfully!
🎮 GPU: Tesla T4
📊 VRAM: 1.35 GB

🧪 Testing model...
✅ Test passed: 

Law is a complex concept with many definitions....


In [None]:
#### STEP 7: Load Data & Create Retriever ####

print("📦 Loading saved data...")

# Load chunks and embeddings
with open('/content/chunks_data.pkl', 'rb') as f:
    data = pickle.load(f)
    all_chunks = data['all_chunks']
    chunk_metadata = data['chunk_metadata']
    embeddings = data['embeddings']

# Load FAISS index
index = faiss.read_index('/content/legal_faiss_index.index')

print(f"✅ Loaded {len(all_chunks)} chunks")
print(f"✅ Loaded FAISS index with {index.ntotal} vectors")

# Reload embedding model
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
print("✅ Embedding model reloaded")

📦 Loading saved data...
✅ Loaded 13433 chunks
✅ Loaded FAISS index with 13433 vectors
✅ Embedding model reloaded


In [None]:
#### STEP 8: Create Optimized Retriever ####

class OptimizedLegalRetriever:
    """Fast retriever with query enhancement and reranking"""

    def __init__(self, embedding_model, faiss_index, all_chunks, chunk_metadata):
        self.embedding_model = embedding_model
        self.index = faiss_index
        self.all_chunks = all_chunks
        self.chunk_metadata = chunk_metadata
        # Important: BGE models work better with instruction prefix
        self.query_prefix = "Represent this query for retrieving relevant legal passages: "

    def preprocess_query(self, query: str) -> str:
        """Expand common legal abbreviations for better matching"""
        expansions = {
            r'\bBNS\b': 'Bharatiya Nyaya Sanhita',
            r'\bBNNS\b': 'Bharatiya Nagarik Suraksha Sanhita',
            r'\bCrPC\b': 'Criminal Procedure Code',
            r'\bPOCSO\b': 'Protection of Children from Sexual Offences',
            r'\bIT Act\b': 'Information Technology Act',
            r'\bSec\.?\b': 'Section',
            r'\bArt\.?\b': 'Article',
        }

        for abbr, full in expansions.items():
            query = re.sub(abbr, full, query, flags=re.IGNORECASE)

        return query.strip()

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Retrieve most relevant chunks with hybrid scoring"""
        # Step 1: Preprocess query
        processed_query = self.preprocess_query(query)

        # Step 2: Add instruction prefix for BGE model
        query_with_instruction = self.query_prefix + processed_query

        # Step 3: Embed query
        query_embedding = self.embedding_model.encode([query_with_instruction])
        faiss.normalize_L2(query_embedding)

        # Step 4: Retrieve more candidates for reranking
        scores, indices = self.index.search(query_embedding, top_k * 2)

        # Step 5: Build candidates list
        candidates = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.all_chunks):
                candidates.append({
                    'chunk': self.all_chunks[idx],
                    'metadata': self.chunk_metadata[idx],
                    'score': float(score),
                    'index': int(idx)
                })

        # Step 6: Keyword boosting (simple reranking)
        query_keywords = set(processed_query.lower().split())
        for candidate in candidates:
            chunk_keywords = set(candidate['chunk'].lower().split())
            overlap = len(query_keywords & chunk_keywords) / max(len(query_keywords), 1)
            candidate['score'] += overlap * 0.15  # Boost score by keyword match

        # Step 7: Sort by final score and return top_k
        candidates.sort(key=lambda x: x['score'], reverse=True)
        return candidates[:top_k]

    def format_context(self, chunks_data: List[Dict]) -> str:
        """Format retrieved chunks for LLM prompt"""
        formatted_parts = []

        for i, data in enumerate(chunks_data, 1):
            meta = data['metadata']
            chunk = data['chunk']

            # Get source file name
            source = meta.get('source_file', 'Unknown').replace('.pdf', '')

            # Add section info if available
            section_info = ""
            if 'section_number' in meta:
                section_info = f" - {meta.get('section_type', 'Section')} {meta['section_number']}"

            formatted_parts.append(f"[Reference {i}] {source}{section_info}\n{chunk}")

        return "\n\n---\n\n".join(formatted_parts)


# Initialize retriever
print("🎯 Initializing retriever...")
retriever = OptimizedLegalRetriever(
    embedding_model=embedding_model,
    faiss_index=index,
    all_chunks=all_chunks,
    chunk_metadata=chunk_metadata
)
print("✅ Retriever ready!")

# Quick test
print("\n🧪 Testing retrieval...")
test_query = "What is the punishment for mass murder?"
test_results = retriever.retrieve(test_query, top_k=3)

print(f"\nQuery: '{test_query}'")
print(f"Retrieved {len(test_results)} chunks:\n")
for i, result in enumerate(test_results, 1):
    print(f"{i}. Source: {result['metadata']['source_file']}")
    print(f"   Score: {result['score']:.3f}")
    print(f"   Preview: {result['chunk'][:100]}...\n")

🎯 Initializing retriever...
✅ Retriever ready!

🧪 Testing retrieval...

Query: 'What is the punishment for mass murder?'
Retrieved 3 chunks:

1. Source: bnns.pdf
   Score: 0.722
   Preview: [Bnns] Section 407: of which is hereunto annexed) the punishment adjudged by the said sentence has b...

2. Source: bnns.pdf
   Score: 0.720
   Preview: [Bnns] Section 407: section 407) To the Officer in charge of the Jail at...............................

3. Source: ndpsact.pdf
   Score: 0.713
   Preview: [Ndpsact] Section 2B: Sec.2B)
10.
Punishment for abetment and crimInal
As provided for that particul...



In [None]:
#### Diagnostic Test ####

print("🔍 Diagnosing generation speed issue...\n")

# Test 1: Minimal generation
print("TEST 1: Minimal prompt (should be fast)")
minimal_test = llm(
    "What is law?",
    max_tokens=20,
    temperature=0.1,
    stop=["</s>"]
)
print(f"✅ Generated 20 tokens")
print(f"Tokens generated: {minimal_test['usage']['completion_tokens']}")
print(f"Text: {minimal_test['choices'][0]['text']}\n")

# Test 2: Check actual token generation
print("TEST 2: Check if it's generating more than max_tokens")
import time
start = time.time()
test_response = llm(
    "<start_of_turn>user\nWhat is theft?<end_of_turn>\n<start_of_turn>model\n",
    max_tokens=50,
    temperature=0.1,
    stop=["</s>", "<end_of_turn>"]
)
elapsed = time.time() - start

print(f"Requested: 50 tokens")
print(f"Actually generated: {test_response['usage']['completion_tokens']} tokens")
print(f"Time: {elapsed:.2f}s")
print(f"Speed: {test_response['usage']['completion_tokens'] / elapsed:.1f} tokens/sec")
print(f"Text: {test_response['choices'][0]['text'][:100]}...")

🔍 Diagnosing generation speed issue...

TEST 1: Minimal prompt (should be fast)
✅ Generated 20 tokens
Tokens generated: 20
Text: 

Law is a complex concept with many definitions. However, at its core, **law is a

TEST 2: Check if it's generating more than max_tokens
Requested: 50 tokens
Actually generated: 50 tokens
Time: 21.50s
Speed: 2.3 tokens/sec
Text: Theft is a broad term referring to the act of taking someone else's property without their permissio...


In [None]:
#### STEP 11: Reload LLM with Better GPU Utilization ####

print("🔄 Reloading LLM with optimized GPU settings...")

# Clear current model from memory
del llm
import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("✅ Memory cleared")

# Reload with better settings
model_path = "/content/drive/MyDrive/downloaded_models/gemma-3n-E4B-it-Q4_K_M.gguf"

llm = Llama(
    model_path=model_path,
    n_ctx=1536,              # Even smaller context - FASTER
    n_gpu_layers=35,         # Explicitly set layer count (instead of -1)
    n_threads=2,             # Fewer threads = better GPU utilization
    n_batch=128,             # Smaller batch = faster
    use_mlock=False,         # Don't lock in RAM - let GPU handle it
    use_mmap=True,
    f16_kv=True,
    verbose=True,            # Show what's happening
    seed=42
)

print("\n✅ LLM reloaded!")

# Test speed again
print("\n🧪 Testing new speed...")
import time
start = time.time()
test = llm(
    "<start_of_turn>user\nWhat is law?<end_of_turn>\n<start_of_turn>model\n",
    max_tokens=50,
    temperature=0.1,
    stop=["</s>", "<end_of_turn>"]
)
elapsed = time.time() - start

tokens_generated = test['usage']['completion_tokens']
speed = tokens_generated / elapsed

print(f"\n{'='*60}")
print(f"Tokens generated: {tokens_generated}")
print(f"Time: {elapsed:.2f}s")
print(f"Speed: {speed:.1f} tokens/sec")
print(f"{'='*60}")

if speed > 10:
    print("✅ MUCH BETTER! Speed improved significantly!")
elif speed > 5:
    print("⚠️ Improved, but still not optimal for T4")
else:
    print("❌ Still slow - T4 GPU might be throttled in Colab")
    print("💡 Recommendation: Use smaller model or upgrade to A100 runtime")

🔄 Reloading LLM with optimized GPU settings...
✅ Memory cleared


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
load: control token: 258498 '<unused2596>' is not marked as EOG
load: control token: 258497 '<unused2595>' is not marked as EOG
load: control token: 258496 '<unused2594>' is not marked as EOG
load: control token: 258495 '<unused2593>' is not marked as EOG
load: control token: 258493 '<unused2591>' is not marked as EOG
load: control token: 258492 '<unused2590>' is not marked as EOG
load: control token: 258491 '<unused2589>' is not marked as EOG
load: control token: 258490 '<unused2588>' is not marked as EOG
load: control token: 258489 '<unused2587>' is not marked as EOG
load: control token: 258486 '<unused2584>' is not marked as EOG
load: control token: 258485 '<unused2583>' is not marked as EOG
load: control token: 258483 '<unused2581>' is not marked as EOG
load: control token: 258478 '<unused2576>' is not marked as EOG
load: control token: 258477 '<unused2575>' is not marked as EOG
load: control token: 258474 '<unused257


✅ LLM reloaded!

🧪 Testing new speed...


llama_perf_context_print:        load time =    4214.53 ms
llama_perf_context_print: prompt eval time =    4214.35 ms /    13 tokens (  324.18 ms per token,     3.08 tokens per second)
llama_perf_context_print:        eval time =   34431.26 ms /    49 runs   (  702.68 ms per token,     1.42 tokens per second)
llama_perf_context_print:       total time =   38823.53 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47



Tokens generated: 50
Time: 38.83s
Speed: 1.3 tokens/sec
❌ Still slow - T4 GPU might be throttled in Colab
💡 Recommendation: Use smaller model or upgrade to A100 runtime


In [None]:
#### STEP 12: Production Chatbot - Pure RAG, Accurate & Sufficient #### # class AccurateLegalChatbot: #     """Pure RAG chatbot - no shortcuts, accurate answers only""" #     def __init__(self, llm, retriever): #         self.llm = llm #         self.retriever = retriever #         # Balanced params: accuracy + reasonable speed #         self.gen_params = { #             'max_tokens': 300,       # Enough for sufficient answers #             'temperature': 0.1,      # Low = more factual #             'top_p': 0.9, #             'repeat_penalty': 1.15, #             'stop': ["</s>", "<end_of_turn>", "\n\nQuestion:", "\n\nUser:"] #         } #     def answer(self, query: str, verbose: bool = True) -> Dict: #         """Generate accurate, sufficient answer from legal documents""" #         start_time = time.time() #         if verbose: #             print(f"\n{'='*70}") #             print(f"Query: {query}") #             print(f"{'='*70}") #             print("🔍 Searching legal documents...") #         # Retrieve relevant chunks #         retrieval_start = time.time() #         chunks_data = self.retriever.retrieve(query, top_k=3) #         retrieval_time = time.time() - retrieval_start #         if verbose: #             print(f"✅ Retrieved {len(chunks_data)} relevant chunks ({retrieval_time:.2f}s)") #             for i, chunk in enumerate(chunks_data, 1): #                 print(f"   {i}. {chunk['metadata']['source_file']} (score: {chunk['score']:.3f})") #         # Check retrieval quality #         if not chunks_data: #             return { #                 'answer': "I couldn't find any relevant information in the legal documents for your query. Please try rephrasing or providing more specific details about the act or section you're asking about.", #                 'sources': [], #                 'retrieval_time': retrieval_time, #                 'generation_time': 0, #                 'total_time': time.time() - start_time, #                 'confidence': 'none' #             } #         avg_score = sum(c['score'] for c in chunks_data) / len(chunks_data) #         if avg_score < 0.4: #             if verbose: #                 print(f"⚠️ Low relevance score ({avg_score:.2f}) - answer may not be accurate") #         # Format context with trimming for speed #         context_parts = [] #         for i, data in enumerate(chunks_data, 1): #             meta = data['metadata'] #             chunk = data['chunk'] #             # Trim very long chunks but keep sufficient content #             words = chunk.split() #             if len(words) > 250: #                 chunk = ' '.join(words[:250]) + "..." #             source = meta.get('source_file', 'Unknown').replace('.pdf', '') #             section_info = "" #             if 'section_number' in meta: #                 section_info = f" - {meta.get('section_type', 'Section')} {meta['section_number']}" #             context_parts.append(f"[Reference {i}: {source}{section_info}]\n{chunk}") #         context = "\n\n---\n\n".join(context_parts) #         # Create prompt - clear instructions for accuracy #         prompt = f"""<start_of_turn>system # You are an expert on Indian law. Provide accurate, complete answers based ONLY on the legal references provided. # CRITICAL RULES: # 1. Answer must be based exclusively on the provided references # 2. Cite specific sections, acts, and provisions # 3. If the references don't fully answer the question, state what's missing # 4. Be thorough but concise - include all relevant details # 5. Never invent or assume information not in the references<end_of_turn> # <start_of_turn>user # Legal References: # {context} # Question: {query} # Provide a complete and accurate answer based on the references above. Cite specific sections and acts.<end_of_turn> # <start_of_turn>model # """ #         if verbose: #             print(f"📝 Context size: {len(context.split())} words") #             print(f"🤖 Generating answer... (this will take ~30-60 seconds on T4 GPU)") #         # Generate answer #         generation_start = time.time() #         try: #             response = self.llm(prompt, **self.gen_params) #             answer = response['choices'][0]['text'].strip() #             generation_time = time.time() - generation_start #             # Clean up answer #             answer = re.sub(r'<start_of_turn>.*?<end_of_turn>', '', answer, flags=re.DOTALL) #             answer = re.sub(r'<[^>]+>', '', answer) #             answer = answer.strip() #             if verbose: #                 print(f"✅ Answer generated ({generation_time:.1f}s)") #         except Exception as e: #             return { #                 'answer': f"Error generating answer: {str(e)}", #                 'sources': [], #                 'retrieval_time': retrieval_time, #                 'generation_time': 0, #                 'total_time': time.time() - start_time, #                 'confidence': 'error' #             } #         # Extract unique sources #         sources = [] #         seen = set() #         for data in chunks_data: #             source = data['metadata'].get('source_file', 'Unknown').replace('.pdf', '') #             if source not in seen: #                 sources.append(source) #                 seen.add(source) #         total_time = time.time() - start_time #         # Confidence based on retrieval scores #         if avg_score > 0.6: #             confidence = 'high' #         elif avg_score > 0.4: #             confidence = 'medium' #         else: #             confidence = 'low' #         return { #             'answer': answer, #             'sources': sources, #             'retrieval_time': retrieval_time, #             'generation_time': generation_time, #             'total_time': total_time, #             'confidence': confidence, #             'avg_relevance_score': avg_score #         } #     def chat_loop(self): #         """Interactive chat session""" #         print("\n" + "="*70) #         print("🏛️  INDIAN LEGAL AI ASSISTANT") #         print("="*70) #         print("Ask questions about Indian legal documents.") #         print("Type 'exit' or 'quit' to end the session.\n") #         while True: #             try: #                 user_input = input("\n📝 Your Question: ").strip() #                 if not user_input: #                     continue #                 if user_input.lower() in ['exit', 'quit', 'q']: #                     print("\n👋 Thank you for using the Legal Assistant. Goodbye!\n") #                     break #                 # Get answer #                 result = self.answer(user_input, verbose=True) #                 # Display answer #                 print(f"\n{'='*70}") #                 print("📖 ANSWER:") #                 print(f"{'='*70}") #                 print(result['answer']) #                 if result['sources']: #                     print(f"\n📚 SOURCES:") #                     for i, source in enumerate(result['sources'], 1): #                         print(f"  {i}. {source}") #                 print(f"\n📊 METADATA:") #                 print(f"  • Retrieval Time: {result['retrieval_time']:.2f}s") #                 print(f"  • Generation Time: {result['generation_time']:.1f}s") #                 print(f"  • Total Time: {result['total_time']:.1f}s") #                 print(f"  • Confidence: {result['confidence'].upper()}") #                 print(f"  • Relevance Score: {result.get('avg_relevance_score', 0):.2f}") #                 print("="*70) #             except KeyboardInterrupt: #                 print("\n\n👋 Goodbye!\n") #                 break #             except Exception as e: #                 print(f"\n❌ Error: {e}\n") # # Initialize production chatbot # print("🎯 Initializing accurate legal chatbot...") # chatbot = AccurateLegalChatbot(llm=llm, retriever=retriever) # print("✅ Chatbot ready!") # # Run comprehensive test # print("\n" + "="*70) # print("🧪 COMPREHENSIVE TEST") # print("="*70) # test_queries = [ #     "What is the punishment for child abuse under POCSO Act?", #     "Explain Section 27 of NDPS Act", #     "What are the provisions for cybercrime in IT Act?" # ] # for query in test_queries: #     result = chatbot.answer(query, verbose=True) #     print(f"\n📖 Answer: {result['answer'][:200]}...") #     print(f"⏱️ Time: {result['total_time']:.1f}s | Confidence: {result['confidence']}") #     print("\n" + "-"*70) # print("\n✅ Tests complete!") # print("\n💡 To start interactive chat, run: chatbot.chat_loop()")

In [None]:
#### STEP 13: Smart Chatbot with Intent Detection & Status Bubbles ####

import gradio as gr
import time
import re

def detect_intent(query: str) -> str:
    """Detect if query is conversational or legal"""
    query_lower = query.lower().strip()

    # Legal keywords and patterns
    legal_indicators = [
        # Acts and laws
        'section', 'act', 'law', 'ipc', 'bns', 'bnns', 'pocso', 'ndps', 'crpc',
        'it act', 'rules','indian penal code', 'constitution',
        # Legal terms
        'punishment', 'penalty', 'offense', 'offence', 'crime', 'provision',
        'article', 'code', 'regulation', 'statute', 'judgment', 'court',
        'legal', 'rights', 'bail', 'arrest', 'trial', 'sentence',
        # Question patterns about law
        'what is the punishment', 'what are the provisions', 'explain section',
        'define', 'meaning of', 'interpretation'
    ]

    # Check for legal indicators
    for indicator in legal_indicators:
        if indicator in query_lower:
            return "LEGAL"

    # Check for section/article numbers
    if re.search(r'\b(section|sec|article|art)\s*\d+', query_lower):
        return "LEGAL"

    # Check for "what is X" where X might be legal term
    if re.match(r'what (is|are|does|means?)\s+\w+', query_lower):
        words = query_lower.split()
        # If query is longer than 4 words, likely legal
        if len(words) > 4:
            return "LEGAL"

    # Short queries or greetings = conversational
    if len(query.split()) <= 3:
        return "CONVERSATIONAL"

    # Default to conversational for unclear cases
    return "CONVERSATIONAL"


def conversational_response(query: str):
    """Handle conversational queries with LLM directly (no RAG)"""

    # Show thinking bubble
    yield "💭 **Thinking...**"

    # Simple conversational prompt
    prompt = f"""<start_of_turn>system
You are a helpful Indian legal AI assistant. Respond conversationally and helpfully in 2-3 sentences.<end_of_turn>
<start_of_turn>user
{query}<end_of_turn>
<start_of_turn>model
"""

    try:
        # Generate with streaming
        response_stream = chatbot.llm(
            prompt,
            max_tokens=150,
            temperature=0.7,
            top_p=0.9,
            repeat_penalty=1.1,
            stream=True,
            stop=["</s>", "<end_of_turn>", "\n\nUser:"]
        )

        full_response = ""
        token_count = 0

        for chunk in response_stream:
            token = chunk['choices'][0]['text']
            full_response += token
            token_count += 1

            # Update every 3 tokens
            if token_count % 3 == 0:
                clean_response = re.sub(r'<[^>]+>', '', full_response).strip()
                yield clean_response

        # Final cleanup
        full_response = re.sub(r'<[^>]+>', '', full_response).strip()
        yield full_response

    except Exception as e:
        yield f"I apologize, I encountered an error: {str(e)}"


def legal_rag_response(query: str):
    """Handle legal queries with full RAG pipeline"""

    # Status 1: Retrieving
    yield "🔍 **Retrieving relevant legal documents...**"

    retrieval_start = time.time()
    chunks_data = chatbot.retriever.retrieve(query, top_k=3)
    retrieval_time = time.time() - retrieval_start

    if not chunks_data or all(c['score'] < 0.3 for c in chunks_data):
        yield "❌ **No relevant legal documents found.**\n\nI couldn't find information about this in the available legal documents. Please try:\n- Rephrasing your question\n- Being more specific about the act or section\n- Asking about a different legal topic"
        return

    # Status 2: Show retrieved sources
    avg_score = sum(c['score'] for c in chunks_data) / len(chunks_data)

    sources_status = f"✅ **Retrieved {len(chunks_data)} relevant sections** ({retrieval_time:.2f}s)\n\n"
    sources_status += "📚 **Sources Found:**\n"
    for i, chunk in enumerate(chunks_data, 1):
        source = chunk['metadata']['source_file'].replace('.pdf', '')
        score = chunk['score']
        sources_status += f"  • {source} (Relevance: {score:.0%})\n"

    sources_status += "\n---\n\n"
    yield sources_status

    time.sleep(0.3)  # Brief pause for visual feedback

    # Status 3: Thinking/Analyzing
    yield sources_status + "💭 **Analyzing legal provisions...**"

    # Format context
    context_parts = []
    for i, data in enumerate(chunks_data, 1):
        meta = data['metadata']
        chunk = data['chunk']

        words = chunk.split()
        if len(words) > 250:
            chunk = ' '.join(words[:250]) + "..."

        source = meta.get('source_file', 'Unknown').replace('.pdf', '')
        section_info = ""
        if 'section_number' in meta:
            section_info = f" - {meta.get('section_type', 'Section')} {meta['section_number']}"

        context_parts.append(f"[Reference {i}: {source}{section_info}]\n{chunk}")

    context = "\n\n---\n\n".join(context_parts)

    # Create prompt
    prompt = f"""<start_of_turn>system
You are an expert on Indian law. Provide accurate, complete answers based ONLY on the legal references provided.

CRITICAL RULES:
1. Answer must be based exclusively on the provided references
2. Cite specific sections, acts, and provisions
3. Be thorough but concise
4. If the references don't fully answer the question, state what's missing<end_of_turn>
<start_of_turn>user
Legal References:
{context}

Question: {query}

Provide a complete and accurate answer based on the references above.<end_of_turn>
<start_of_turn>model
"""

    time.sleep(0.3)

    # Status 4: Generating answer (with streaming)
    yield sources_status + "✍️ **Generating answer...**\n\n---\n\n"

    generation_start = time.time()

    try:
        response_stream = chatbot.llm(
            prompt,
            max_tokens=300,
            temperature=0.1,
            top_p=0.9,
            repeat_penalty=1.15,
            stream=True,
            stop=["</s>", "<end_of_turn>", "\n\nQuestion:", "\n\nUser:"]
        )

        full_answer = ""
        token_count = 0

        # Stream the answer
        for chunk in response_stream:
            token = chunk['choices'][0]['text']
            full_answer += token
            token_count += 1

            if token_count % 3 == 0:
                clean_answer = re.sub(r'<[^>]+>', '', full_answer).strip()
                current_response = sources_status + "📖 **Answer:**\n\n" + clean_answer
                yield current_response

        generation_time = time.time() - generation_start

        # Final cleanup
        full_answer = re.sub(r'<start_of_turn>.*?<end_of_turn>', '', full_answer, flags=re.DOTALL)
        full_answer = re.sub(r'<[^>]+>', '', full_answer).strip()

        # Final response with metadata
        final_response = sources_status + f"📖 **Answer:**\n\n{full_answer}\n\n"
        final_response += "---\n\n"
        final_response += f"*⏱️ {generation_time:.1f}s | "
        final_response += f"🎯 Confidence: {('HIGH' if avg_score > 0.6 else 'MEDIUM' if avg_score > 0.4 else 'LOW')} | "
        final_response += f"📊 Relevance: {avg_score:.2f}*"

        yield final_response

    except Exception as e:
        error_msg = sources_status + f"\n\n❌ **Error:** {str(e)}"
        yield error_msg


def smart_chat(message: str, history: list):
    """Smart chat with intent detection"""

    # Detect intent
    intent = detect_intent(message)

    if intent == "CONVERSATIONAL":
        # Use LLM directly (no RAG)
        yield from conversational_response(message)
    else:
        # Use full RAG pipeline
        yield from legal_rag_response(message)


# Create Gradio interface (compatible version)
print("🚀 Creating smart chatbot interface...")

demo = gr.ChatInterface(
    fn=smart_chat,
    title="🏛️ Indian Legal AI Assistant",
    description="Ask questions about Indian legal documents (POCSO, NDPS, IT Act, BNS, BNNS, and 40+ other acts)",
    examples=[
        "Hello, how can you help me?",
        "What does POCSO stand for?",
        "What is the punishment for child abuse under POCSO Act?",
        "Explain Section 27 of NDPS Act",
        "What are the provisions for cybercrime in IT Act?",
        "Thank you for your help!"
    ],
    theme="soft"
)

print("✅ Smart chatbot ready!")
print("\n" + "="*70)
print("🎉 LAUNCHING SMART CHATBOT")
print("="*70)
print("\n✨ FEATURES:")
print("  ✅ Intent detection (conversational vs legal)")
print("  ✅ Direct LLM for simple questions")
print("  ✅ Full RAG for legal queries")
print("  ✅ Status bubbles (retrieving → thinking → generating)")
print("  ✅ Real-time streaming")
print("\n🌐 Launching...")
print("="*70 + "\n")

demo.launch(share=True, debug=True, show_error=True)

🚀 Creating smart chatbot interface...
✅ Smart chatbot ready!

🎉 LAUNCHING SMART CHATBOT

✨ FEATURES:
  ✅ Intent detection (conversational vs legal)
  ✅ Direct LLM for simple questions
  ✅ Full RAG for legal queries
  ✅ Status bubbles (retrieving → thinking → generating)
  ✅ Real-time streaming

🌐 Launching...



  self.chatbot = Chatbot(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4b6d4041b039bd7fa7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Llama.generate: 6 prefix-match hit, remaining 33 prompt tokens to eval
llama_perf_context_print:        load time =    4214.53 ms
llama_perf_context_print: prompt eval time =  174231.66 ms /   673 tokens (  258.89 ms per token,     3.86 tokens per second)
llama_perf_context_print:        eval time =   43342.41 ms /   119 runs   (  364.22 ms per token,     2.75 tokens per second)
llama_perf_context_print:       total time =   48183.81 ms /   792 tokens
llama_perf_context_print:    graphs reused =        115
Llama.generate: 31 prefix-match hit, remaining 8 prompt tokens to eval
llama_perf_context_print:        load time =    4214.53 ms
llama_perf_context_print: prompt eval time =    1997.06 ms /     8 tokens (  249.63 ms per token,     4.01 tokens per second)
llama_perf_context_print:        eval time =   38130.22 ms /   106 runs   (  359.72 ms per token,     2.78 tokens per second)
llama_perf_context_print:       total time =   40848.02 ms /   114 tokens
llama_perf_context_print:    gra

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://4b6d4041b039bd7fa7.gradio.live


