In [6]:
import json
import re
from typing import List, Dict
import os # Make sure this is at the top of your file


MAX_CHUNK_LENGTH = 2000
OVERLAP = 100

chapter_pattern = re.compile(r'^CHAPTER\s+[IVXLCDM]+\s+.*', re.IGNORECASE)
section_pattern = re.compile(r'^\s*(\d{1,3}[A-Z]?)\.\s+(.+)$')
subsection_pattern = re.compile(r'^\s*\(\d+\)|^\s*[a-z]\)')
amendment_note_pattern = re.compile(r'^(Subs\.|Ins\.|Omitted|Amend\.|Rep\.)', re.IGNORECASE)

def split_with_overlap(text: str, max_len: int = MAX_CHUNK_LENGTH, overlap: int = OVERLAP) -> List[str]:
    if len(text) <= max_len:
        return [text]
    parts = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        parts.append(text[start:end])
        if end == len(text):
            break
        start = end - overlap if end - overlap > start else end
    return parts


def split_into_subsections(section_text: str) -> List[str]:
    """Split section into subsections if subsection markers exist."""
    lines = section_text.split("\n")
    subsections = []
    current = []
    for line in lines:
        if subsection_pattern.match(line) and current:
            subsections.append("\n".join(current).strip())
            current = [line]
        else:
            current.append(line)
    if current:
        subsections.append("\n".join(current).strip())
    return subsections

def create_law_chunks(file_path: str, law_name: str, doc_id: str) -> List[Dict]:
    chunks = []
    current_section_number = None
    current_section_title = None
    current_chapter_heading = None
    current_body = []

    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            if chapter_pattern.match(line):
                current_chapter_heading = line
                continue

            section_match = section_pattern.match(line)
            if section_match and not amendment_note_pattern.match(section_match.group(2)):
                # save previous section
                if current_section_number and current_body:
                    section_text = "\n".join(current_body)
                    subsections = split_into_subsections(section_text)
                    for sub_idx, sub in enumerate(subsections):
                        split_parts = split_with_overlap(sub)
                        for idx, part in enumerate(split_parts):
                            chunk_id = f"{doc_id}_{current_section_number}_{sub_idx}_{idx}"
                            chunks.append({
                                "chunk_id": chunk_id,
                                "doc_id": doc_id,
                                "text": part,
                                "metadata": {
                                    "chapter": current_chapter_heading,
                                    "section_number": current_section_number,
                                    "section_title": current_section_title,
                                    "law_name": law_name
                                }
                            })

                # start new section
                current_section_number = section_match.group(1).strip()
                current_section_title = section_match.group(2).strip()
                current_body = [line]
            else:
                if current_section_number:
                    current_body.append(line)

    # save last section
    if current_section_number and current_body:
        section_text = "\n".join(current_body)
        subsections = split_into_subsections(section_text)
        for sub_idx, sub in enumerate(subsections):
            split_parts = split_with_overlap(sub)
            for idx, part in enumerate(split_parts):
                chunk_id = f"{doc_id}_{current_section_number}_{sub_idx}_{idx}"
                chunks.append({
                    "chunk_id": chunk_id,
                    "doc_id": doc_id,
                    "text": part,
                    "metadata": {
                        "chapter": current_chapter_heading,
                        "section_number": current_section_number,
                        "section_title": current_section_title,
                        "law_name": law_name
                    }
                })

    return chunks
laws = [
    {"input_file": "BNS.txt", "law_name": "BNS", "doc_id": "bns_2023"},
    {"input_file": "BNSS.txt", "law_name": "BNSS", "doc_id": "bnss_2023"},
    {"input_file": "BSA.txt", "law_name": "BSA", "doc_id": "bsa_2023"},
    {"input_file": "CrPC.txt", "law_name": "CrPC", "doc_id": "crpc_1973"},
    {"input_file": "IEA.txt", "law_name": "IEA", "doc_id": "iea_1872"},
    {"input_file": "IPC.txt", "law_name": "IPC", "doc_id": "ipc_1860"},
]
# Create the 'data' directory if it doesn't exist
if not os.path.exists('data'):
    os.makedirs('data')

for law in laws:
    input_path = os.path.join("data", law["input_file"])
    chunks = create_law_chunks(input_path, law["law_name"], law["doc_id"])
    
    out_file = f"{law['law_name'].lower()}.json"
    output_path = os.path.join("data", out_file)
    
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)
    print(f"✅ {law['law_name']} → {output_path} ({len(chunks)} chunks)")

✅ BNS → data/bns.json (450 chunks)
✅ BNSS → data/bnss.json (1176 chunks)
✅ BSA → data/bsa.json (164 chunks)
✅ CrPC → data/crpc.json (1946 chunks)
✅ IEA → data/iea.json (467 chunks)
✅ IPC → data/ipc.json (1271 chunks)


In [15]:
import os
import re
import json
import pdfplumber

# =========================
# Setup paths
# =========================
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_DIR = os.path.join(BASE_DIR, "data")

print("Looking for PDFs in:", DATA_DIR)
if os.path.exists(DATA_DIR):
    print("Files found:", os.listdir(DATA_DIR))
else:
    print("⚠️ DATA_DIR does not exist")

# =========================
# Main processing
# =========================
def split_subject_summary(text):
    """Split text into Subject and Summary based on first period."""
    if not text:
        return "", ""
    text = text.strip()
    # Split by first period
    parts = text.split(".", 1)
    subject = parts[0].strip()
    summary = parts[1].strip() if len(parts) > 1 else ""
    return subject, summary

def process_mappings():
    all_chunks = []

    pdf_files = [f for f in os.listdir(DATA_DIR) if f.lower().endswith(".pdf")]
    if not pdf_files:
        print(f"⚠️ No PDF files found in {DATA_DIR}")
        return

    for file_name in pdf_files:
        file_path = os.path.join(DATA_DIR, file_name)

        try:
            new_law_prefix, old_law_prefix = os.path.splitext(file_name)[0].split("_to_")
        except ValueError:
            new_law_prefix = "NEW"
            old_law_prefix = "OLD"

        with pdfplumber.open(file_path) as pdf:
            prev_subject = ""
            prev_old_section = ""
            prev_summary = ""

            for page in pdf.pages:
                tables = page.extract_tables()
                for table in tables:
                    for row in table:
                        if not row or not row[0]:
                            continue

                        first_cell = str(row[0]).strip()
                        if not re.match(r'^\s*\d+', first_cell):
                            # continuation row → append to previous summary
                            extra_text = " ".join([str(c).strip() for c in row[1:] if c]).strip()
                            if extra_text and all_chunks:
                                all_chunks[-1]['fields']['Summary_of_comparison'] += " " + extra_text
                            continue

                        new_section = first_cell
                        remaining_text = [str(c).strip() for c in row[1:] if c and str(c).strip()]

                        # Combine remaining text as one string
                        combined_text = " ".join(remaining_text)

                        # Split into subject and summary
                        subject, summary = split_subject_summary(combined_text)

                        old_section = ""
                        # Detect first numeric in remaining_text as Old Section
                        for cell in remaining_text:
                            if re.match(r'^\d+', cell):
                                old_section = cell
                                break

                        # Default old_section to prefix if not found
                        if not old_section:
                            old_section = ""

                        chunk_id = f"{file_name.replace('.pdf','')}_{new_section}"

                        chunks_to_add = {
                            "source_file": file_name,
                            "chunk_id": chunk_id,
                            "fields": {
                                "New_Law_Section": f"{new_law_prefix} {new_section}",
                                "Old_Law_Section": f"{old_law_prefix} {old_section}" if old_section else old_law_prefix,
                                "Subject": subject,
                                "Summary_of_comparison": summary
                            }
                        }

                        all_chunks.append(chunks_to_add)

        print(f"✅ Extracted {len(all_chunks)} rows from {file_path}")

    output_json = os.path.join(DATA_DIR, "mapping_of_laws.json")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)

    print(f"📦 Saved mappings to → {output_json} ({len(all_chunks)} rows total)")

# =========================
# Run
# =========================
if __name__ == "__main__":
    process_mappings()


Looking for PDFs in: /home/prince/Desktop/llm-project/LegalMind/LegalMind/data
Files found: ['IPC.txt', 'IEA.txt', 'BSA to IEA.txt', 'BNS_to_IPC.pdf', 'BNSS.txt', 'crpc.json', 'BNSS to CrPC.txt', 'BSA.txt', 'BSA_to_IEA.pdf', 'BNS.txt', 'bns.json', 'CrPC.txt', 'bnss.json', 'ipc.json', 'bsa.json', 'iea.json', 'BNSS_to_CrPC.pdf', 'BNS to IPC.txt']
✅ Extracted 532 rows from /home/prince/Desktop/llm-project/LegalMind/LegalMind/data/BNS_to_IPC.pdf
✅ Extracted 711 rows from /home/prince/Desktop/llm-project/LegalMind/LegalMind/data/BSA_to_IEA.pdf
✅ Extracted 1237 rows from /home/prince/Desktop/llm-project/LegalMind/LegalMind/data/BNSS_to_CrPC.pdf
📦 Saved mappings to → /home/prince/Desktop/llm-project/LegalMind/LegalMind/data/mapping_of_laws.json (1237 rows total)


In [22]:
import os
import openai
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://api.llm7.io/v1"
)