In [129]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
import re

In [130]:
loader = PyPDFLoader('constitution.pdf')

In [131]:
page_ranges = {
    "Part II - Citizenship": (34, 35),
    "Part III - Fundamental Rights": (36, 50),
    "Part VI - High Courts (Arts. 214-231)": (101, 140),
    "Part IX - Panchayats": (149, 156),
    "Part IXA - Municipalities": (157, 166),
    "Part XIVA - Tribunals": (214, 215),
}

In [132]:
docs = loader.load()

In [133]:
raw_sections = {}

for section, (start, end) in page_ranges.items():
    pages = []
    for doc in docs:
        page_num = doc.metadata["page"]
        if start <= page_num <= end:
            text = doc.page_content or ""
            lines = text.splitlines()

            # 🧼 Remove top two lines if they match page number + heading pattern
            if len(lines) >= 2:
                first_line = lines[0].strip()
                second_line = lines[1].strip()

                if (
                    re.match(r"^\d{1,3}\s+THE CONSTITUTION OF INDIA", first_line, re.IGNORECASE)
                    and re.match(r"^\(Part\s+[IVXLCDM]+\.", second_line, re.IGNORECASE)
                ):
                    lines = lines[2:]  # Remove both lines

            cleaned_text = "\n".join(lines).strip()
            pages.append(cleaned_text)

    raw_sections[section] = "\n".join(pages).strip()

In [134]:
def remove_embedded_headers(text):
    """
    Removes in-text page headers like:
    - THE CONSTITUTION OF INDIA(Part XYZ.—Title)123
    - THE CONSTITUTION OF INDIA
      (Part XYZ.—Title) 123
    - PART XYZ.—TITLE (standalone)
    """

    # 🔹 Case 1: All-in-one header line (with page number at end)
    text = re.sub(
        r"THE CONSTITUTION OF\s+INDIA\s*\(Part\s+[IVXLCDMA]+\s*\.?—.*?\)\s*\d{1,3}",
        "",
        text,
        flags=re.IGNORECASE
    )

    # 🔹 Case 2: Two-line headers (split header + page number)
    text = re.sub(
        r"THE CONSTITUTION OF\s+INDIA\s*\n?\s*\(Part\s+[IVXLCDMA]+\s*\.?—.*?\)\s*\n?\s*\d{1,3}",
        "",
        text,
        flags=re.IGNORECASE
    )

    # 🔹 Case 3: Standalone uppercase lines like "PART IXA.—THE MUNICIPALITIES"
    text = re.sub(
        r"^PART\s+[IVXLCDMA]+\s*\.?—.*$", "", text, flags=re.IGNORECASE | re.MULTILINE
    )

    # 🔹 Case 4: Standalone lines with only a number (page numbers)
    text = re.sub(r"^\s*\d{1,3}\s*$", "", text, flags=re.MULTILINE)
    
    text = re.sub(
    r"^PART\s*[IVXLCDMA]+\s*[A-Z\s]{3,}$", "", text, flags=re.IGNORECASE | re.MULTILINE
    )

    return text.strip()

In [135]:
def clean_footnotes(text):
    """
    Removes amendment-style footnotes and annotations without touching valid article text.
    """

    # ✅ Remove only lines that:
    # - Start with optional underscores/spaces
    # - Start with a number + dot (1. ... 99.)
    # - Contain footnote keywords
    # - Are *not* article numbers like 14. Right to equality
    lines = text.splitlines()
    cleaned_lines = []

    for line in lines:
        stripped = line.strip()

        # Match real amendment-style footnote lines only
        is_footnote = re.match(
            r"^[_\s]*\d{1,2}\.\s+(Ins\.|Subs\.|Amendment|Act|w\.e\.f\.|C\.O\.|ibid|sub-clause|re-lettered|struck down|omitted|inserted|Supreme Court)",
            stripped,
            flags=re.IGNORECASE
        )

        if not is_footnote:
            cleaned_lines.append(line)

    # Remove long underlines
    text = "\n".join(cleaned_lines)
    text = re.sub(r"_{5,}", "", text)  # Only removes long underlines (not em dashes etc.)

    return text.strip()

In [136]:
for title in raw_sections:
    raw_sections[title] = remove_embedded_headers(raw_sections[title])
    raw_sections[title] = clean_footnotes(raw_sections[title])

In [137]:
for title, content in raw_sections.items():
    print(f"\n--- {title} ---")
    print(content, "...\n")
    # For testing purpose


--- Part II - Citizenship ---
PART IICITIZENSHIP 5. Citizenship at the commencement of the Constitution.—At the commencement of this Constitution, every person who has his domicile in the territory of India and—(a) who was born in the territory of India; or (b) either of whose parents was born in the territory of India; or(c) who has been ordinarily resident in the territory of India for not less than five years immediately preceding such commencement,  shall be a citizen of India.6. Rights of citizenship of certain persons who have migrated to India from Pakistan.—Notwithstanding anything in article 5, a person who has migrated to the territory of India from the territory now included in Pakistan shall be deemed to be a citizen of India at the commencement of this Constitution if—(a) he or either of his parents or any of his grand-parents was born in India as defined in the Government of India Act, 1935 (as originally enacted); and(b)(i) in the case where such person has so migrated 

In [138]:
def extract_articles(section_text: str, part_title: str):
    """
    Splits the given section text into structured articles with metadata.
    """
    articles = []

    # Regex to find article headers: "14. Title—"
    article_pattern = re.compile(r"(?=\n?\s*(\d+[A-Z]?)\.\s+([^\n—]+)[—\-–])")

    # Find all matches and their start positions
    matches = list(article_pattern.finditer(section_text))

    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(section_text)

        article_number = match.group(1).strip()
        article_text = section_text[start:end].strip()

        articles.append({
            "text": article_text,
            "metadata": {
                "article": article_number,
                "part": part_title,
                "tags": []  # optional: add custom tag inference later
            }
        })

    return articles

In [139]:
all_articles = []

for part_title, section_text in raw_sections.items():
    if "Schedule VII" in part_title:
        # Split into List I, II, III
        schedule_pattern = r"(List\s+I\s*[--]?\s*Union List|List\s+II\s*[--]?\s*State List|List\s+III\s*[--]?\s*Concurrent List)"
        lists = re.split(schedule_pattern, section_text)
        lists = [l.strip() for l in lists if l.strip()]
        for i in range(1, len(lists), 2):
            list_name = lists[i].strip()
            list_content = lists[i + 1].strip()
            all_articles.append({
                "text": f"{list_name}\n{list_content}",
                "metadata": {
                    "article": list_name,
                    "part": part_title,
                    "tags": []
                }
            })
    else:
        articles = extract_articles(section_text, part_title)
        all_articles.extend(articles)

In [142]:
# testing
for i, article in enumerate(all_articles):
    print("Article Number:", article["metadata"].get("article", "N/A"))
    print("Part          :", article["metadata"].get("part", "N/A"))
    print("Tags          :", article["metadata"].get("tags", []))
    print("Text (first 300 chars):")
    print(article["text"], "...")


Article Number: 5
Part          : Part II - Citizenship
Tags          : []
Text (first 300 chars):
 ...
Article Number: 5
Part          : Part II - Citizenship
Tags          : []
Text (first 300 chars):
5. Citizenship at the commencement of the Constitution.—At the commencement of this Constitution, every person who has his domicile in the territory of India and—(a) who was born in the territory of India; or (b) either of whose parents was born in the territory of India; or(c) who has been ordinarily resident in the territory of India for not less than five years immediately preceding such commencement,  shall be a citizen of India. ...
Article Number: 6
Part          : Part II - Citizenship
Tags          : []
Text (first 300 chars):
6. Rights of citizenship of certain persons who have migrated to India from Pakistan.—Notwithstanding anything in article 5, a person who has migrated to the territory of India from the territory now included in Pakistan shall be deemed to be a citizen of 