In [1]:
import os
from pathlib import Path
import re
import json
from collections import defaultdict, OrderedDict
import pprint

import docx
import PyPDF2
import pandas as pd
from bs4 import BeautifulSoup
import markdown

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [2]:
# Ensure NLTK data required is available (run once)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)

True

## Preprocesses a collection of at least 10 legal documents . Apply tokenization, stop word removal, and stemming or lemmatization

In [3]:
lemmatizer = WordNetLemmatizer()

# Helpful pretty-printer
pp = pprint.PrettyPrinter(width=140)

In [4]:
# Part 2: File reading utilities
def read_txt(path):
    return Path(path).read_text(encoding='utf8', errors='ignore')

def read_docx(path):
    doc = docx.Document(path)
    return "\n".join(p.text for p in doc.paragraphs)

def read_pdf(path):
    text_parts = []
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            txt = page.extract_text()
            if txt:
                text_parts.append(txt)
    return "\n".join(text_parts)

def read_csv(path):
    try:
        df = pd.read_csv(path, encoding='utf8', dtype=str, keep_default_na=False)
        # concat rows and columns to a single text
        rows = []
        for _, row in df.iterrows():
            rows.append(" ".join([str(x) for x in row.values if str(x).strip()]))
        return "\n".join(rows)
    except Exception:
        return read_txt(path)

READERS = {
    ".txt": read_txt,
    ".md": read_txt,
    ".docx": read_docx,
    ".pdf": read_pdf,
    ".csv": read_csv
}

def read_file_auto(path):
    p = Path(path)
    reader = READERS.get(p.suffix.lower(), read_txt)
    return reader(path)

In [5]:
# Part 3: Preprocessing helpers and pipeline

# Use NLTK English stopwords (you can extend with legal stopwords if desired)
STOPWORDS = set(stopwords.words('english'))

TOKEN_RE = re.compile(r"\w+")  # word tokens (alphanumeric)

In [6]:
def tokenize_text(text):
    # returns list of tokens lowercased
    return TOKEN_RE.findall(text.lower())

def lemmatize_list(tokens):
    # POS-agnostic lemmatization using WordNet lemmatizer (fast and portable)
    return [lemmatizer.lemmatize(t) for t in tokens]

def preprocess_document(path, doc_id=None):
    """
    Reads file, splits into paragraphs, sentences, full tokens, and builds position metadata:
    returns dict with keys: doc_id, text, paragraphs, sentences, tokens, lemmas, pos_to_sent, pos_to_para
    """
    text = read_file_auto(path)
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
    if not paragraphs:
        paragraphs = [text.strip()]

    # Build sentences list while maintaining paragraph boundaries:
    sentences = []
    sent_to_para = []  # map sentence index -> paragraph index
    for p_idx, para in enumerate(paragraphs):
        sents = sent_tokenize(para)
        for s in sents:
            sentences.append(s)
            sent_to_para.append(p_idx)

    # Full tokens across entire document (we'll use these positions)
    tokens = tokenize_text(text)            # original normalized tokens (lowercased)
    lemmas = lemmatize_list(tokens)        # lemmatized tokens aligned with tokens

    # Map token positions to sentence index and paragraph index.
    # We'll compute by scanning sentences and matching tokens in order to maintain robust mapping.
    pos_to_sent = {}   # token_position -> sentence_index
    pos_to_para = {}   # token_position -> paragraph_index
    token_index = 0
    for s_idx, s in enumerate(sentences):
        s_tokens = tokenize_text(s)
        for _ in s_tokens:
            # assign mapping for this token position
            pos_to_sent[token_index] = s_idx
            pos_to_para[token_index] = sent_to_para[s_idx]
            token_index += 1
    # Note: token counts from tokenize_text(text) and sum of sentence tokens should match in normal text.
    # If mismatch, we still keep mappings for positions we could assign.

    if doc_id is None:
        doc_id = Path(path).name

    return {
        "doc_id": doc_id,
        "path": str(path),
        "text": text,
        "paragraphs": paragraphs,
        "sentences": sentences,
        "tokens": tokens,
        "lemmas": lemmas,
        "pos_to_sent": pos_to_sent,
        "pos_to_para": pos_to_para
    }


## Construct a Positional inverted index that maps each term to document IDs and positional occurrences.(2)

In [14]:
# Part 4: Build positional inverted index

def build_positional_inverted_index(docs_preprocessed):
    """
    docs_preprocessed: list of dicts returned by preprocess_document
    Returns: inverted_index: dict { term: { doc_id: [positions] } }
    """
    index = defaultdict(lambda: defaultdict(list))
    for doc in docs_preprocessed:
        doc_id = doc['doc_id']
        lemmas = doc['lemmas']
        for pos, lemma in enumerate(lemmas):
            if lemma in STOPWORDS:
                continue
            index[lemma][doc_id].append(pos)
    # Convert postings lists to sorted lists
    for term in list(index.keys()):
        for doc_id in index[term]:
            index[term][doc_id] = sorted(index[term][doc_id])
    return index

def sorted_inverted_index_repr(index):
    # produce OrderedDict sorted by term
    return OrderedDict(sorted(((term, dict(sorted(postings.items()))) for term, postings in index.items()), key=lambda x: x[0]))

## Support Proximity-Based Search Operators - Implement query functionality that supports the following proximity search operators:
#### term1 /n term2: term1 and term2 must appear within n words of each other.
#### term1 /s term2: term1 and term2 must appear in the same sentence.
#### term1 /p term2: term1 and term2 must appear in the same paragraph.
##### Display Matching document IDs and Matching snippet text where the terms occur together.

In [15]:
# Part 5: Proximity and snippet utilities

def snippet_from_sent_idx(doc, sent_idx):
    # safe guard
    if 0 <= sent_idx < len(doc['sentences']):
        return doc['sentences'][sent_idx].strip()
    return ""

def snippet_from_para_idx(doc, para_idx):
    if 0 <= para_idx < len(doc['paragraphs']):
        return doc['paragraphs'][para_idx].strip()
    return ""

def proximity_within_n(index, term1, term2, n):
    """
    term1 /n term2: return list of (doc_id, pairs_of_positions, snippet)
    Uses two-pointer algorithm for positions lists.
    """
    t1 = term1.lower()
    t2 = term2.lower()
    results = []
    postings1 = index.get(t1, {})
    postings2 = index.get(t2, {})
    common_docs = set(postings1.keys()) & set(postings2.keys())
    for doc_id in common_docs:
        p1 = postings1[doc_id]
        p2 = postings2[doc_id]
        i, j = 0, 0
        pairs = []
        # two-pointer to find all position pairs whose distance <= n
        while i < len(p1) and j < len(p2):
            a, b = p1[i], p2[j]
            if a == b:
                # same position (rare), skip
                if a < b:
                    i += 1
                else:
                    j += 1
                continue
            if abs(a - b) <= n:
                pairs.append((a, b))
                # advance the smaller pointer to seek other combos
                if a < b:
                    i += 1
                else:
                    j += 1
            else:
                if a < b:
                    i += 1
                else:
                    j += 1
        if pairs:
            results.append((doc_id, pairs))
    return results

def proximity_same_sentence(index, docs_by_id, term1, term2):
    """
    term1 /s term2: check if both lemmas exist in the same sentence.
    We'll use pos_to_sent mapping to determine sentence membership.
    Returns list of (doc_id, sent_index, snippet)
    """
    t1 = term1.lower()
    t2 = term2.lower()
    results = []
    postings1 = index.get(t1, {})
    postings2 = index.get(t2, {})
    common_docs = set(postings1.keys()) & set(postings2.keys())
    for doc_id in common_docs:
        doc = docs_by_id[doc_id]
        # build set of sentence indices containing term1 and term2
        sent_idxs_t1 = { doc['pos_to_sent'].get(pos) for pos in postings1[doc_id] }
        sent_idxs_t2 = { doc['pos_to_sent'].get(pos) for pos in postings2[doc_id] }
        # intersection ignoring None
        intersect = {s for s in sent_idxs_t1 if s is not None} & {s for s in sent_idxs_t2 if s is not None}
        for sidx in sorted(intersect):
            results.append((doc_id, sidx, snippet_from_sent_idx(doc, sidx)))
    return results

def proximity_same_paragraph(index, docs_by_id, term1, term2):
    """
    term1 /p term2: check if both lemmas exist in the same paragraph.
    """
    t1 = term1.lower()
    t2 = term2.lower()
    results = []
    postings1 = index.get(t1, {})
    postings2 = index.get(t2, {})
    common_docs = set(postings1.keys()) & set(postings2.keys())
    for doc_id in common_docs:
        doc = docs_by_id[doc_id]
        para_idxs_t1 = { doc['pos_to_para'].get(pos) for pos in postings1[doc_id] }
        para_idxs_t2 = { doc['pos_to_para'].get(pos) for pos in postings2[doc_id] }
        intersect = {p for p in para_idxs_t1 if p is not None} & {p for p in para_idxs_t2 if p is not None}
        for pidx in sorted(intersect):
            results.append((doc_id, pidx, snippet_from_para_idx(doc, pidx)))
    return results

def extract_snippet_from_positions(doc, pos_pair, window_tokens=8):
    # returns a short snippet (tokens) around the positions
    tokens = doc['tokens']
    a, b = pos_pair
    start = max(0, min(a, b) - window_tokens)
    end = min(len(tokens), max(a, b) + window_tokens + 1)
    return " ".join(tokens[start:end])


## Implement Phrase Query Support - that:
##### ·        Accepts a query like "right to counsel" or "due process"

##### ·        Returns documents where all terms in the phrase appear in the same order and consecutively

In [16]:
# Part 6: Phrase query using positional index

def phrase_query(index, docs_by_id, phrase):
    """
    phrase: string like "right to counsel"
    Steps:
     - tokenize+lemmatize phrase terms (using same pipeline)
     - for candidate docs that contain first term, scan positions to see if consecutive positions match phrase lemmas
    Returns: list of (doc_id, snippet)
    """
    phrase_tokens = [t for t in TOKEN_RE.findall(phrase.lower()) if t not in STOPWORDS]
    if not phrase_tokens:
        return []
    phrase_lemmas = [lemmatizer.lemmatize(t) for t in phrase_tokens]
    first = phrase_lemmas[0]
    results = []
    postings_first = index.get(first, {})
    for doc_id, positions in postings_first.items():
        doc = docs_by_id[doc_id]
        # To avoid scanning entire doc tokens for each phrase naive way, we'll use position membership check:
        # Build a set for quick lookup of lemma at position
        # We rely on doc['lemmas'] aligned to token positions.
        lemmas = doc['lemmas']
        L = len(phrase_lemmas)
        for p in positions:
            # check bounds
            if p + L > len(lemmas):
                continue
            ok = True
            for offset in range(L):
                if lemmas[p + offset] != phrase_lemmas[offset]:
                    ok = False
                    break
            if ok:
                # found exact consecutive phrase starting at p
                # get sentence snippet for the sentence containing p
                s_idx = doc['pos_to_sent'].get(p)
                snippet = snippet_from_sent_idx(doc, s_idx) if s_idx is not None else extract_snippet_from_positions(doc, (p, p+L-1))
                results.append((doc_id, snippet))
                break  # one match per doc is sufficient
    return results

In [17]:
# Part 7: Full pipeline: load files, preprocess, build index, display inverted index, run queries

DATA_DIR = Path("./data")  # ensure your 10 files are here

# 1. Collect files
files = sorted([p for p in DATA_DIR.iterdir() if p.is_file()])
print("Files found:", len(files))
for f in files:
    print(" -", f.name)

Files found: 9
 - BenchOpinion.md
 - indian_supreme_court_judgments.csv
 - Martin_v_Franklin_Capital_Corp.md
 - Sanjay Rajpoot vs Ram Singh on 11 F.txt
 - SanjayDuttvsTheStateOfHaryana.txt
 - Vinay_Aggarwal_vs_State_Of_Haryana_on_2_April_2025_1.docx
 - Vipin_Kumar_vs_Jaydeep_on_21_January_2025_1.docx
 - Vishal_Shah_vs_Monalisha_Gupta_on_20_February_2025_1.PDF
 - Vishnoo_Mittal_vs_M_S_Shakti_Trading_Company_on_17_March_2025_1.PDF


In [18]:
# 2. Preprocess all files
docs = []
docs_by_id = {}
for p in files:
    doc = preprocess_document(p, doc_id=p.name)
    docs.append(doc)
    docs_by_id[doc['doc_id']] = doc

In [10]:
print("\nPreprocessing complete. Example document keys and sizes:")
for doc in docs:
    print(f"Doc: {doc['doc_id']}, tokens: {len(doc['tokens'])}, sentences: {len(doc['sentences'])}, paragraphs: {len(doc['paragraphs'])}")


Preprocessing complete. Example document keys and sizes:
Doc: BenchOpinion.md, tokens: 817, sentences: 34, paragraphs: 4
Doc: indian_supreme_court_judgments.csv, tokens: 5191404, sentences: 164023, paragraphs: 1
Doc: Martin_v_Franklin_Capital_Corp.md, tokens: 870, sentences: 55, paragraphs: 26
Doc: Sanjay Rajpoot vs Ram Singh on 11 F.txt, tokens: 1287, sentences: 57, paragraphs: 1
Doc: SanjayDuttvsTheStateOfHaryana.txt, tokens: 3525, sentences: 129, paragraphs: 1
Doc: Vinay_Aggarwal_vs_State_Of_Haryana_on_2_April_2025_1.docx, tokens: 1966, sentences: 66, paragraphs: 1
Doc: Vipin_Kumar_vs_Jaydeep_on_21_January_2025_1.docx, tokens: 4754, sentences: 209, paragraphs: 1
Doc: Vishal_Shah_vs_Monalisha_Gupta_on_20_February_2025_1.PDF, tokens: 6866, sentences: 424, paragraphs: 1
Doc: Vishnoo_Mittal_vs_M_S_Shakti_Trading_Company_on_17_March_2025_1.PDF, tokens: 2505, sentences: 77, paragraphs: 1


In [19]:

# 3. Build positional inverted index
index = build_positional_inverted_index(docs)

# 4. Display sorted inverted index (terms in alphabetical order)
sorted_index = sorted_inverted_index_repr(index)
print("\n--- Inverted Index (first 60 terms shown) ---")
count = 0
for term, postings in sorted_index.items():
    print(f"{term} -> {postings}")
    count += 1
    if count >= 60:
        break
print(f"... (vocab size = {len(sorted_index)})")

# 5. Example Proximity queries
print("\n=== Example Proximity Queries ===")


--- Inverted Index (first 60 terms shown) ---
0 -> {'indian_supreme_court_judgments.csv': [139316, 148309, 329832, 329951, 331472, 331591, 355617, 355619, 355621, 355622, 355623, 370317, 370319, 370321, 370322, 370323, 385027, 385029, 385031, 385032, 385033, 399727, 399729, 399731, 399732, 399733, 550115, 550590, 555551, 556026, 560997, 561472, 566433, 566908, 651509, 659494, 685502, 685534, 685768, 685824, 696128, 696711, 701736, 701768, 702002, 702058, 712362, 712945, 717980, 718012, 718246, 718302, 728606, 729189, 734214, 734246, 734480, 734536, 744840, 745423, 767572, 774283, 845499, 847643, 853864, 862396, 866203, 866241, 866267, 866354, 866370, 866379, 866393, 866501, 866574, 866576, 866581, 866583, 866601, 866603, 866611, 866613, 866625, 866627, 866703, 867827, 867865, 867891, 867978, 867994, 868003, 868017, 868125, 868198, 868200, 868205, 868207, 868225, 868227, 868235, 868237, 868249, 868251, 868327, 893159, 896127, 1074725, 1080480, 1203812, 1203823, 1203978, 1205463, 120612

In [20]:
# Example: counsel /10 ineffective
q1 = ("counsel", "/10", "ineffective")
if re.match(r"/\d+", q1[1]):
    n = int(q1[1][1:])
    res = proximity_within_n(index, q1[0].lower(), q1[2].lower(), n)
    print(f"\nQuery: {q1[0]} {q1[1]} {q1[2]} -> Matches: {len(res)}")
    for doc_id, pairs in res:
        doc = docs_by_id[doc_id]
        snippet = extract_snippet_from_positions(doc, pairs[0])
        print(f"{doc_id}: pairs={pairs[:5]} snippet=\"{snippet}\"")


Query: counsel /10 ineffective -> Matches: 1
indian_supreme_court_judgments.csv: pairs=[(2161476, 2161469)] snippet="thereby rendering the hearing itself as meaningless and ineffective 34 mr sanjay hegde learned senior counsel appearing for the petitioners further submitted that the"


In [21]:
# Example: due /s process
res_s = proximity_same_sentence(index, docs_by_id, "due", "process")
print(f"\nQuery: due /s process -> Matches: {len(res_s)}")
for doc_id, sidx, snippet in res_s:
    print(f"{doc_id}: sentence_index={sidx} snippet=\"{snippet}\"")


Query: due /s process -> Matches: 59
indian_supreme_court_judgments.csv: sentence_index=1812 snippet="That in respect of violation of various provisions of the MMDR Act and the Rules made thereunder, when a Magistrate passes an order under Section 156(3) of the Code and directs the In-charge/SHO of the police station concerned to register/lodge the crime case/FIR in respect of the violation of various provisions of the Act and the Rules made thereunder and thereafter after investigation the In-charge of the police station/investigating officer concerned submits a report, the same can be sent to the Magistrate concerned as well as to the authorised officer concerned as mentioned in Section 22 of the MMDR Act and thereafter the authorised officer concerned may file the complaint before the learned Magistrate along with the report submitted by the investigating officer concerned and thereafter it will be open for the learned Magistrate to take cognizance after following due procedure, is

In [21]:
# Example: child /p visitation
res_p = proximity_same_paragraph(index, docs_by_id, "child", "visitation")
print(f"\nQuery: child /p visitation -> Matches: {len(res_p)}")
for doc_id, pidx, snippet in res_p:
    print(f"{doc_id}: para_index={pidx} snippet=\"{snippet[:200]}\"")


Query: child /p visitation -> Matches: 1
indian_supreme_court_judgments.csv: para_index=0 snippet="5-2021_MA-000083-2021 the Government of India the 2020-01-09 background in which the above applications have been filed. The definition of ‘gross revenue’ as defined in Clause 19.1 of the licence agre"


In [22]:
# 6. Example phrase queries
print("\n=== Example Phrase Queries ===")
phrases = ["right to counsel", "due process"]
for ph in phrases:
    res_ph = phrase_query(index, docs_by_id, ph)
    print(f"\nPhrase: \"{ph}\" -> Matches: {len(res_ph)}")
    for doc_id, snippet in res_ph:
        print(f"{doc_id}: \"{snippet}\"")


=== Example Phrase Queries ===

Phrase: "right to counsel" -> Matches: 0

Phrase: "due process" -> Matches: 1
indian_supreme_court_judgments.csv: "The impugned orders were passed by the Family Court without following due process of law and in breach of principles of natural justice, in the matters of discharging his advocate and not issuing notice to the appellant even thereafter, calling upon him to make alternative arrangements, and moreso in allowing transposition of the respondent as petitioner and appellant as respondent and on the same day to declare her (respondent) as the sole, exclusive and absolute guardian and custodian of the minor child."
