In [36]:
import os
from pathlib import Path
import re
import json
from collections import defaultdict, OrderedDict
import pprint

import docx
import PyPDF2
import pandas as pd
from bs4 import BeautifulSoup
import markdown

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [37]:
# Ensure NLTK data required is available (run once)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)

True

## Preprocesses a collection of at least 10 legal documents . Apply tokenization, stop word removal, and stemming or lemmatization

In [38]:
lemmatizer = WordNetLemmatizer()

# Helpful pretty-printer
 
pp = pprint.PrettyPrinter(width=140)

def print_doc_summary(doc, tokens_preview=30):
    """Print a compact, readable summary of a preprocessed document dict."""
    print('\n' + '='*100)
    print(f"Document: {doc.get('doc_id')}")
    print('-'*100)
    print(f"Path: {doc.get('path')}")
    print(f"Tokens: {len(doc.get('tokens', []))}, Sentences: {len(doc.get('sentences', []))}, Paragraphs: {len(doc.get('paragraphs', []))}")
    print()
    tokens = doc.get('tokens', [])
    lemmas = doc.get('lemmas', [])
    preview = ' '.join(tokens[:tokens_preview])
    preview_lemmas = ' '.join(lemmas[:tokens_preview])
    print('Tokens preview:')
    print(preview)
    print()
    print('Lemmas preview:')
    print(preview_lemmas)
    print('='*100 + '\n')

In [39]:
# Part 2: File reading utilities
def read_txt(path):
    return Path(path).read_text(encoding='utf8', errors='ignore')

def read_docx(path):
    doc = docx.Document(path)
    return "\n".join(p.text for p in doc.paragraphs)

def read_pdf(path):
    text_parts = []
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            txt = page.extract_text()
            if txt:
                text_parts.append(txt)
    return "\n".join(text_parts)

def read_csv(path):
    try:
        df = pd.read_csv(path, encoding='utf8', dtype=str, keep_default_na=False)
        # concat rows and columns to a single text
        rows = []
        for _, row in df.iterrows():
            rows.append(" ".join([str(x) for x in row.values if str(x).strip()]))
        return "\n".join(rows)
    except Exception:
        return read_txt(path)

READERS = {
    ".txt": read_txt,
    ".md": read_txt,
    ".docx": read_docx,
    ".pdf": read_pdf,
    ".csv": read_csv
}

def read_file_auto(path):
    p = Path(path)
    reader = READERS.get(p.suffix.lower(), read_txt)
    return reader(path)

In [40]:
# Part 3: Preprocessing helpers and pipeline

# Use NLTK English stopwords (you can extend with legal stopwords if desired)
STOPWORDS = set(stopwords.words('english'))

TOKEN_RE = re.compile(r"\w+")  # word tokens (alphanumeric)

In [41]:
def tokenize_text(text):
    # returns list of tokens lowercased
    return TOKEN_RE.findall(text.lower())

def lemmatize_list(tokens):
    # POS-agnostic lemmatization using WordNet lemmatizer (fast and portable)
    return [lemmatizer.lemmatize(t) for t in tokens]

def preprocess_document(path, doc_id=None):
    """
    Reads file, splits into paragraphs, sentences, full tokens, and builds position metadata:
    returns dict with keys: doc_id, text, paragraphs, sentences, tokens, lemmas, pos_to_sent, pos_to_para
    """
    text = read_file_auto(path)
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
    if not paragraphs:
        paragraphs = [text.strip()]

    # Build sentences list while maintaining paragraph boundaries:
    sentences = []
    sent_to_para = []  # map sentence index -> paragraph index
    for p_idx, para in enumerate(paragraphs):
        sents = sent_tokenize(para)
        for s in sents:
            sentences.append(s)
            sent_to_para.append(p_idx)

    # Full tokens across entire document (we'll use these positions)
    tokens = tokenize_text(text)            # original normalized tokens (lowercased)
    lemmas = lemmatize_list(tokens)        # lemmatized tokens aligned with tokens

    # Map token positions to sentence index and paragraph index.
    # We'll compute by scanning sentences and matching tokens in order to maintain robust mapping.
    pos_to_sent = {}   # token_position -> sentence_index
    pos_to_para = {}   # token_position -> paragraph_index
    token_index = 0
    for s_idx, s in enumerate(sentences):
        s_tokens = tokenize_text(s)
        for _ in s_tokens:
            # assign mapping for this token position
            pos_to_sent[token_index] = s_idx
            pos_to_para[token_index] = sent_to_para[s_idx]
            token_index += 1
    # Note: token counts from tokenize_text(text) and sum of sentence tokens should match in normal text.
    # If mismatch, we still keep mappings for positions we could assign.

    if doc_id is None:
        doc_id = Path(path).name

    return {
        "doc_id": doc_id,
        "path": str(path),
        "text": text,
        "paragraphs": paragraphs,
        "sentences": sentences,
        "tokens": tokens,
        "lemmas": lemmas,
        "pos_to_sent": pos_to_sent,
        "pos_to_para": pos_to_para
    }


## Construct a Positional inverted index that maps each term to document IDs and positional occurrences.(2)

In [42]:
# Part 4: Build positional inverted index

def build_positional_inverted_index(docs_preprocessed):
    """
    docs_preprocessed: list of dicts returned by preprocess_document
    Returns: inverted_index: dict { term: { doc_id: [positions] } }
    """
    index = defaultdict(lambda: defaultdict(list))
    for doc in docs_preprocessed:
        doc_id = doc['doc_id']
        lemmas = doc['lemmas']
        for pos, lemma in enumerate(lemmas):
            if lemma in STOPWORDS:
                continue
            index[lemma][doc_id].append(pos)
    # Convert postings lists to sorted lists
    for term in list(index.keys()):
        for doc_id in index[term]:
            index[term][doc_id] = sorted(index[term][doc_id])
    return index

def sorted_inverted_index_repr(index):
    # produce OrderedDict sorted by term
    return OrderedDict(sorted(((term, dict(sorted(postings.items()))) for term, postings in index.items()), key=lambda x: x[0]))

## Support Proximity-Based Search Operators - Implement query functionality that supports the following proximity search operators:
#### term1 /n term2: term1 and term2 must appear within n words of each other.
#### term1 /s term2: term1 and term2 must appear in the same sentence.
#### term1 /p term2: term1 and term2 must appear in the same paragraph.
##### Display Matching document IDs and Matching snippet text where the terms occur together.

In [43]:
# Part 5: Proximity and snippet utilities

def snippet_from_sent_idx(doc, sent_idx):
    # safe guard
    if 0 <= sent_idx < len(doc['sentences']):
        return doc['sentences'][sent_idx].strip()
    return ""

def snippet_from_para_idx(doc, para_idx):
    if 0 <= para_idx < len(doc['paragraphs']):
        return doc['paragraphs'][para_idx].strip()
    return ""

def build_highlighted_snippet(doc, pos_pair, window_tokens=8, bg_color='#fff59d'):
    """
    Build an HTML snippet around pos_pair (a,b). Returns (highlighted_html, sentence_index, paragraph_index, start_token_pos)
    """
    tokens = doc['tokens']
    a, b = pos_pair
    # ensure a <= b
    a0, b0 = min(a,b), max(a,b)
    start = max(0, a0 - window_tokens)
    end = min(len(tokens), b0 + window_tokens + 1)
    snippet_tokens = tokens[start:end]
    # compute local indices for highlight
    local_a = a0 - start
    local_b = local_a + (b0 - a0) + 1
    # escape HTML chars
    def esc(s):
        return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    pre = ' '.join(esc(t) for t in snippet_tokens[:local_a])
    phrase_text = ' '.join(esc(t) for t in snippet_tokens[local_a:local_b])
    post = ' '.join(esc(t) for t in snippet_tokens[local_b:])
    highlighted = f"{pre} <span style=\"background-color:{bg_color};color:#000;padding:2px 4px;border-radius:3px\">{phrase_text}</span> {post}".strip()
    s_idx = doc.get('pos_to_sent', {}).get(a0)
    p_idx = doc.get('pos_to_para', {}).get(a0)
    return highlighted, s_idx, p_idx, a0

def proximity_within_n(index, term1, term2, n):
    """
    term1 /n term2: return list of (doc_id, pairs_of_positions, snippet)
    Uses two-pointer algorithm for positions lists.
    """
    t1 = term1.lower()
    t2 = term2.lower()
    results = []
    postings1 = index.get(t1, {})
    postings2 = index.get(t2, {})
    common_docs = set(postings1.keys()) & set(postings2.keys())
    for doc_id in common_docs:
        p1 = postings1[doc_id]
        p2 = postings2[doc_id]
        i, j = 0, 0
        pairs = []
        # two-pointer to find all position pairs whose distance <= n
        while i < len(p1) and j < len(p2):
            a, b = p1[i], p2[j]
            if a == b:
                # same position (rare), skip
                if a < b:
                    i += 1
                else:
                    j += 1
                continue
            if abs(a - b) <= n:
                pairs.append((a, b))
                # advance the smaller pointer to seek other combos
                if a < b:
                    i += 1
                else:
                    j += 1
            else:
                if a < b:
                    i += 1
                else:
                    j += 1
        if pairs:
            results.append((doc_id, pairs))
    return results

def proximity_same_sentence(index, docs_by_id, term1, term2):
    """
    term1 /s term2: check if both lemmas exist in the same sentence.
    We'll use pos_to_sent mapping to determine sentence membership.
    Returns list of (doc_id, sent_index, snippet)
    """
    t1 = term1.lower()
    t2 = term2.lower()
    results = []
    postings1 = index.get(t1, {})
    postings2 = index.get(t2, {})
    common_docs = set(postings1.keys()) & set(postings2.keys())
    for doc_id in common_docs:
        doc = docs_by_id[doc_id]
        # build set of sentence indices containing term1 and term2
        sent_idxs_t1 = { doc['pos_to_sent'].get(pos) for pos in postings1[doc_id] }
        sent_idxs_t2 = { doc['pos_to_sent'].get(pos) for pos in postings2[doc_id] }
        # intersection ignoring None
        intersect = {s for s in sent_idxs_t1 if s is not None} & {s for s in sent_idxs_t2 if s is not None}
        for sidx in sorted(intersect):
            # optional: find representative positions within this sentence for highlighting
            # find a pos for term1 and term2 within sentence sidx
            pos1 = next((pos for pos in postings1[doc_id] if doc['pos_to_sent'].get(pos)==sidx), None)
            pos2 = next((pos for pos in postings2[doc_id] if doc['pos_to_sent'].get(pos)==sidx), None)
            # choose a representative pair if both found
            if pos1 is not None and pos2 is not None:
                # pick smaller-first ordering
                pair = (pos1, pos2)
                results.append((doc_id, sidx, pair))
            else:
                # fallback: return sidx without positions
                results.append((doc_id, sidx, None))
    return results

def proximity_same_paragraph(index, docs_by_id, term1, term2):
    """
    term1 /p term2: check if both lemmas exist in the same paragraph.
    """
    t1 = term1.lower()
    t2 = term2.lower()
    results = []
    postings1 = index.get(t1, {})
    postings2 = index.get(t2, {})
    common_docs = set(postings1.keys()) & set(postings2.keys())
    for doc_id in common_docs:
        doc = docs_by_id[doc_id]
        para_idxs_t1 = { doc['pos_to_para'].get(pos) for pos in postings1[doc_id] }
        para_idxs_t2 = { doc['pos_to_para'].get(pos) for pos in postings2[doc_id] }
        intersect = {p for p in para_idxs_t1 if p is not None} & {p for p in para_idxs_t2 if p is not None}
        for pidx in sorted(intersect):
            pos1 = next((pos for pos in postings1[doc_id] if doc['pos_to_para'].get(pos)==pidx), None)
            pos2 = next((pos for pos in postings2[doc_id] if doc['pos_to_para'].get(pos)==pidx), None)
            if pos1 is not None and pos2 is not None:
                results.append((doc_id, pidx, (pos1, pos2)))
            else:
                results.append((doc_id, pidx, None))
    return results

def extract_snippet_from_positions(doc, pos_pair, window_tokens=8):
    # returns a short snippet (tokens) around the positions (plain text)
    tokens = doc['tokens']
    a, b = pos_pair
    start = max(0, min(a, b) - window_tokens)
    end = min(len(tokens), max(a, b) + window_tokens + 1)
    return " ".join(tokens[start:end])

## Implement Phrase Query Support - that:
##### ·        Accepts a query like "right to counsel" or "due process"

##### ·        Returns documents where all terms in the phrase appear in the same order and consecutively

In [44]:
# Part 6: Phrase query using positional index

def phrase_query(index, docs_by_id, phrase, window_tokens=8):
    """
    phrase: string like "right to counsel"
    Steps:
     - tokenize+lemmatize phrase terms (using same pipeline)
     - for candidate docs that contain first term, scan positions to see if consecutive positions match phrase lemmas
    Returns: list of (doc_id, start_pos, sentence_index, paragraph_index, highlighted_snippet_html)
    """
    phrase_tokens = [t for t in TOKEN_RE.findall(phrase.lower()) if t not in STOPWORDS]
    if not phrase_tokens:
        return []
    phrase_lemmas = [lemmatizer.lemmatize(t) for t in phrase_tokens]
    first = phrase_lemmas[0]
    results = []
    postings_first = index.get(first, {})
    L = len(phrase_lemmas)
    for doc_id, positions in postings_first.items():
        doc = docs_by_id[doc_id]
        # We rely on doc['lemmas'] aligned to token positions.
        lemmas = doc['lemmas']
        tokens = doc['tokens']
        for p in positions:
            # check bounds
            if p + L > len(lemmas):
                continue
            ok = True
            for offset in range(L):
                if lemmas[p + offset] != phrase_lemmas[offset]:
                    ok = False
                    break
            if ok:
                # found exact consecutive phrase starting at p
                s_idx = doc['pos_to_sent'].get(p)
                para_idx = doc['pos_to_para'].get(p)
                # build a token-window snippet and highlight the phrase tokens
                start = max(0, p - window_tokens)
                end = min(len(tokens), p + L + window_tokens)
                snippet_tokens = tokens[start:end]
                a = p - start
                b = a + L
                # escape HTML special chars
                def esc(s):
                    return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                pre = ' '.join(esc(t) for t in snippet_tokens[:a])
                phrase_text = ' '.join(esc(t) for t in snippet_tokens[a:b])
                post = ' '.join(esc(t) for t in snippet_tokens[b:])
                highlighted = f"{pre} <span style=\"background-color:#fff59d;color:#000; padding:2px 4px; border-radius:3px\">{phrase_text}</span> {post}".strip()
                results.append((doc_id, p, s_idx, para_idx, highlighted))
                break  # one match per doc is sufficient
    return results

In [None]:
DATA_DIR = Path("./data")  # ensure your 10 files are here

# 1. Collect files
files = sorted([p for p in DATA_DIR.iterdir() if p.is_file()])
print('\nFiles found: {}\n'.format(len(files)))
for i, f in enumerate(files, 1):
    try:
        size = f.stat().st_size
    except Exception:
        size = 0
    print(f"{i:2d}. {f.name}  ({f.suffix})  size={size} bytes")

files_list = files  # keep reference for later cells


Files found: 9

 1. BenchOpinion.md  (.md)  size=5149 bytes
 2. indian_supreme_court_judgments.csv  (.csv)  size=32277049 bytes
 3. Martin_v_Franklin_Capital_Corp.md  (.md)  size=5224 bytes
 4. Sanjay Rajpoot vs Ram Singh on 11 F.txt  (.txt)  size=7429 bytes
 5. SanjayDuttvsTheStateOfHaryana.txt  (.txt)  size=21011 bytes
 6. Vinay_Aggarwal_vs_State_Of_Haryana_on_2_April_2025_1.docx  (.docx)  size=27539 bytes
 7. Vipin_Kumar_vs_Jaydeep_on_21_January_2025_1.docx  (.docx)  size=33532 bytes
 8. Vishal_Shah_vs_Monalisha_Gupta_on_20_February_2025_1.PDF  (.PDF)  size=289474 bytes
 9. Vishnoo_Mittal_vs_M_S_Shakti_Trading_Company_on_17_March_2025_1.PDF  (.PDF)  size=257506 bytes


In [46]:
# 2. Preprocess all files
docs = []
docs_by_id = {}
for i, p in enumerate(files, 1):
    print(f"Preprocessing {i}/{len(files)}: {p.name}")
    doc = preprocess_document(p, doc_id=p.name)
    docs.append(doc)
    docs_by_id[doc['doc_id']] = doc

print('\nPreprocessing finished for all files.')
if docs:
    print_doc_summary(docs[0])

Preprocessing 1/9: BenchOpinion.md
Preprocessing 2/9: indian_supreme_court_judgments.csv
Preprocessing 2/9: indian_supreme_court_judgments.csv
Preprocessing 3/9: Martin_v_Franklin_Capital_Corp.md
Preprocessing 4/9: Sanjay Rajpoot vs Ram Singh on 11 F.txt
Preprocessing 5/9: SanjayDuttvsTheStateOfHaryana.txt
Preprocessing 6/9: Vinay_Aggarwal_vs_State_Of_Haryana_on_2_April_2025_1.docx
Preprocessing 7/9: Vipin_Kumar_vs_Jaydeep_on_21_January_2025_1.docx
Preprocessing 8/9: Vishal_Shah_vs_Monalisha_Gupta_on_20_February_2025_1.PDF
Preprocessing 3/9: Martin_v_Franklin_Capital_Corp.md
Preprocessing 4/9: Sanjay Rajpoot vs Ram Singh on 11 F.txt
Preprocessing 5/9: SanjayDuttvsTheStateOfHaryana.txt
Preprocessing 6/9: Vinay_Aggarwal_vs_State_Of_Haryana_on_2_April_2025_1.docx
Preprocessing 7/9: Vipin_Kumar_vs_Jaydeep_on_21_January_2025_1.docx
Preprocessing 8/9: Vishal_Shah_vs_Monalisha_Gupta_on_20_February_2025_1.PDF
Preprocessing 9/9: Vishnoo_Mittal_vs_M_S_Shakti_Trading_Company_on_17_March_2025_1.PD

In [47]:
print('\nPreprocessing complete. Example document keys and sizes:')
for doc in docs:
    print(f" - {doc['doc_id']}: tokens={len(doc['tokens'])}, sentences={len(doc['sentences'])}, paragraphs={len(doc['paragraphs'])}")

print('\nShow detailed summary for first 2 documents:')
for doc in docs[:2]:
    print_doc_summary(doc, tokens_preview=50)


Preprocessing complete. Example document keys and sizes:
 - BenchOpinion.md: tokens=817, sentences=34, paragraphs=4
 - indian_supreme_court_judgments.csv: tokens=5191404, sentences=164023, paragraphs=1
 - Martin_v_Franklin_Capital_Corp.md: tokens=870, sentences=55, paragraphs=26
 - Sanjay Rajpoot vs Ram Singh on 11 F.txt: tokens=1287, sentences=57, paragraphs=1
 - SanjayDuttvsTheStateOfHaryana.txt: tokens=3525, sentences=129, paragraphs=1
 - Vinay_Aggarwal_vs_State_Of_Haryana_on_2_April_2025_1.docx: tokens=1966, sentences=66, paragraphs=1
 - Vipin_Kumar_vs_Jaydeep_on_21_January_2025_1.docx: tokens=4754, sentences=209, paragraphs=1
 - Vishal_Shah_vs_Monalisha_Gupta_on_20_February_2025_1.PDF: tokens=6866, sentences=424, paragraphs=1
 - Vishnoo_Mittal_vs_M_S_Shakti_Trading_Company_on_17_March_2025_1.PDF: tokens=2505, sentences=77, paragraphs=1

Show detailed summary for first 2 documents:

Document: BenchOpinion.md
-------------------------------------------------------------------------

In [48]:
# 3. Build positional inverted index
index = build_positional_inverted_index(docs)

# 4. Display sorted inverted index (terms in alphabetical order)
sorted_index = sorted_inverted_index_repr(index)
print('\n--- Inverted Index (first 60 terms with document frequency and sample postings) ---')
count = 0
for term, postings in sorted_index.items():
    df = len(postings)
    # show a small sample of postings (up to 3 docs)
    sample_postings = {doc_id: postings[doc_id] for doc_id in list(postings)[:3]}
    print('\n' + '-'*80)
    print(f"Term: '{term}' (df={df})")
    print('Sample postings:')
    pp.pprint(sample_postings)
    count += 1
    if count >= 60:
        break
print('\n' + '-'*80)
print(f"... (vocab size = {len(sorted_index)})\n")

# 5. Example Proximity queries
print('\n=== Example Proximity Queries ===')


--- Inverted Index (first 60 terms with document frequency and sample postings) ---

--------------------------------------------------------------------------------
Term: '0' (df=1)
Sample postings:
{'indian_supreme_court_judgments.csv': [139316,
                                        148309,
                                        329832,
                                        329951,
                                        331472,
                                        331591,
                                        355617,
                                        355619,
                                        355621,
                                        355622,
                                        355623,
                                        370317,
                                        370319,
                                        370321,
                                        370322,
                                        370323,
                               

In [49]:
# Example: counsel /10 ineffective
q1 = ("counsel", "/10", "ineffective")
if re.match(r"/\d+", q1[1]):
    n = int(q1[1][1:])
    res = proximity_within_n(index, q1[0].lower(), q1[2].lower(), n)
    print('\n' + '='*100)
    print(f"Query: {q1[0]} {q1[1]} {q1[2]} -> Matches: {len(res)}")
    print('-'*100)
    if not res:
        print('No matches found for this proximity query.')
    for doc_id, pairs in res:
        doc = docs_by_id[doc_id]
        # show up to 3 pairs per doc
        for pair in pairs[:3]:
            highlighted, sidx, pidx, token_pos = build_highlighted_snippet(doc, pair)
            print(f"Doc: {doc_id} | sentence_index={sidx} | paragraph_index={pidx} | token_start_pos={token_pos} | pair={pair}")
            display(HTML(f"<div style='font-family: monospace; background:#1f1f1f; color:#e6e6e6; padding:10px; border-radius:6px; margin:6px 0;'>{highlighted}</div>"))
            print()
    print('='*100)


Query: counsel /10 ineffective -> Matches: 1
----------------------------------------------------------------------------------------------------
Doc: indian_supreme_court_judgments.csv | sentence_index=67482 | paragraph_index=0 | token_start_pos=2161469 | pair=(2161476, 2161469)





In [50]:
# Example: due /s process
res_s = proximity_same_sentence(index, docs_by_id, "due", "process")
print('\n' + '='*100)
print(f"Query: due /s process -> Matches: {len(res_s)}")
print('-'*100)
if not res_s:
    print('No sentence-level matches found.')
for doc_id, sidx, pair in res_s:
    doc = docs_by_id[doc_id]
    if pair is not None:
        highlighted, s2, p2, token_pos = build_highlighted_snippet(doc, pair)
        # s2 should equal sidx; p2 is paragraph index
        print(f"Doc: {doc_id} | sentence_index={sidx} | paragraph_index={p2} | token_start_pos={token_pos}")
        display(HTML(f"<div style='font-family: monospace; background:#1f1f1f; color:#e6e6e6; padding:10px; border-radius:6px; margin:6px 0;'>{highlighted}</div>"))
    else:
        # fallback: show sentence text and indices
        sent_text = snippet_from_sent_idx(doc, sidx)
        pidx = doc.get('pos_to_para', {}).get(next(iter(index.get('due',{}).get(doc_id,[])), None))
        print(f"Doc: {doc_id} | sentence_index={sidx} | paragraph_index={pidx}")
        print(f"Sentence: {sent_text}\n")
    print()
print('='*100)


Query: due /s process -> Matches: 59
----------------------------------------------------------------------------------------------------
Doc: indian_supreme_court_judgments.csv | sentence_index=1812 | paragraph_index=0 | token_start_pos=44515



Doc: indian_supreme_court_judgments.csv | sentence_index=2938 | paragraph_index=0 | token_start_pos=81251



Doc: indian_supreme_court_judgments.csv | sentence_index=3164 | paragraph_index=0 | token_start_pos=89897



Doc: indian_supreme_court_judgments.csv | sentence_index=15274 | paragraph_index=0 | token_start_pos=430321



Doc: indian_supreme_court_judgments.csv | sentence_index=15276 | paragraph_index=0 | token_start_pos=430537



Doc: indian_supreme_court_judgments.csv | sentence_index=15333 | paragraph_index=0 | token_start_pos=432760



Doc: indian_supreme_court_judgments.csv | sentence_index=15497 | paragraph_index=0 | token_start_pos=438547



Doc: indian_supreme_court_judgments.csv | sentence_index=23991 | paragraph_index=0 | token_start_pos=776718



Doc: indian_supreme_court_judgments.csv | sentence_index=24408 | paragraph_index=0 | token_start_pos=786693



Doc: indian_supreme_court_judgments.csv | sentence_index=27822 | paragraph_index=0 | token_start_pos=881486



Doc: indian_supreme_court_judgments.csv | sentence_index=27914 | paragraph_index=0 | token_start_pos=883341



Doc: indian_supreme_court_judgments.csv | sentence_index=38616 | paragraph_index=0 | token_start_pos=1277096



Doc: indian_supreme_court_judgments.csv | sentence_index=38779 | paragraph_index=0 | token_start_pos=1280440



Doc: indian_supreme_court_judgments.csv | sentence_index=42732 | paragraph_index=0 | token_start_pos=1403075



Doc: indian_supreme_court_judgments.csv | sentence_index=43234 | paragraph_index=0 | token_start_pos=1411798



Doc: indian_supreme_court_judgments.csv | sentence_index=48268 | paragraph_index=0 | token_start_pos=1624649



Doc: indian_supreme_court_judgments.csv | sentence_index=48541 | paragraph_index=0 | token_start_pos=1633133



Doc: indian_supreme_court_judgments.csv | sentence_index=51048 | paragraph_index=0 | token_start_pos=1705388



Doc: indian_supreme_court_judgments.csv | sentence_index=51310 | paragraph_index=0 | token_start_pos=1713738



Doc: indian_supreme_court_judgments.csv | sentence_index=57301 | paragraph_index=0 | token_start_pos=1914588



Doc: indian_supreme_court_judgments.csv | sentence_index=57400 | paragraph_index=0 | token_start_pos=1916813



Doc: indian_supreme_court_judgments.csv | sentence_index=57936 | paragraph_index=0 | token_start_pos=1939353



Doc: indian_supreme_court_judgments.csv | sentence_index=58393 | paragraph_index=0 | token_start_pos=1947763



Doc: indian_supreme_court_judgments.csv | sentence_index=67226 | paragraph_index=0 | token_start_pos=2155727



Doc: indian_supreme_court_judgments.csv | sentence_index=71178 | paragraph_index=0 | token_start_pos=2268033



Doc: indian_supreme_court_judgments.csv | sentence_index=71449 | paragraph_index=0 | token_start_pos=2275292



Doc: indian_supreme_court_judgments.csv | sentence_index=78085 | paragraph_index=0 | token_start_pos=2459521



Doc: indian_supreme_court_judgments.csv | sentence_index=78093 | paragraph_index=0 | token_start_pos=2459726



Doc: indian_supreme_court_judgments.csv | sentence_index=78349 | paragraph_index=0 | token_start_pos=2468333



Doc: indian_supreme_court_judgments.csv | sentence_index=78357 | paragraph_index=0 | token_start_pos=2468538



Doc: indian_supreme_court_judgments.csv | sentence_index=80851 | paragraph_index=0 | token_start_pos=2533748



Doc: indian_supreme_court_judgments.csv | sentence_index=81038 | paragraph_index=0 | token_start_pos=2537506



Doc: indian_supreme_court_judgments.csv | sentence_index=84942 | paragraph_index=0 | token_start_pos=2637574



Doc: indian_supreme_court_judgments.csv | sentence_index=86858 | paragraph_index=0 | token_start_pos=2697071



Doc: indian_supreme_court_judgments.csv | sentence_index=87013 | paragraph_index=0 | token_start_pos=2699965



Doc: indian_supreme_court_judgments.csv | sentence_index=89032 | paragraph_index=0 | token_start_pos=2747910



Doc: indian_supreme_court_judgments.csv | sentence_index=89226 | paragraph_index=0 | token_start_pos=2754325



Doc: indian_supreme_court_judgments.csv | sentence_index=95233 | paragraph_index=0 | token_start_pos=2950331



Doc: indian_supreme_court_judgments.csv | sentence_index=95499 | paragraph_index=0 | token_start_pos=2956551



Doc: indian_supreme_court_judgments.csv | sentence_index=108619 | paragraph_index=0 | token_start_pos=3318279



Doc: indian_supreme_court_judgments.csv | sentence_index=108939 | paragraph_index=0 | token_start_pos=3326126



Doc: indian_supreme_court_judgments.csv | sentence_index=128440 | paragraph_index=0 | token_start_pos=4056002



Doc: indian_supreme_court_judgments.csv | sentence_index=137315 | paragraph_index=0 | token_start_pos=4324546



Doc: indian_supreme_court_judgments.csv | sentence_index=137492 | paragraph_index=0 | token_start_pos=4330316



Doc: indian_supreme_court_judgments.csv | sentence_index=149279 | paragraph_index=0 | token_start_pos=4645312



Doc: indian_supreme_court_judgments.csv | sentence_index=149342 | paragraph_index=0 | token_start_pos=4647759



Doc: indian_supreme_court_judgments.csv | sentence_index=149552 | paragraph_index=0 | token_start_pos=4653762



Doc: indian_supreme_court_judgments.csv | sentence_index=149615 | paragraph_index=0 | token_start_pos=4656209



Doc: indian_supreme_court_judgments.csv | sentence_index=154995 | paragraph_index=0 | token_start_pos=4850189



Doc: indian_supreme_court_judgments.csv | sentence_index=162005 | paragraph_index=0 | token_start_pos=5042021



Doc: indian_supreme_court_judgments.csv | sentence_index=162175 | paragraph_index=0 | token_start_pos=5046660



Doc: indian_supreme_court_judgments.csv | sentence_index=163494 | paragraph_index=0 | token_start_pos=5176793



Doc: indian_supreme_court_judgments.csv | sentence_index=163730 | paragraph_index=0 | token_start_pos=5182540



Doc: indian_supreme_court_judgments.csv | sentence_index=163814 | paragraph_index=0 | token_start_pos=5184602



Doc: indian_supreme_court_judgments.csv | sentence_index=163880 | paragraph_index=0 | token_start_pos=5186982



Doc: indian_supreme_court_judgments.csv | sentence_index=163886 | paragraph_index=0 | token_start_pos=5187206



Doc: indian_supreme_court_judgments.csv | sentence_index=163932 | paragraph_index=0 | token_start_pos=5188387



Doc: indian_supreme_court_judgments.csv | sentence_index=163998 | paragraph_index=0 | token_start_pos=5190767



Doc: indian_supreme_court_judgments.csv | sentence_index=164004 | paragraph_index=0 | token_start_pos=5190991





In [22]:
# Example: child /p visitation
from IPython.display import display, HTML
res_p = proximity_same_paragraph(index, docs_by_id, "child", "visitation")
print('\n' + '='*100)
print(f"Query: child /p visitation -> Matches: {len(res_p)}")
print('-'*100)
if not res_p:
    print('No paragraph-level matches found.')
for doc_id, pidx, pair in res_p:
    doc = docs_by_id[doc_id]
    if pair is not None:
        highlighted, s2, p2, token_pos = build_highlighted_snippet(doc, pair)
        print(f"Doc: {doc_id} | paragraph_index={p2} | sentence_index={s2} | token_start_pos={token_pos} | pair={pair}")
        display(HTML(f"<div style='font-family: monospace; background:#1f1f1f; color:#e6e6e6; padding:10px; border-radius:6px; margin:6px 0;'>{highlighted}</div>"))
    else:
        para_text = snippet_from_para_idx(doc, pidx)
        print(f"Doc: {doc_id} | paragraph_index={pidx}")
        print(f"Paragraph: {para_text[:400].strip()}\n")
    print()
print('='*100)


Query: child /p visitation -> Matches: 1
----------------------------------------------------------------------------------------------------
Doc: indian_supreme_court_judgments.csv | paragraph_index=0 | sentence_index=2370 | token_start_pos=62149 | pair=(62149, 779756)
Doc: indian_supreme_court_judgments.csv | paragraph_index=0 | sentence_index=2370 | token_start_pos=62149 | pair=(62149, 779756)





In [52]:
# 6. Example phrase queries
from IPython.display import display, HTML
print('\n=== Example Phrase Queries ===')
phrases = ["right to counsel", "due process"]
for ph in phrases:
    res_ph = phrase_query(index, docs_by_id, ph)
    print('\n' + '='*100)
    print(f"Phrase: \"{ph}\" -> Matches: {len(res_ph)}")
    print('-'*100)
    if not res_ph:
        print('No matches found for this phrase.')
    for doc_id, start_pos, sent_idx, para_idx, highlighted in res_ph:
        print(f"Doc: {doc_id} | sentence_index={sent_idx} | paragraph_index={para_idx} | token_start_pos={start_pos}")
        # display highlighted snippet as HTML for better readability in notebook
        display(HTML(f"<div style='font-family: monospace; background:#1f1f1f; color:#e6e6e6; padding:10px; border-radius:6px; margin:6px 0;'>{highlighted}</div>"))
        print()
    print('='*100)


=== Example Phrase Queries ===

Phrase: "right to counsel" -> Matches: 0
----------------------------------------------------------------------------------------------------
No matches found for this phrase.

Phrase: "due process" -> Matches: 1
----------------------------------------------------------------------------------------------------
Doc: indian_supreme_court_judgments.csv | sentence_index=23991 | paragraph_index=0 | token_start_pos=776718



