In [19]:
from __future__ import annotations
import json, random, os, hashlib, re
from pathlib import Path
from typing import Any, Dict, List, Tuple
from collections import Counter, defaultdict


In [20]:
# CHANGE THIS to your actual per-page section directory.
# NOTE(review): this is a machine-specific absolute path. .expanduser() only
# rewrites a leading "~", so it has no effect on the path as written here.
SECTIONS_DIR = Path("/data/sundeep/Fandom_SI/data/interim/sections_parsed_money-heist_by_page").expanduser()

# Every *.jsonl file directly inside SECTIONS_DIR, sorted so the list order is
# deterministic (presumably one file per wiki page -- confirm with the producer).
page_files = sorted(SECTIONS_DIR.glob("*.jsonl"))
# Cell output: how many files were found, plus the first three paths.
len(page_files), page_files[:3]


(267,
 [PosixPath('/data/sundeep/Fandom_SI/data/interim/sections_parsed_money-heist_by_page/1026.jsonl'),
  PosixPath('/data/sundeep/Fandom_SI/data/interim/sections_parsed_money-heist_by_page/1031.jsonl'),
  PosixPath('/data/sundeep/Fandom_SI/data/interim/sections_parsed_money-heist_by_page/1053.jsonl')])

In [21]:
def load_page_sections_jsonl(page_jsonl: Path) -> List[Dict[str, Any]]:
    """Read a JSON Lines file and return its decoded records in file order.

    Every line is stripped of surrounding whitespace. Lines that are empty
    after stripping are ignored; each remaining line is parsed with
    ``json.loads``, so a malformed line raises ``json.JSONDecodeError``.

    Args:
        page_jsonl: Path of the ``.jsonl`` file to read (opened as UTF-8).

    Returns:
        One decoded object per non-blank line.
    """
    with open(page_jsonl, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]

In [22]:
def _concat_sections_with_spans(
    section_rows: List[Dict[str, Any]],
    link_types_include: List[str] | None,
    section_sep: str,
    validate_anchor_text: bool,
    count_trailing_sep: bool,
) -> Tuple[str, List[Dict[str, Any]], Dict[str, Any]]:
    """Join section texts into one page text and shift link offsets to match.

    Shared implementation behind ``build_original_like`` and
    ``build_fixed_cursor``; the two differ only in ``count_trailing_sep``.

    Args:
        section_rows: Section records; each may carry ``"text"`` and ``"links"``.
        link_types_include: If not ``None``, links whose ``"link_type"`` is not
            in this list are skipped (they still count towards ``total_links``).
        section_sep: String placed between consecutive section texts.
        validate_anchor_text: When true, count links whose ``"anchor_text"``
            differs from the text at their offsets. Such links are still kept.
        count_trailing_sep: When true, the running cursor is advanced by
            ``len(section_sep)`` after every section, including the last one.
            When false, only between sections.

    Returns:
        ``(page_text, spans, stats)`` where each span is a dict with page-level
        ``"start"``/``"end"`` and a ``"type"`` (``"link"`` when the link has no
        type), and ``stats`` holds the counters gathered along the way.
    """
    texts: List[str] = []
    spans: List[Dict[str, Any]] = []

    total_links = kept_links = bad_offsets = bad_anchor_mismatch = 0
    cursor = 0  # page-level offset at which the current section starts
    last_index = len(section_rows) - 1

    for i, rec in enumerate(section_rows):
        sec_text = rec.get("text") or ""
        texts.append(sec_text)
        sec_base = cursor

        for link in rec.get("links") or []:
            total_links += 1
            lt = link.get("link_type")

            if link_types_include is not None and lt not in link_types_include:
                continue

            s, e = link.get("start"), link.get("end")
            if s is None or e is None:
                bad_offsets += 1
                continue

            try:
                s, e = int(s), int(e)
            except (TypeError, ValueError, OverflowError):
                # Offsets that cannot be converted to int are counted, not raised.
                bad_offsets += 1
                continue

            # Offsets are section-relative and must select a non-empty slice.
            if s < 0 or e <= s or e > len(sec_text):
                bad_offsets += 1
                continue

            if validate_anchor_text:
                anchor = link.get("anchor_text") or ""
                if anchor and sec_text[s:e] != anchor:
                    bad_anchor_mismatch += 1

            kept_links += 1
            spans.append({"start": sec_base + s, "end": sec_base + e, "type": lt or "link"})

        cursor += len(sec_text)
        if count_trailing_sep or i != last_index:
            cursor += len(section_sep)

    page_text = section_sep.join(texts)

    # Final safety net: keep only spans that lie inside the joined text.
    clipped = [sp for sp in spans if 0 <= sp["start"] < sp["end"] <= len(page_text)]
    stats = dict(
        total_links=total_links,
        kept_links=kept_links,
        bad_offsets=bad_offsets,
        bad_anchor_mismatch=bad_anchor_mismatch,
        num_spans=len(clipped),
        num_sections=len(section_rows),
        text_len=len(page_text),
    )
    return page_text, clipped, stats


def build_original_like(
    section_rows: List[Dict[str, Any]],
    link_types_include: List[str] | None,
    section_sep: str = "\n\n",
    validate_anchor_text: bool = True,
) -> Tuple[str, List[Dict[str, Any]], Dict[str, Any]]:
    """Build page text and spans using the "original" cursor arithmetic.

    The cursor is advanced by ``len(section_sep)`` after every section,
    including the last. The cursor is only read at the start of a section, so
    that trailing increment is never observed: the returned text, spans and
    stats are the same as those of ``build_fixed_cursor`` for any input.

    Args:
        section_rows: Section records with optional ``"text"`` and ``"links"``.
        link_types_include: Optional allow-list of ``"link_type"`` values.
        section_sep: Separator inserted between section texts.
        validate_anchor_text: Count (but keep) links whose anchor text does not
            match the text at their offsets.

    Returns:
        ``(page_text, spans, stats)``.
    """
    return _concat_sections_with_spans(
        section_rows,
        link_types_include,
        section_sep,
        validate_anchor_text,
        count_trailing_sep=True,
    )


def build_fixed_cursor(
    section_rows: List[Dict[str, Any]],
    link_types_include: List[str] | None,
    section_sep: str = "\n\n",
    validate_anchor_text: bool = True,
) -> Tuple[str, List[Dict[str, Any]], Dict[str, Any]]:
    """Build page text and spans, advancing the cursor only between sections.

    Args:
        section_rows: Section records with optional ``"text"`` and ``"links"``.
        link_types_include: Optional allow-list of ``"link_type"`` values.
        section_sep: Separator inserted between section texts.
        validate_anchor_text: Count (but keep) links whose anchor text does not
            match the text at their offsets.

    Returns:
        ``(page_text, spans, stats)``.
    """
    return _concat_sections_with_spans(
        section_rows,
        link_types_include,
        section_sep,
        validate_anchor_text,
        count_trailing_sep=False,
    )


In [23]:
def compare_original_vs_fixed(page_file: Path, section_sep="\n\n", link_types_include=None, validate_anchor_text=True):
    """Run both span builders on one page file and summarise their stats.

    Returns ``None`` when the file yields no records. Otherwise returns a dict
    with the page file name, counters taken from the "original-like" run, the
    span count of the "fixed cursor" run, and for each run the number of kept
    links that were dropped by the final bounds filter (``*_clipped``).
    """
    rows = load_page_sections_jsonl(page_file)
    if not rows:
        return None

    builder_args = (rows, link_types_include, section_sep, validate_anchor_text)
    _, _, orig_stats = build_original_like(*builder_args)
    _, _, fixed_stats = build_fixed_cursor(*builder_args)

    # If cursor drift exists, original tends to clip spans more
    report = {
        "page": page_file.name,
        "sections": orig_stats["num_sections"],
        "text_len": orig_stats["text_len"],
        "kept_links": orig_stats["kept_links"],
        "orig_num_spans": orig_stats["num_spans"],
        "fixed_num_spans": fixed_stats["num_spans"],
    }
    report["orig_clipped"] = orig_stats["kept_links"] - orig_stats["num_spans"]
    report["fixed_clipped"] = fixed_stats["kept_links"] - fixed_stats["num_spans"]
    report["anchor_mismatch"] = orig_stats["bad_anchor_mismatch"]
    report["bad_offsets"] = orig_stats["bad_offsets"]
    return report

# Compare both builders on a random sample of at most 20 pages. No seed is set
# in the visible cells, so the sample may differ from run to run.
sample = random.sample(page_files, min(20, len(page_files)))
rows = [compare_original_vs_fixed(p) for p in sample]
# compare_original_vs_fixed returns None for files without records; drop those.
rows = [r for r in rows if r is not None]
# Cell output: the first five comparison rows and how many pages were compared.
rows[:5], len(rows)


([{'page': '2610.jsonl',
   'sections': 3,
   'text_len': 547,
   'kept_links': 12,
   'orig_num_spans': 12,
   'fixed_num_spans': 12,
   'orig_clipped': 0,
   'fixed_clipped': 0,
   'anchor_mismatch': 0,
   'bad_offsets': 0},
  {'page': '1453.jsonl',
   'sections': 1,
   'text_len': 183,
   'kept_links': 2,
   'orig_num_spans': 2,
   'fixed_num_spans': 2,
   'orig_clipped': 0,
   'fixed_clipped': 0,
   'anchor_mismatch': 0,
   'bad_offsets': 0},
  {'page': '2045.jsonl',
   'sections': 5,
   'text_len': 1020,
   'kept_links': 25,
   'orig_num_spans': 25,
   'fixed_num_spans': 25,
   'orig_clipped': 0,
   'fixed_clipped': 0,
   'anchor_mismatch': 0,
   'bad_offsets': 0},
  {'page': '2599.jsonl',
   'sections': 3,
   'text_len': 769,
   'kept_links': 22,
   'orig_num_spans': 22,
   'fixed_num_spans': 22,
   'orig_clipped': 0,
   'fixed_clipped': 0,
   'anchor_mismatch': 0,
   'bad_offsets': 0},
  {'page': '2609.jsonl',
   'sections': 3,
   'text_len': 530,
   'kept_links': 12,
   'orig_n

In [24]:
if rows:
    # How many pages lose spans under the original arithmetic vs. how many tie.
    worse = len([r for r in rows if r["orig_num_spans"] < r["fixed_num_spans"]])
    equal = len([r for r in rows if r["orig_num_spans"] == r["fixed_num_spans"]])
    print("Pages where original < fixed (likely cursor/separator drift):", worse)
    print("Pages where equal:", equal)
    print("Max orig_clipped:", max(r["orig_clipped"] for r in rows))
    print("Max fixed_clipped:", max(r["fixed_clipped"] for r in rows))

    # Largest (orig_clipped - fixed_clipped) first; ties keep their input order.
    sorted_rows = sorted(rows, key=lambda r: r["fixed_clipped"] - r["orig_clipped"])
    print("\nTop suspicious pages:")
    for r in sorted_rows[:10]:
        print(r["page"], "orig_clipped=", r["orig_clipped"], "fixed_clipped=", r["fixed_clipped"], "kept_links=", r["kept_links"])
else:
    print("No pages loaded.")


Pages where original < fixed (likely cursor/separator drift): 0
Pages where equal: 20
Max orig_clipped: 0
Max fixed_clipped: 0

Top suspicious pages:
2610.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 12
1453.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 2
2045.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 25
2599.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 22
2609.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 12
2448.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 14
1725.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 14
1914.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 12
384.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 52
1736.jsonl orig_clipped= 0 fixed_clipped= 0 kept_links= 14


In [25]:
def integrity_checks(page_text: str, spans: List[Dict[str, Any]]) -> Counter:
    """Count structural problems in ``spans`` relative to ``page_text``.

    Args:
        page_text: The text the span offsets refer to.
        spans: Dicts with ``"start"`` and ``"end"`` keys.

    Returns:
        A ``Counter`` keyed by issue name. A span with non-``int`` offsets only
        counts as ``"non_int_offsets"``; otherwise it may contribute to several
        of ``"negative_offsets"``, ``"non_positive_length"``,
        ``"end_out_of_bounds"``, ``"start_out_of_bounds"`` and
        ``"all_whitespace_span"``.
    """
    issues = Counter()
    text_len = len(page_text)

    for sp in spans:
        s, e = sp["start"], sp["end"]
        if not isinstance(s, int) or not isinstance(e, int):
            issues["non_int_offsets"] += 1
            continue
        if s < 0 or e < 0:
            issues["negative_offsets"] += 1
        if e <= s:
            issues["non_positive_length"] += 1
        if e > text_len:
            issues["end_out_of_bounds"] += 1
        if s >= text_len:
            issues["start_out_of_bounds"] += 1
        # An in-bounds span with s < e always slices to a non-empty string, so
        # only the whitespace check is meaningful here (an "empty substring"
        # check could never fire).
        if 0 <= s < e <= text_len and page_text[s:e].isspace():
            issues["all_whitespace_span"] += 1

    return issues

# Run on a few pages using FIXED cursor (recommended ground truth).
# The sample is at most 10 pages and is drawn without a seed in the visible cells.
sample2 = random.sample(page_files, min(10, len(page_files)))
for p in sample2:
    rows_ = load_page_sections_jsonl(p)
    t, sp, st = build_fixed_cursor(rows_, link_types_include=None, section_sep="\n\n", validate_anchor_text=True)
    issues = integrity_checks(t, sp)
    # One line per page; an empty dict means no check fired for any span.
    print(p.name, "spans=", len(sp), "issues=", dict(issues))


1737.jsonl spans= 25 issues= {}
2566.jsonl spans= 19 issues= {}
348.jsonl spans= 70 issues= {}
349.jsonl spans= 66 issues= {}
1699.jsonl spans= 12 issues= {}
840.jsonl spans= 31 issues= {}
473.jsonl spans= 14 issues= {}
493.jsonl spans= 30 issues= {}
2451.jsonl spans= 17 issues= {}
417.jsonl spans= 102 issues= {}


In [28]:
from collections import Counter
import random

# Repeat the original-vs-fixed comparison on a larger sample (at most 200 pages).
# NOTE: this rebinds the notebook-level names `sample`, `rows` and `worse`.
sample = random.sample(page_files, min(200, len(page_files)))
rows = [compare_original_vs_fixed(p) for p in sample]
# Pages whose file has no records come back as None and are dropped.
rows = [r for r in rows if r is not None]

# Here `worse` is the list of offending rows (not a count).
worse = [r for r in rows if r["orig_num_spans"] < r["fixed_num_spans"]]
print("Sample size:", len(rows))
print("Pages where original < fixed:", len(worse))
if worse:
    print("Example suspicious page:", worse[0])


Sample size: 200
Pages where original < fixed: 0


In [29]:
import random

def spotcheck_span_substrings(page_file, k=20, section_sep="\n\n"):
    """Print a random selection of one page's spans with the text they cover.

    The page is rebuilt with ``build_fixed_cursor`` (all link types). If it has
    no spans, a single "NO SPANS" line is printed. Otherwise a header line is
    printed followed by up to ``k`` randomly chosen spans, one per line, as
    ``type start end => repr(substring)``.
    """
    sections = load_page_sections_jsonl(page_file)
    page_text, page_spans, _ = build_fixed_cursor(sections, None, section_sep, validate_anchor_text=True)

    if len(page_spans) == 0:
        print(page_file.name, "NO SPANS")
        return

    chosen = random.sample(page_spans, min(k, len(page_spans)))
    print("Page:", page_file.name, "| spans:", len(page_spans), "| text_len:", len(page_text))
    for span in chosen:
        lo, hi = span["start"], span["end"]
        print(span["type"], lo, hi, "=>", repr(page_text[lo:hi]))

# Eyeball up to 20 spans on one randomly chosen page (a different page each run).
spotcheck_span_substrings(random.choice(page_files), k=20)


Page: 2313.jsonl | spans: 17 | text_len: 524
internal 368 375 => 'Tatiana'
internal 95 115 => 'Your Place in Heaven'
internal 327 333 => 'Berlin'
internal 471 480 => 'Marseille'
internal 504 524 => 'Your Place in Heaven'
internal 495 502 => 'Tatiana'
internal 488 494 => 'Rafael'
internal 304 324 => 'Your Place in Heaven'
internal 254 260 => 'Rafael'
internal 229 235 => 'Berlin'
internal 240 247 => 'Tatiana'
internal 481 487 => 'Bogotá'
internal 336 345 => 'Marseille'
internal 116 136 => 'Your Place in Heaven'
internal 348 354 => 'Bogotá'
internal 464 470 => 'Berlin'
internal 357 363 => 'Rafael'


In [26]:
def find_anchor_mismatches(page_file: Path, limit=10, section_sep="\n\n"):
    """Print links whose ``anchor_text`` differs from the text at their offsets.

    Offsets are checked against each section's own ``"text"``. Links with no
    anchor text, missing or non-integer offsets, or offsets outside the section
    text are ignored rather than reported.

    Args:
        page_file: Per-page JSONL file to inspect.
        limit: Maximum number of mismatches to collect and print. Values
            ``<= 0`` collect nothing.
        section_sep: Unused; kept so existing callers that pass it keep working.

    Returns:
        ``None``. Nothing is printed when the file has no records.
    """
    rows = load_page_sections_jsonl(page_file)
    if not rows:
        return

    def _iter_mismatches():
        # Yield (anchor_text, actual_substring, link_type) for each mismatch.
        for rec in rows:
            sec_text = rec.get("text") or ""
            for link in rec.get("links") or []:
                s, e = link.get("start"), link.get("end")
                anchor = link.get("anchor_text") or ""
                if not anchor or s is None or e is None:
                    continue
                try:
                    s, e = int(s), int(e)
                except (TypeError, ValueError, OverflowError):
                    # Unconvertible offsets are skipped. Unlike a bare except,
                    # this lets KeyboardInterrupt/SystemExit propagate.
                    continue
                if not (0 <= s < e <= len(sec_text)):
                    continue
                found = sec_text[s:e]
                if found != anchor:
                    yield anchor, found, link.get("link_type")

    mismatches = []
    if limit > 0:
        for item in _iter_mismatches():
            mismatches.append(item)
            if len(mismatches) >= limit:
                break

    print("Page:", page_file.name, "mismatches_shown:", len(mismatches))
    for a, sub, lt in mismatches:
        print("type=", lt, "\n  anchor_text:", repr(a), "\n  substring  :", repr(sub), "\n")

# Try a random page, or replace `p` with a page name taken from the
# "Top suspicious pages" listing printed by the comparison summary cell.
p = random.choice(page_files)
find_anchor_mismatches(p, limit=10)


Page: 2608.jsonl mismatches_shown: 0


In [27]:
def split_bucket_builtin_hash(name: str) -> str:
    """Assign ``name`` to "train", "dev" or "test" via Python's built-in ``hash``.

    ``abs(hash(name)) % 100`` gives a bucket in 0-99: 0-79 is "train",
    80-89 is "dev", 90-99 is "test". Built-in string hashes can change between
    processes unless PYTHONHASHSEED is fixed, so this split is only stable
    within a single process.
    """
    bucket = abs(hash(name)) % 100
    return "train" if bucket < 80 else ("dev" if bucket < 90 else "test")

def split_bucket_stable_md5(name: str) -> str:
    """Assign ``name`` to "train", "dev" or "test" from its MD5 digest.

    The UTF-8 bytes of ``name`` are hashed with MD5 and the digest, read as an
    integer, is reduced modulo 100: 0-79 is "train", 80-89 is "dev", 90-99 is
    "test". The result depends only on ``name``.
    """
    digest_hex = hashlib.md5(name.encode("utf-8")).hexdigest()
    bucket = int(digest_hex, 16) % 100
    for upper_bound, label in ((80, "train"), (90, "dev")):
        if bucket < upper_bound:
            return label
    return "test"

# Bucket the same random sample of page stems (at most 200) with both hash
# functions and compare the resulting split sizes.
keys = [p.stem for p in random.sample(page_files, min(200, len(page_files)))]
builtin = Counter(split_bucket_builtin_hash(k) for k in keys)
stable = Counter(split_bucket_stable_md5(k) for k in keys)

print("Builtin hash split distribution:", builtin)
print("Stable md5 split distribution  :", stable)
print("\nNOTE: builtin hash can change between processes unless PYTHONHASHSEED is fixed.")


Builtin hash split distribution: Counter({'train': 162, 'test': 20, 'dev': 18})
Stable md5 split distribution  : Counter({'train': 147, 'test': 30, 'dev': 23})

NOTE: builtin hash can change between processes unless PYTHONHASHSEED is fixed.


In [30]:
import random, re
from collections import Counter

def is_suspicious_substring(
    s: str,
    section_sep: str = "\n\n",
    max_len: int = 200,
    min_alnum_ratio: float = 0.4,
) -> bool:
    """Heuristically flag a span substring that looks misaligned but in-bounds.

    Args:
        s: Text covered by a span.
        section_sep: Section separator; a substring containing it is flagged.
            An empty separator disables this check.
        max_len: Substrings longer than this many characters are flagged.
        min_alnum_ratio: Substrings whose share of alphanumeric characters is
            below this value are flagged.

    Returns:
        ``True`` if ``s`` is empty or all whitespace, contains ``section_sep``,
        exceeds ``max_len``, starts or ends with whitespace, or falls below
        ``min_alnum_ratio``; ``False`` otherwise.
    """
    if not s or s.isspace():
        return True
    # Section separators leaking into a span.
    if section_sep and section_sep in s:
        return True
    # An extremely long anchor is suspicious.
    if len(s) > max_len:
        return True
    if s[0].isspace() or s[-1].isspace():
        return True
    # Punctuation-heavy text; `s` is non-empty here so the division is safe.
    alnum_count = sum(ch.isalnum() for ch in s)
    return alnum_count / len(s) < min_alnum_ratio

def check_pages(page_files, n_pages=300, section_sep="\n\n", max_examples=20):
    """Sample pages, rebuild their spans and tally invariant/heuristic failures.

    Args:
        page_files: Sequence of per-page JSONL paths to sample from.
        n_pages: Maximum number of pages to sample.
        section_sep: Separator used to join sections. A span that contains it
            is counted as suspicious, in addition to whatever
            ``is_suspicious_substring`` flags on its own.
        max_examples: Maximum number of suspicious examples to collect.

    Returns:
        ``(stats, suspicious_examples)``. ``stats`` is a ``Counter`` with the
        keys ``"non_int"``, ``"out_of_bounds"``, ``"suspicious_substring"`` and
        ``"pages_with_clipping"`` (only those that occurred).
        ``suspicious_examples`` is a list of
        ``(page_file_name, span, substring)`` tuples.
    """
    stats = Counter()
    suspicious_examples = []

    sample = random.sample(page_files, min(n_pages, len(page_files)))
    for p in sample:
        rows = load_page_sections_jsonl(p)
        text, spans, st = build_fixed_cursor(rows, None, section_sep, validate_anchor_text=True)

        # Core invariants.
        for sp in spans:
            s, e = sp["start"], sp["end"]
            if not (isinstance(s, int) and isinstance(e, int)):
                stats["non_int"] += 1
                continue
            if not (0 <= s < e <= len(text)):
                stats["out_of_bounds"] += 1
                continue

            frag = text[s:e]
            # Check the separator actually used to build `text`, so a
            # non-default `section_sep` is honoured. Guard against "" because
            # the empty string is contained in every string.
            leaks_separator = bool(section_sep) and section_sep in frag
            if leaks_separator or is_suspicious_substring(frag):
                stats["suspicious_substring"] += 1
                if len(suspicious_examples) < max_examples:
                    suspicious_examples.append((p.name, sp, frag))

        # Page-level check: some kept links did not survive the bounds filter.
        if st["kept_links"] and st["num_spans"] < st["kept_links"]:
            stats["pages_with_clipping"] += 1

    return stats, suspicious_examples

# Run the checks on up to 300 randomly sampled pages and print the tallies.
stats, examples = check_pages(page_files, n_pages=300)
print("Stats:", dict(stats))
print("\nSuspicious examples (up to 20):")
# Each example is (page file name, span dict, covered substring).
for ex in examples:
    print(ex[0], ex[1], repr(ex[2]))


Stats: {'suspicious_substring': 624}

Suspicious examples (up to 20):
321.jsonl {'start': 2441, 'end': 2446, 'type': 'other'} '[ 1 ]'
321.jsonl {'start': 2751, 'end': 2756, 'type': 'other'} '[ 1 ]'
321.jsonl {'start': 2802, 'end': 2807, 'type': 'other'} '[ 1 ]'
2449.jsonl {'start': 1578, 'end': 1583, 'type': 'other'} '[ 1 ]'
2449.jsonl {'start': 1968, 'end': 1969, 'type': 'other'} '↑'
86.jsonl {'start': 220, 'end': 225, 'type': 'other'} '[ 1 ]'
86.jsonl {'start': 1023, 'end': 1028, 'type': 'other'} '[ 2 ]'
86.jsonl {'start': 1055, 'end': 1060, 'type': 'other'} '[ 3 ]'
86.jsonl {'start': 2307, 'end': 2312, 'type': 'other'} '[ 4 ]'
86.jsonl {'start': 8979, 'end': 8980, 'type': 'other'} '↑'
86.jsonl {'start': 9014, 'end': 9015, 'type': 'other'} '↑'
86.jsonl {'start': 9031, 'end': 9032, 'type': 'other'} '↑'
86.jsonl {'start': 9048, 'end': 9049, 'type': 'other'} '↑'
707.jsonl {'start': 368, 'end': 373, 'type': 'other'} '[ 1 ]'
707.jsonl {'start': 405, 'end': 410, 'type': 'other'} '[ 2 ]'
70