In [3]:
# Run this in Jupyter or as a .py script
import re, json, os
from pathlib import Path

# Paths (adjust if needed)
data_dir = Path("./dataset_out_simple")   # change to your dataset folder
train_in = data_dir / "train_pairs.jsonl"
passages_in = data_dir / "passages.jsonl"
train_out = data_dir / "train_pairs_clean.jsonl"
passages_out = data_dir / "passages_clean.jsonl"

# Safety checks
if not train_in.exists():
    raise FileNotFoundError(f"{train_in} not found. Adjust path to your train_pairs.jsonl")

# Regex strategy:
# 1) Remove verse markers at start of line like: "1.001 " or "1:001 " or ".033 "
#    pattern_start handles ^ or after newline
# 2) Remove inline markers like " .033 " or ".033" or " 1.001 "
#    pattern_inline finds markers preceded and/or followed by whitespace or punctuation
# 3) Collapse multiple spaces, fix space before punctuation.

# Compile regexes
# verse-like: optional leading dot, 1-3 digits, optional separator (.|:) and optional 1-3 digits
pattern_start = re.compile(r'(?m)^\s*(?:\.\d{1,3}|\d{1,3}[.:]\d{1,3}|\d{1,3})\s*') 
# inline: preceded by whitespace or start, followed by whitespace or punctuation
pattern_inline = re.compile(r'(?<=\s)(?:\.\d{1,3}|\d{1,3}[.:]\d{1,3}|\d{1,3})(?=(?:\s|[,\.\;\:\?\!]))')
# also catch dot-number directly after punctuation or at sentence boundaries like ".033" not preceded by whitespace
pattern_dotnum = re.compile(r'(?<=\.)\d{1,3}(?=(?:\s|[,\.\;\:\?\!]))')  # handles ".033 " -> remove "033"
# fallback: any isolated token that is only small number tokens separated by dots (e.g., ".049")
pattern_general = re.compile(r'(?<!\S)(?:\.\d{1,3}|\d{1,3}[.:]\d{1,3}|\d{1,3})(?!\S)')

def clean_text(s: str) -> str:
    if not s:
        return s
    # 1) normalize newlines to single space to avoid start-of-line quirks (we'll still handle ^ lines)
    # but we want to keep sentence separation, so keep newlines but perform start-of-line removal first
    # remove markers at start of lines
    s = pattern_start.sub("", s)
    # remove inline markers that are followed by punctuation/space
    s = pattern_inline.sub("", s)
    # remove digits that follow a dot like ".033" -> remove digits (dot remains)
    s = pattern_dotnum.sub("", s)
    # remove any isolated verse-token leftover
    s = pattern_general.sub("", s)
    # remove sequences like " .  " or ".  " leftover
    s = re.sub(r'\s*\.\s*', '. ', s)   # normalize stray dots spacing
    # collapse multiple spaces and normalize spaces before punctuation
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'\s+([,.;:?!])', r'\1', s)  # remove space before punctuation
    # fix " .," odd combos
    s = re.sub(r'\.([A-Za-z])', r'. \1', s)  # ensure period + letter has space
    return s

def process_jsonl(in_path: Path, out_path: Path, fields_to_clean):
    count = 0
    with in_path.open('r', encoding='utf8') as fin, out_path.open('w', encoding='utf8') as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            for fld in fields_to_clean:
                if fld in obj and obj[fld]:
                    if isinstance(obj[fld], str):
                        obj[fld] = clean_text(obj[fld])
                    elif isinstance(obj[fld], list):
                        obj[fld] = [clean_text(x) if isinstance(x, str) else x for x in obj[fld]]
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            count += 1
    return count

# Clean train_pairs: fields anchor, positive, negatives (list)
print("Cleaning train pairs...")
n_pairs = process_jsonl(train_in, train_out, fields_to_clean=["anchor","positive","negatives"])
print(f"Saved cleaned train pairs -> {train_out} ({n_pairs} lines)")

# Clean passages if present
if passages_in.exists():
    print("Cleaning passages...")
    # passages.jsonl probably has {"text": "...", ...}
    def process_passages(in_path, out_path):
        c = 0
        with in_path.open('r', encoding='utf8') as fin, out_path.open('w', encoding='utf8') as fout:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                if 'text' in obj and obj['text']:
                    obj['text'] = clean_text(obj['text'])
                fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
                c += 1
        return c
    n_pass = process_passages(passages_in, passages_out)
    print(f"Saved cleaned passages -> {passages_out} ({n_pass} lines)")
else:
    print("No passages.jsonl found; skipping passages clean.")

# Quick verification: print first 3 cleaned pairs
print("\nPreview cleaned pairs (first 3):")
with open(train_out, 'r', encoding='utf8') as fh:
    for i, line in enumerate(fh):
        if i >= 3: break
        d = json.loads(line)
        print("--- PAIR", i+1)
        print("ANCHOR:", d.get("anchor","")[:250].replace("\n"," | "))
        print("POS:", d.get("positive","")[:250].replace("\n"," | "))
        print("NEG_COUNT:", len(d.get("negatives",[])))
        print()


Cleaning train pairs...
Saved cleaned train pairs -> dataset_out_simple/train_pairs_clean.jsonl (29489 lines)
Cleaning passages...
Saved cleaned passages -> dataset_out_simple/passages_clean.jsonl (11829 lines)

Preview cleaned pairs (first 3):
--- PAIR 1
ANCHOR: . Being greatly pierced by (the arrows of the) god of love and being excited by him, she withdrew her mind from any other object; and with her imagination highly inflamed, she mentally sported with him (Arjuna) on a wide and excellent bed laid over w
POS: . Being greatly pierced by (the arrows of the) god of love and being excited by him, she withdrew her mind from any other object; and with her imagination highly inflamed, she mentally sported with him (Arjuna) on a wide and excellent bed laid over w
NEG_COUNT: 2

--- PAIR 2
ANCHOR: . All those foremost of Kshatriyas with Vasudeva (Krishna) at their head sat around Dharmaraja Yudhishthira. Saluting that chief of the Kurus, Keshava (Krishna) spoke thus in sorrow. Krishna said:

In [4]:
d

{'anchor': '. Here the greatly effulgent lord of spirits (Shiva), the eternal lord of all creatures lived after creating all the worlds and here he was worshipped by all the sprites. Here Nara and Narayana, Brahma, Yama and the fifth Sthanu performed sacrifices after the expiration of one thousand Yugas. Here for the establishment of virtue and religion Vasudeva with with pious devotion performed his sacrifices with extended for many long years. Where Keshava placed thousands and tens of thousands of sacrificial stakes adorned with garlands of gold and innumerable (sacrificial) altars of great splendour. O descendant of Bharata, going there he (Maya) brought the club, the conch shall and the various articles of crystal which belonged to Vrishaparva, (the Danava king). Going there thereat Asura, Maya, took all the great wealth which was guarded by the Takshas and Rakshasas and Kinkaras. Bringing them away, the Asura (Maya) built with them a matchless assembly-hall. It was celebrated thr