In [1]:
"""
create_dataset_mahabharata.py

This script only creates the dataset needed for contrastive training.
It reads Purva files (JSONL with verse_id and text), builds multi-scale
sliding windows, and outputs two files:
  - passages.jsonl : each line is a JSON with passage metadata and text
  - train_pairs.jsonl : each line is a JSON {"anchor":..., "positive":..., "meta":...}

Usage:
  python create_dataset_mahabharata.py --purva_dir ./purvas --out_dir ./dataset_out \
        --window_sizes 16 32 48 --overlaps 8 16 24 --max_pairs 50000

Requirements:
  pip install tqdm

"""

import argparse
import json
import os
from pathlib import Path
from tqdm import tqdm
import random


def load_purvas(purva_dir):
    purvas = {}
    for f in sorted(Path(purva_dir).glob("*.jsonl")):
        purva_name = f.stem
        verses = []
        with f.open(encoding="utf8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    v = json.loads(line)
                except Exception:
                    parts = line.split("	", 1)
                    v = {"verse_id": parts[0].strip(), "text": parts[1].strip() if len(parts) > 1 else ""}
                verses.append(v)
        purvas[purva_name] = verses
    return purvas


def build_windows(verses, window_size, overlap):
    stride = max(1, window_size - overlap)
    windows = []
    n = len(verses)
    i = 0
    while i < n:
        start = i
        end = min(i + window_size, n)
        block = verses[start:end]
        text = "".join([f"{v['verse_id']} {v['text']}" for v in block])
        windows.append({
            "start_idx": start,
            "end_idx": end - 1,
            "start_verse": block[0]["verse_id"],
            "end_verse": block[-1]["verse_id"],
            "text": text,
            "verse_count": len(block)
        })
        if end == n:
            break
        i += stride
    return windows


def create_positive_pairs_for_purva(windows_by_size, max_shifts=2):
    pairs = []
    sizes = sorted(windows_by_size.keys())
    for W in sizes:
        windows = windows_by_size[W]
        for i in range(len(windows)):
            a = windows[i]["text"]
            # adjacent
            if i + 1 < len(windows):
                pairs.append((a, windows[i + 1]["text"], {"size": W, "type": "adjacent", "idx": i}))
            # shifts
            for s in range(1, max_shifts + 1):
                if i + s + 1 < len(windows):
                    pairs.append((a, windows[i + s + 1]["text"], {"size": W, "type": f"shift{s+1}", "idx": i}))
    # multi-scale: pair smaller windows to larger windows that overlap
    for i, W in enumerate(sizes):
        for L in sizes[i+1:]:
            small_ws = windows_by_size[W]
            large_ws = windows_by_size[L]
            # naive overlap: map by proportional index
            for idx_s, sw in enumerate(small_ws):
                j = min(len(large_ws)-1, int(idx_s * (len(large_ws) / max(1, len(small_ws)))))
                pairs.append((sw["text"], large_ws[j]["text"], {"size": W, "type": "multiscale", "larger": L, "idx": idx_s}))
    return pairs


def build_dataset(purvas, window_sizes, overlaps, max_pairs=None, shuffle=True):
    all_passages = []
    train_pairs = []
    purva_meta = {}

    for purva_name, verses in purvas.items():
        windows_by_size = {}
        for W, O in zip(window_sizes, overlaps):
            windows_by_size[W] = build_windows(verses, W, O)
        # flatten windows into all_passages and keep mapping
        start_idx = len(all_passages)
        for W in window_sizes:
            for w in windows_by_size[W]:
                w_meta = dict(w)
                w_meta.update({"purva": purva_name, "window_size": W})
                all_passages.append(w_meta)
        end_idx = len(all_passages) - 1
        purva_meta[purva_name] = {"start_idx": start_idx, "end_idx": end_idx, "counts": {W: len(windows_by_size[W]) for W in window_sizes}}

        # create positive pairs within this purva
        pairs = create_positive_pairs_for_purva(windows_by_size)
        # attach purva name in meta
        for a, b, m in pairs:
            m = dict(m)
            m.update({"purva": purva_name})
            train_pairs.append({"anchor": a, "positive": b, "meta": m})

    if shuffle:
        random.shuffle(train_pairs)
    if max_pairs and max_pairs < len(train_pairs):
        train_pairs = train_pairs[:max_pairs]
    return all_passages, train_pairs, purva_meta


def save_jsonl(items, out_path):
    with open(out_path, 'w', encoding='utf8') as fh:
        for it in items:
            fh.write(json.dumps(it, ensure_ascii=False) + '')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--purva_dir', type=str, default='./purvas', help='Directory of purva jsonl files')
    parser.add_argument('--out_dir', type=str, default='./dataset_out', help='Where to save passages.jsonl and train_pairs.jsonl')
    parser.add_argument('--window_sizes', type=int, nargs='+', default=[16,32,48])
    parser.add_argument('--overlaps', type=int, nargs='+', default=[8,16,24])
    parser.add_argument('--max_pairs', type=int, default=None)
    parser.add_argument('--seed', type=int, default=42)
    args = parser.parse_args()

    random.seed(args.seed)
    os.makedirs(args.out_dir, exist_ok=True)

    purvas = load_purvas(args.purva_dir)
    print(f"Loaded purvas: {list(purvas.keys())}")

    passages, train_pairs, purva_meta = build_dataset(purvas, args.window_sizes, args.overlaps, max_pairs=args.max_pairs)

    print(f"Built {len(passages)} passages and {len(train_pairs)} train pairs")

    # Save passages (with metadata) and train pairs
    save_jsonl(passages, os.path.join(args.out_dir, 'passages.jsonl'))
    save_jsonl(train_pairs, os.path.join(args.out_dir, 'train_pairs.jsonl'))

    # Save purva meta
    with open(os.path.join(args.out_dir, 'purva_meta.json'), 'w', encoding='utf8') as fh:
        json.dump(purva_meta, fh, ensure_ascii=False, indent=2)

    print('Saved dataset files in', args.out_dir)


[1;34musage: [0m[1;35mipykernel_launcher.py[0m [[32m-h[0m] [[36m--purva_dir [33mPURVA_DIR[0m] [[36m--out_dir [33mOUT_DIR[0m]
                             [[36m--window_sizes [33mWINDOW_SIZES [WINDOW_SIZES ...][0m]
                             [[36m--overlaps [33mOVERLAPS [OVERLAPS ...][0m]
                             [[36m--max_pairs [33mMAX_PAIRS[0m] [[36m--seed [33mSEED[0m]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/roopeshmangal/Library/Jupyter/runtime/kernel-v39c55b277988392caeabe0b098721424ff50f1676.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


### generate positve , anchor,nagtive

In [None]:
# %%
"""
Simple dataset builder for contrastive training (no ids â€” raw texts only).

Outputs (out_dir):
 - passages.jsonl       : all passage windows (with text + minimal meta)
 - train_pairs.jsonl    : each line {"anchor":..., "positive":..., "negatives":[...], "meta":...}
 - preview_pairs.jsonl  : first 20 pairs (same format) for inspection

Usage:
 - Put your chapter dictionary into the variable `chapters_dict` below (chapter_id -> list of verses).
 - Run all cells.
"""
# %% imports
import json
import os
import random
from pathlib import Path
from typing import List, Dict, Any

# %% utils
def ensure_dir(d: str):
    Path(d).mkdir(parents=True, exist_ok=True)

def save_jsonl(items: List[Dict[str, Any]], path: str):
    with open(path, 'w', encoding='utf8') as fh:
        for it in items:
            fh.write(json.dumps(it, ensure_ascii=False) + '\n')

# %% normalize input chapters
def normalize_chapters_from_dict(chapters_raw: Dict[str, List[Any]]) -> Dict[str, List[Dict[str,str]]]:
    """
    Accepts chapter_id -> list of verses (strings or dicts).
    Returns chapter_id -> list of {"verse_id","text"}.
    """
    chapters = {}
    for ch_id, verses in chapters_raw.items():
        parsed = []
        for i, v in enumerate(verses):
            if isinstance(v, str):
                verse_id = f"{ch_id}.{i+1:03d}"
                parsed.append({"verse_id": verse_id, "text": v})
            elif isinstance(v, dict):
                vid = v.get("verse_id") or f"{ch_id}.{i+1:03d}"
                txt = v.get("text") or v.get("verse") or ""
                parsed.append({"verse_id": vid, "text": txt})
            else:
                raise ValueError(f"Unsupported verse format in chapter {ch_id}: {v}")
        chapters[ch_id] = parsed
    return chapters

# %% build windows (within chapter)
def build_windows_for_chapter(verses: List[Dict[str, str]], window_size: int, overlap: int) -> List[Dict[str, Any]]:
    stride = max(1, window_size - overlap)
    windows = []
    n = len(verses)
    i = 0
    while i < n:
        start = i
        end = min(i + window_size, n)
        block = verses[start:end]
        text = "\n".join([f"{v['verse_id']} {v['text']}" for v in block])
        windows.append({
            "chapter": None,          # filled later
            "start_idx": start,
            "end_idx": end - 1,
            "start_verse": block[0]["verse_id"],
            "end_verse": block[-1]["verse_id"],
            "text": text,
            "verse_count": len(block)
        })
        if end == n:
            break
        i += stride
    return windows

# %% create pairs (anchor/positive by text)
def create_positive_pairs_text(windows_by_size: Dict[int, List[Dict[str, Any]]], max_shifts: int = 2) -> List[Dict[str, Any]]:
    pairs = []
    sizes = sorted(windows_by_size.keys())
    for W in sizes:
        windows = windows_by_size[W]
        for i in range(len(windows)):
            a_text = windows[i]["text"]
            # adjacent
            if i + 1 < len(windows):
                pairs.append({"anchor": a_text, "positive": windows[i + 1]["text"], "meta": {"size": W, "type": "adjacent", "idx": i}})
            # shifts
            for s in range(1, max_shifts + 1):
                if i + s + 1 < len(windows):
                    pairs.append({"anchor": a_text, "positive": windows[i + s + 1]["text"], "meta": {"size": W, "type": f"shift{s+1}", "idx": i}})
    # multi-scale: small -> larger overlapping (by text)
    for i, W in enumerate(sizes):
        for L in sizes[i+1:]:
            small_ws = windows_by_size[W]
            large_ws = windows_by_size[L]
            if not small_ws or not large_ws:
                continue
            for idx_s, sw in enumerate(small_ws):
                j = min(len(large_ws)-1, int(idx_s * (len(large_ws) / max(1, len(small_ws)))))
                pairs.append({"anchor": sw["text"], "positive": large_ws[j]["text"], "meta": {"size": W, "type": "multiscale", "larger": L, "idx": idx_s}})
    return pairs

# %% main builder (simplified: stores full texts)
def build_simple_dataset(
    chapters_raw: Dict[str, List[Any]],
    window_sizes: List[int] = [16, 32, 40],
    overlaps: List[int] = [8, 16, 20],
    max_pairs: int = None,
    neg_per_anchor: int = 2,
    neg_strategy: str = 'other_chapter',   # 'other_chapter' or 'global_random'
    seed: int = 42
):
    """
    Returns (passages, train_pairs_text)
    - passages: list of passages (with chapter + text)
    - train_pairs_text: list of dicts {'anchor': text, 'positive': text, 'negatives': [texts], 'meta': ...}
    """
    random.seed(seed)
    chapters = normalize_chapters_from_dict(chapters_raw)
    passages = []
    chapter_to_passages = {}

    # build windows per chapter and register passage texts
    for ch_id, verses in chapters.items():
        windows_by_size = {}
        for W, O in zip(window_sizes, overlaps):
            windows = build_windows_for_chapter(verses, W, O)
            # tag chapter on windows
            for w in windows:
                w["chapter"] = ch_id
            windows_by_size[W] = windows

        # flatten and collect passage texts
        chapter_pass_texts = []
        for W in window_sizes:
            for w in windows_by_size[W]:
                passages.append({"chapter": ch_id, "window_size": W, "start_idx": w["start_idx"], "end_idx": w["end_idx"], "verse_count": w["verse_count"], "text": w["text"]})
                chapter_pass_texts.append(w["text"])
        chapter_to_passages[ch_id] = chapter_pass_texts

    # create positive pairs (within-chapter)
    train_pairs = []
    for ch_id, verses in chapters.items():
        # rebuild windows_by_size for this chapter only to create positive pairs (ensures adjacency inside chapter)
        windows_by_size = {}
        for W, O in zip(window_sizes, overlaps):
            windows = build_windows_for_chapter(chapters[ch_id], W, O)
            windows_by_size[W] = windows
        pairs = create_positive_pairs_text(windows_by_size)
        # attach chapter meta (optional)
        for p in pairs:
            p["meta"]["chapter"] = ch_id
        train_pairs.extend(pairs)

    # shuffle & cap
    random.shuffle(train_pairs)
    if max_pairs and max_pairs < len(train_pairs):
        train_pairs = train_pairs[:max_pairs]

    # prepare negative sampling pool (texts)
    all_texts = [p["text"] for p in passages]
    # chapter -> texts already in chapter_to_passages

    def sample_neg_texts(anchor_ch, exclude_texts, n, strategy):
        pool = []
        if strategy == 'other_chapter':
            other_ch = [c for c in chapter_to_passages.keys() if c != anchor_ch]
            for c in other_ch:
                pool.extend(chapter_to_passages[c])
        else:
            pool = all_texts
        # exclude anchor/positive
        candidates = [t for t in pool if t not in exclude_texts]
        if not candidates:
            return []
        sampled = []
        tries = 0
        while len(sampled) < n and tries < n * 10:
            choice = random.choice(candidates)
            if choice not in sampled:
                sampled.append(choice)
            tries += 1
        return sampled

    # attach negatives (as full texts) to each pair
    train_pairs_text = []
    for tp in train_pairs:
        anchor_text = tp["anchor"]
        positive_text = tp["positive"]
        anchor_ch = tp["meta"].get("chapter")  # chapter where pair came from
        negs = sample_neg_texts(anchor_ch, [anchor_text, positive_text], neg_per_anchor, neg_strategy)
        train_pairs_text.append({"anchor": anchor_text, "positive": positive_text, "negatives": negs, "meta": tp["meta"]})

    return passages, train_pairs_text

# %% ===== USER: provide your chapters_dict here (replace the mock) =====
# Replace this small mock with your real in-memory dict of 1700 chapters.
chapters_dict = {
    "chap_001": [
        "Vyasa said: In the assembly the king asked for recitation of the lineage.",
        "Sanjaya described the kings of the Bharatas and their deeds.",
        "The sages listened and noted the names of heroic warriors.",
        "Yudhisthira was praised for his righteousness.",
        "Arjuna swore to fight for dharma."
    ],
    "chap_002": [
        "Duryodhana plotted in secret, gathering his allies by the river.",
        "Karna pledged his support and promised weapons and chariots.",
        "A messenger brought news of an alliance to Hastinapura.",
        "Draupadi learned of the plot and stood in quiet resolve."
    ],
    # ... replace with your full chapters dictionary ...
}

# %% Build the simple dataset (change params if you want)
out_dir = "./dataset_out_simple"
ensure_dir(out_dir)

window_sizes = [16, 32, 40]   # recommended: 40 max (fits your ~33 avg)
overlaps = [8, 16, 20]
max_pairs = 50000
neg_per_anchor = 2
neg_strategy = 'other_chapter'
seed = 42

passages, train_pairs_text = build_simple_dataset(
    chapters,
    window_sizes=window_sizes,
    overlaps=overlaps,
    max_pairs=max_pairs,
    neg_per_anchor=neg_per_anchor,
    neg_strategy=neg_strategy,
    seed=seed
)

print(f"Built {len(passages)} passages and {len(train_pairs_text)} pairs (each with {neg_per_anchor} negatives)")

# %% Save simple outputs (full texts)
save_jsonl(passages, os.path.join(out_dir, "passages.jsonl"))
save_jsonl(train_pairs_text, os.path.join(out_dir, "train_pairs.jsonl"))

# preview (first 20)
preview = train_pairs_text[:20]
save_jsonl(preview, os.path.join(out_dir, "preview_pairs.jsonl"))

print("Saved files to", out_dir)

# %% print compact preview
for i, p in enumerate(preview):
    print(f"--- PAIR {i+1} ---")
    print("Anchor (excerpt):", p['anchor'][:300].replace('\n',' | '))
    print("Positive (excerpt):", p['positive'][:300].replace('\n',' | '))
    print("Negatives count:", len(p['negatives']))
    print()
