## **04_ Generating SFT Dataset**

#### **Setup**

In [19]:
import sys, os, json
from pathlib import Path
import requests

CWD = Path.cwd().resolve()
ROOT = CWD if (CWD / "src").exists() else CWD.parent
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

DATA = ROOT / "data"
CHUNKS = DATA / "chunks" / "chunks.jsonl"
SFT_DIR = DATA / "sft"

print("ROOT:", ROOT)
print("CHUNKS exists?", CHUNKS.exists())
print("SFT_DIR:", SFT_DIR)

ROOT: D:\IIT BBS\Job Resources\Business Optima\pdf-agent
CHUNKS exists? True
SFT_DIR: D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft


In [2]:
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://127.0.0.1:11434")
print("OLLAMA_HOST:", OLLAMA_HOST)

def list_models():
    r = requests.get(f"{OLLAMA_HOST.rstrip('/')}/api/tags", timeout=10)
    r.raise_for_status()
    return [m.get("name") for m in r.json().get("models", [])]

print("Installed (ollama /api/tags):", list_models())

OLLAMA_HOST: http://127.0.0.1:11434
Installed (ollama /api/tags): ['llama3.1:8b-instruct-q8_0', 'mistral:instruct', 'llama3:instruct', 'llama3.2:latest']


In [3]:
from src.ingest.make_qa_and_summaries import make_data

MODEL = os.environ.get("OLLAMA_MODEL", "llama3.1:8b-instruct-q8_0")

summaries_path, qa_path = make_data(
    chunks_path=CHUNKS,
    out_dir=SFT_DIR,
    model=MODEL,
    seed=7,
    max_sections=40,            # started with 4 for dev
    min_tokens_per_section=600,
    qa_per_section=3,
    max_chars_summary=4000,    # BIGGER window → fewer map calls per long section
    max_chars_qa=1400,         # tighter QA context → faster & cleaner JSON
    timeout_s=900,             # generous
)
summaries_path, qa_path

[1/40] Section: Copyright Law United States Copyri > (E) Musical works database .- | chars=84013 | pages=(98, 134)
[2/40] Section: Copyright Law United States Copyri > Chapter 1 · Notes | chars=76008 | pages=(166, 185)
[3/40] Section: Copyright Law United States Copyri > § 114 · Scope of exclusive rights in sound  | chars=56474 | pages=(67, 88)
[4/40] Section: Copyright Law United States Copyri > § 1506 · Conduct of proceedings | chars=35880 | pages=(9, 378)
[5/40] Section: Copyright Law United States Copyri > § 111 · Limitations on exclusive rights: Se | chars=35626 | pages=(9, 62)
[6/40] Section: Copyright Law United States Copyri > § 512 · Limitations on liability relating t | chars=31291 | pages=(227, 236)
[7/40] Section: Copyright Law United States Copyri > § 1401 · Unauthorized use of pre-1972 sound | chars=29416 | pages=(36, 355)
[8/40] Section: Copyright Law United States Copyri > Statutory Enactments Contained in Title 17  | chars=24276 | pages=(10, 17)
[9/40] Section: Copyrig

(WindowsPath('D:/IIT BBS/Job Resources/Business Optima/pdf-agent/data/sft/summaries.jsonl'),
 WindowsPath('D:/IIT BBS/Job Resources/Business Optima/pdf-agent/data/sft/qa.jsonl'))

In [5]:
import json, itertools

print("Summaries sample:")
for line in itertools.islice(open(summaries_path, "r", encoding="utf-8"), 2):
    print(json.loads(line))

print("\nQ/A sample:")
for line in itertools.islice(open(qa_path, "r", encoding="utf-8"), 3):
    print(json.loads(line))


Summaries sample:
{'section': 'Copyright Law United States Copyri > (E) Musical works database .-', 'pages': [98, 134], 'short': '{ "short": "The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings. The collective must also provide identifying information for each sound recording and report usage data for musical works used', 'medium': '{ "short": "The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings. The collective must also provide identifying information for each sound recording and report usage data for musical works used under the blanket license.", "medium": "The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners,

In [6]:
from src.train.sft_build import build_sft

train_p, dev_p, test_p = build_sft(
    summaries_path=summaries_path,
    qa_path=qa_path,
    out_dir=SFT_DIR,
    train_frac=0.85, dev_frac=0.075, test_frac=0.075
)
train_p, dev_p, test_p

[OK] SFT examples: total=160 | train=136 dev=12 test=12


(WindowsPath('D:/IIT BBS/Job Resources/Business Optima/pdf-agent/data/sft/train.jsonl'),
 WindowsPath('D:/IIT BBS/Job Resources/Business Optima/pdf-agent/data/sft/dev.jsonl'),
 WindowsPath('D:/IIT BBS/Job Resources/Business Optima/pdf-agent/data/sft/test.jsonl'))

In [7]:
import json

def count_jsonl(p): 
    return sum(1 for _ in open(p, "r", encoding="utf-8"))

print("Counts:",
      "train", count_jsonl(train_p),
      "dev", count_jsonl(dev_p),
      "test", count_jsonl(test_p))

ex = json.loads(next(open(train_p, "r", encoding="utf-8")))
ex

Counts: train 136 dev 12 test 12


{'messages': [{'role': 'system',
   'content': 'You are a helpful legal assistant. Be precise and cite pages if known.'},
  {'role': 'user',
   'content': 'Summarize the section:\nHeading: Copyright Law United States Copyri > (E) Musical works database .-\nSummarize as 6–8 concise bullet points. End with [pp. 98–134].'}],
 'response': '{ "short": "The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings. The collective must also provide identifying information for each sound recording and report usage data for musical works used under the blanket license.", "medium": "The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners, and sound recordings. This database must be publicly accessible and include matched and unmatched works. The collective must

In [8]:
# Very quick guard: ensure every example has system+user, response
bad = 0
for p in [train_p, dev_p, test_p]:
    for line in open(p, "r", encoding="utf-8"):
        ex = json.loads(line)
        msgs = ex.get("messages", [])
        if not msgs or not isinstance(msgs, list) or not ex.get("response"):
            bad += 1
print("Malformed examples:", bad)

Malformed examples: 0


In [11]:
import json, re
from pathlib import Path

def _strip_fences(s: str) -> str:
    s = s.strip()
    # remove code fences if present
    s = re.sub(r"^```(?:json)?\s*", "", s)
    s = re.sub(r"\s*```$", "", s)
    return s.strip()

def _try_json(s: str):
    s = _strip_fences(s)
    try:
        return json.loads(s)
    except Exception:
        return None

def _extract_key_list(raw: str, key: str):
    """
    Extract a JSON-like list for 'key' even if braces aren't balanced.
    Returns a list[str] or None.
    """
    raw = _strip_fences(raw)
    # 1) full JSON fallback
    obj = _try_json(raw)
    if isinstance(obj, dict) and key in obj:
        val = obj[key]
        if isinstance(val, list):
            return [str(x).strip() for x in val if str(x).strip()]
        if isinstance(val, str) and val.strip():
            return [val.strip()]
    # 2) find [...] after "key":
    m = re.search(rf'"{re.escape(key)}"\s*:\s*\[(.*?)\]', raw, re.S)
    if m:
        block = m.group(1)
        # grab quoted strings inside
        items = re.findall(r'"([^"]+)"', block)
        if items:
            return [i.strip() for i in items if i.strip()]
        # or dash-prefixed lines
        dashes = [ln.strip("- ").strip() for ln in block.splitlines() if ln.strip().startswith("-")]
        if dashes:
            return [x for x in dashes if x]
    return None

def _extract_key_str(raw: str, key: str):
    """
    Extract a string value for 'key' even if braces aren't balanced.
    Returns str or "".
    """
    raw = _strip_fences(raw)
    # 1) full JSON fallback
    obj = _try_json(raw)
    if isinstance(obj, dict) and key in obj:
        val = obj[key]
        if isinstance(val, str):
            return val.strip()
        if isinstance(val, list):
            return "\n".join(f"- {str(x).strip()}" for x in val if str(x).strip())
    # 2) "key": "...."
    m = re.search(rf'"{re.escape(key)}"\s*:\s*"([^"]*)', raw, re.S)  # stops at next quote; ok if truncated
    if m:
        return m.group(1).strip()
    # 3) key lines like: key: value
    m = re.search(rf'\b{re.escape(key)}\b\s*[:=]\s*(.+)', raw)
    if m:
        return m.group(1).strip()
    return ""

def _bullets_from_list(lst):
    if not lst: return ""
    return "\n".join(f"- {x}" for x in lst if str(x).strip())

def sanitize_summaries(path_in: Path, path_out: Path):
    out = []
    kept = 0
    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        short_raw  = row.get("short",  "")
        med_raw    = row.get("medium","")
        long_raw   = row.get("long",  "")

        # try robust extraction for each field
        short = _extract_key_str(short_raw, "short") or str(short_raw).strip()
        med_list = _extract_key_list(med_raw, "medium")
        if med_list is None:
            med = _extract_key_str(med_raw, "medium") or str(med_raw).strip()
        else:
            med = _bullets_from_list(med_list)

        long_list = _extract_key_list(long_raw, "long")
        if long_list is None:
            long = _extract_key_str(long_raw, "long") or str(long_raw).strip()
        else:
            long = _bullets_from_list(long_list)

        # final cleanup: if any field still looks like a JSON object header, strip braces/keys crudely
        def _dejson(s: str) -> str:
            s = s.strip()
            s = re.sub(r'^\{.*?"\w+"\s*:\s*', "", s)  # remove leading {"short":
            s = s.replace("}", "").replace("{", "")
            return s.strip()

        for k,v in (("short", short), ("medium", med), ("long", long)):
            if v.startswith("{") or '"short"' in v or '"medium"' in v or '"long"' in v:
                v = _dejson(v)
            # collapse accidental newlines of commas
            v = re.sub(r'\s+,', ',', v)
            # trim overly long trailing fragments of unfinished JSON
            v = re.sub(r',\s*"$', '', v)
            if k == "short": short = v
            elif k == "medium": med = v
            else: long = v

        row["short"]  = short
        row["medium"] = med
        row["long"]   = long
        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] summaries cleaned → {path_out} | rows={kept}")

def sanitize_qa(path_in: Path, path_out: Path, min_len: int = 10):
    out = []
    kept = 0; dropped = 0
    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        ans_raw = row.get("answer", "")
        ans_raw = _strip_fences(ans_raw)

        # 1) full JSON?
        obj = _try_json(ans_raw)
        answer = ""
        if isinstance(obj, dict) and "a" in obj:
            answer = str(obj["a"]).strip()

        # 2) '"a": ".....' regex (works even if truncated)
        if not answer:
            m = re.search(r'"a"\s*:\s*"([^"]*)', ans_raw, re.S)
            if m:
                answer = m.group(1).strip()

        # 3) last resort: if it looks like JSON, drop keys and keep tail
        if not answer and ans_raw.startswith("{"):
            answer = re.sub(r'"\w+"\s*:\s*', '', ans_raw)
            answer = answer.replace("{","").replace("}","").strip()

        # tiny garbage guard
        if len(answer) < min_len:
            dropped += 1
            continue

        row["answer"] = answer
        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] qa cleaned → {path_out} | kept={kept} dropped={dropped}")

In [12]:
# run
S = SFT_DIR / "summaries.jsonl"
Q = SFT_DIR / "qa.jsonl"
sanitize_summaries(S, SFT_DIR / "summaries.clean.jsonl")
sanitize_qa(Q, SFT_DIR / "qa.clean.jsonl")


[OK] summaries cleaned → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\summaries.clean.jsonl | rows=40
[OK] qa cleaned → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\qa.clean.jsonl | kept=38 dropped=2


In [13]:
import json, re, itertools
S_CLEAN = SFT_DIR / "summaries.clean.jsonl"
Q_CLEAN = SFT_DIR / "qa.clean.jsonl"

def _peek(p, n=2):
    for line in itertools.islice(open(p, "r", encoding="utf-8"), n):
        print(json.loads(line))

print("Summaries sample:")
_peek(S_CLEAN, 2)

print("\nQA sample:")
_peek(Q_CLEAN, 3)

# optional: drop low-quality rows (e.g., answers still look JSON-ish)
def filter_bad_jsonish(src: Path, dst: Path):
    out = []
    for line in open(src, "r", encoding="utf-8"):
        row = json.loads(line)
        txts = [row.get("short",""), row.get("medium",""), row.get("long","")]
        if any(isinstance(t, str) and ('{"' in t or '"short"' in t or '"medium"' in t or '"long"' in t) for t in txts):
            continue
        out.append(row)
    with open(dst, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] filtered summaries → {dst} | kept={len(out)}")

# Example (only if you still see JSON-ish leakage):
# filter_bad_jsonish(S_CLEAN, SFT_DIR / "summaries.clean.filtered.jsonl")

Summaries sample:
{'section': 'Copyright Law United States Copyri > (E) Musical works database .-', 'pages': [98, 134], 'short': 'The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings. The collective must also provide identifying information for each sound recording and report usage data for musical works used', 'medium': 'The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners, and sound recordings. This database must be publicly accessible and include matched and unmatched works. The collective must also provide identifying information for each sound recording and report usage data for musical works used under the blanket license. Additionally, the collective must safeguard confidentiality and security of sensitive data used to', 'long': '"T

In [14]:
# --- v2 cleaners: stronger regex fallbacks & shape enforcement ---

import json, re
from pathlib import Path

def _strip_fences(s: str) -> str:
    s = s.strip()
    s = re.sub(r"^```(?:json)?\s*", "", s)
    s = re.sub(r"\s*```$", "", s)
    return s.strip()

def _try_json(s: str):
    s = _strip_fences(s)
    try:
        return json.loads(s)
    except Exception:
        return None

def _quote_trim(s: str) -> str:
    s = s.strip()
    if s.startswith('"') and s.endswith('"'):
        s = s[1:-1]
    return s.strip()

def _dejsonish(s: str) -> str:
    # remove obvious JSON keys/braces left in the string
    s = s.replace("{", "").replace("}", "")
    s = re.sub(r'"\w+"\s*:\s*', "", s)
    return s.strip()

def _extract_list_from_any(raw: str, key: str | None = None):
    """
    Try very hard to extract a list of strings from 'raw'.
    If 'key' is provided, prefer that key from a dict.
    Returns list[str] or None.
    """
    txt = _strip_fences(raw)

    # 1) full JSON
    obj = _try_json(txt)
    if isinstance(obj, dict) and key and key in obj:
        val = obj[key]
        if isinstance(val, list):
            return [str(x).strip() for x in val if str(x).strip()]
        if isinstance(val, str) and val.strip():
            return [val.strip()]
    if isinstance(obj, list):
        return [str(x).strip() for x in obj if str(x).strip()]

    # 2) find [...], extract quoted strings
    m = re.search(r"\[(.+?)\]", txt, re.S)
    if m:
        block = m.group(1)
        items = re.findall(r'"([^"]+)"', block)  # "..."
        if items:
            return [i.strip() for i in items if i.strip()]
        # dash bullets inside brackets
        dashes = [ln.strip("- ").strip() for ln in block.splitlines() if ln.strip().startswith("-")]
        if dashes:
            return [x for x in dashes if x]

    # 3) naked dash bullets
    lines = [ln.strip() for ln in txt.splitlines()]
    bullets = [re.sub(r"^[-•]\s*", "", ln) for ln in lines if re.match(r"^[-•]\s+", ln)]
    if bullets:
        return [b for b in bullets if b]

    return None

_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')

def _as_bullets_from_sentences(s: str, max_items=12):
    s = _dejsonish(_quote_trim(_strip_fences(s)))
    sents = _SENT_SPLIT.split(s)
    sents = [x.strip() for x in sents if x.strip()]
    sents = sents[:max_items]
    return [x if x.endswith(('.', '!', '?')) else x for x in sents]

def sanitize_summaries_v2(path_in: Path, path_out: Path, min_len_short=40, min_len_medium=80):
    out = []
    kept = 0
    dropped = 0

    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)

        raw_short  = str(row.get("short",""))
        raw_medium = str(row.get("medium",""))
        raw_long   = str(row.get("long",""))

        # short: try key->string, else first item from any list, else dejsonish
        short = ""
        obj_s = _try_json(raw_short)
        if isinstance(obj_s, dict) and "short" in obj_s and isinstance(obj_s["short"], str):
            short = obj_s["short"].strip()
        if not short:
            lst = _extract_list_from_any(raw_short, key="short")
            if lst:
                short = lst[0]
        if not short:
            short = _dejsonish(_quote_trim(raw_short))

        # medium: prefer list; else string; else sentence bullets
        med_list = _extract_list_from_any(raw_medium, key="medium")
        if med_list:
            medium = "\n".join(f"- {x}" for x in med_list if x)
        else:
            medium = _dejsonish(_quote_trim(raw_medium))
            if ('{"' in medium) or (len(medium) < min_len_medium):
                # convert to 6–8 bullets from sentences
                bullets = _as_bullets_from_sentences(medium, max_items=8)
                medium = "\n".join(f"- {b}" for b in bullets)

        # long: prefer list; else convert to 12–15 bullets from sentences
        long_list = _extract_list_from_any(raw_long, key="long")
        if long_list:
            long = "\n".join(f"- {x}" for x in long_list if x)
        else:
            bullets = _as_bullets_from_sentences(raw_long, max_items=15)
            long = "\n".join(f"- {b}" for b in bullets)

        # quick quality checks
        if len(short) < min_len_short:
            dropped += 1
            continue

        row["short"]  = short.strip()
        row["medium"] = medium.strip()
        row["long"]   = long.strip()
        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] summaries v2 → {path_out} | kept={kept} dropped={dropped}")

def sanitize_qa_v2(path_in: Path, path_out: Path, min_len=50, must_end_punct=True):
    out = []
    kept = 0
    dropped = 0

    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        ans_raw = str(row.get("answer",""))
        ans_raw = _strip_fences(ans_raw)

        # try JSON {'q': '...', 'a': '...'}
        obj = _try_json(ans_raw)
        answer = ""
        if isinstance(obj, dict) and "a" in obj:
            answer = str(obj["a"]).strip()

        # regex '"a": "..."' even if truncated
        if not answer:
            m = re.search(r'"a"\s*:\s*"([^"]+)"', ans_raw, re.S)
            if m:
                answer = m.group(1).strip()

        # strip JSONish wrappers if still nothing
        if not answer:
            answer = _dejsonish(ans_raw)

        # guards
        if len(answer) < min_len:
            dropped += 1
            continue
        if must_end_punct and answer[-1:] not in ".!?]”\"'":
            # permit bracketed citations to pass
            answer += "."

        row["answer"] = answer
        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] qa v2 → {path_out} | kept={kept} dropped={dropped}")


In [15]:
S = SFT_DIR / "summaries.clean.jsonl"
Q = SFT_DIR / "qa.clean.jsonl"

sanitize_summaries_v2(S, SFT_DIR / "summaries.v2.jsonl")
sanitize_qa_v2(Q, SFT_DIR / "qa.v2.jsonl")

# take a quick look
import itertools
print("Summaries v2 sample:")
for ln in itertools.islice(open(SFT_DIR / "summaries.v2.jsonl", "r", encoding="utf-8"), 2):
    print(json.loads(ln))
print("\nQA v2 sample:")
for ln in itertools.islice(open(SFT_DIR / "qa.v2.jsonl", "r", encoding="utf-8"), 3):
    print(json.loads(ln))


[OK] summaries v2 → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\summaries.v2.jsonl | kept=40 dropped=0
[OK] qa v2 → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\qa.v2.jsonl | kept=36 dropped=2
Summaries v2 sample:
{'section': 'Copyright Law United States Copyri > (E) Musical works database .-', 'pages': [98, 134], 'short': 'The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings. The collective must also provide identifying information for each sound recording and report usage data for musical works used', 'medium': 'The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners, and sound recordings. This database must be publicly accessible and include matched and unmatched works. The collective must also provide identi

In [16]:
# --- v3 polish: clip to sentence boundaries, re-bulletize, tidy punctuation ---
import json, re
from pathlib import Path

_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')

def _end_at_sentence(s: str) -> str:
    s = s.strip().replace("\u2014", "-")
    # cut at the last sentence boundary if trailing looks truncated
    last = max(s.rfind("."), s.rfind("?"), s.rfind("!"))
    if last != -1 and last >= len(s) * 0.5:  # only if we have at least half a sentence
        return s[:last+1].strip()
    # otherwise ensure it ends with punctuation
    return s if s.endswith(('.', '!', '?', ']', '"', "’", "”")) else (s + ".")

def _clean_line(s: str) -> str:
    s = s.strip().strip('"').strip()
    # remove stray JSON-ish key labels that slipped through
    s = re.sub(r'^\s*"(short|medium|long)"\s*:\s*', "", s, flags=re.I)
    s = re.sub(r'^\s*[\[\]\{\}],?\s*', "", s)
    s = re.sub(r'\s*\[\s*$', "", s)
    s = re.sub(r'^\s*-\s*"', "- ", s)
    return s

def _as_bullets(text: str, max_items=12, min_len=20):
    # if already bulletized, normalize
    lines = [ln for ln in (text or "").splitlines() if ln.strip()]
    bullets = []
    if any(ln.lstrip().startswith(("-", "•")) for ln in lines):
        for ln in lines:
            if ln.lstrip().startswith(("-", "•")):
                item = re.sub(r'^\s*[-•]\s*', "", ln).strip().strip('"')
                if len(item) >= min_len:
                    bullets.append(_end_at_sentence(_clean_line(item)))
    else:
        # make bullets from sentences
        sents = [t.strip() for t in _SENT_SPLIT.split(text or "") if t.strip()]
        for s in sents:
            if len(s) >= min_len:
                bullets.append(_end_at_sentence(_clean_line(s)))
    # trim and cap
    bullets = [b for b in bullets if b and not b.endswith((":", ";"))]
    return [f"- {b}" for b in bullets[:max_items]]

def sanitize_summaries_v3(path_in: Path, path_out: Path,
                          min_len_short=60, min_len_medium=120,
                          max_med_bullets=8, max_long_bullets=15):
    kept = dropped = 0
    out = []

    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        short  = _end_at_sentence(_clean_line(str(row.get("short",""))))
        medium = str(row.get("medium","")).strip()
        long   = str(row.get("long","")).strip()

        # build bullets
        med_bullets  = _as_bullets(medium, max_items=max_med_bullets)
        long_bullets = _as_bullets(long,   max_items=max_long_bullets)

        # repair short if too short: borrow first bullet(s) from medium
        if len(short) < min_len_short and med_bullets:
            short = _end_at_sentence(re.sub(r"^-\s*", "", med_bullets[0]))

        # if medium too thin, synthesize from long or short
        med_text = "\n".join(med_bullets) if len("\n".join(med_bullets)) >= min_len_medium else \
                   ("\n".join(long_bullets[:max_med_bullets]) if long_bullets else "\n".join(_as_bullets(short, 6)))

        row["short"]  = short
        row["medium"] = med_text.strip()
        row["long"]   = "\n".join(long_bullets).strip() if long_bullets else row["medium"]

        # final guards
        if len(row["short"]) < min_len_short:
            dropped += 1
            continue

        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] summaries v3 → {path_out} | kept={kept} dropped={dropped}")

def sanitize_qa_v3(path_in: Path, path_out: Path, min_len=60):
    kept = dropped = 0
    out = []
    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        ans = _end_at_sentence(_clean_line(str(row.get("answer",""))))
        # drop obviously clipped answers
        if len(ans) < min_len:
            dropped += 1
            continue
        row["answer"] = ans
        out.append(row); kept += 1
    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] qa v3 → {path_out} | kept={kept} dropped={dropped}")


In [20]:
summ_v3 = SFT_DIR / "summaries.v3.jsonl"
qa_v3   = SFT_DIR / "qa.v3.jsonl"

sanitize_summaries_v3(SFT_DIR / "summaries.v2.jsonl", summ_v3)
sanitize_qa_v3(SFT_DIR / "qa.v2.jsonl", qa_v3)

# quick peek
import json, itertools
for ln in itertools.islice(open(summ_v3, "r", encoding="utf-8"), 2):
    print(json.loads(ln))
for ln in itertools.islice(open(qa_v3, "r", encoding="utf-8"), 2):
    print(json.loads(ln))


[OK] summaries v3 → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\summaries.v3.jsonl | kept=40 dropped=0
[OK] qa v3 → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\qa.v3.jsonl | kept=34 dropped=2
{'section': 'Copyright Law United States Copyri > (E) Musical works database .-', 'pages': [98, 134], 'short': 'The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings.', 'medium': '- The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners, and sound recordings.\n- This database must be publicly accessible and include matched and unmatched works.\n- The collective must also provide identifying information for each sound recording and report usage data for musical works used under the blanket license.\n- Additionally, the coll

In [21]:
import json, re

summ_v3  = SFT_DIR / "summaries.v3.jsonl"
qa_v3    = SFT_DIR / "qa.v3.jsonl"
summ_v3a = SFT_DIR / "summaries.v3a.jsonl"
qa_v3a   = SFT_DIR / "qa.v3a.jsonl"

_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')

def _end_at_sentence(s: str) -> str:
    s = s.strip().replace("\u2014","-")
    # drop trailing open bracketed citation fragments (e.g., "[p.")
    s = re.sub(r"\s*\[[^\]]*$", "", s)
    # if no terminal punctuation, add a period
    if not s.endswith(('.', '!', '?')):
        s += '.'
    # if the final sentence is extremely short (common after truncation), drop it
    parts = [p.strip() for p in _SENT_SPLIT.split(s) if p.strip()]
    if parts and len(parts[-1]) < 8 and len(parts) > 1:
        s = " ".join(parts[:-1]).strip()
        if not s.endswith(('.', '!', '?')):
            s += '.'
    return s

def _hard_clean_text(s: str) -> str:
    # remove JSON-ish wrappers & dangling quotes/arrays/objects
    s = s.replace('\n"', '\n').replace('", "', '. ')
    s = re.sub(r'^\s*["\[\{]+', '', s).strip()
    s = re.sub(r'["\]\}]+\.?\s*$', '', s).strip()
    s = re.sub(r'\s+",\s*"', '. ', s)
    s = s.replace('."', '.').replace('".', '.')
    # trim trailing “as well as …” style unfinished phrases
    s = re.sub(r'(as well as|including but not limited to|including)\s*$', '', s, flags=re.I)
    s = re.sub(r'\s{2,}', ' ', s)
    return s.strip()

def _as_bullets_from_sentences(text: str, max_items=12, min_len=20):
    # Always recompute bullets from sentences to avoid carrying broken formatting forward
    s = _hard_clean_text(text or "")
    sents = [t.strip() for t in _SENT_SPLIT.split(s) if t.strip()]
    out = []
    for t in sents:
        t = _end_at_sentence(t)
        if len(t) >= min_len and not t.endswith((":", ";")):
            out.append(f"- {t}")
        if len(out) >= max_items: break
    return out

def clean_summaries_v3a(path_in: Path, path_out: Path,
                        min_len_short=60,
                        med_items=6, long_items=12):
    kept = dropped = 0
    out = []
    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        short  = _end_at_sentence(_hard_clean_text(str(row.get("short",""))))
        medium = str(row.get("medium",""))
        long   = str(row.get("long",""))

        med_bullets  = _as_bullets_from_sentences(medium, max_items=med_items)
        long_bullets = _as_bullets_from_sentences(long,   max_items=long_items)

        # if medium is still thin, synthesize from long or short
        if len(med_bullets) < max(3, med_items//2):
            med_bullets = _as_bullets_from_sentences(long or short, max_items=med_items)

        # ensure short has at least one solid sentence
        if len(short) < min_len_short and med_bullets:
            short = med_bullets[0][2:]  # drop "- "

        row["short"]  = short
        row["medium"] = "\n".join(med_bullets).strip()
        row["long"]   = "\n".join(long_bullets).strip() or row["medium"]

        if len(row["short"]) < min_len_short:
            dropped += 1
            continue
        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] summaries v3a → {path_out} | kept={kept} dropped={dropped}")

def clean_qa_v3a(path_in: Path, path_out: Path, min_len=60):
    kept = dropped = 0
    out = []
    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        ans = _hard_clean_text(str(row.get("answer","")))
        ans = _end_at_sentence(ans)
        # if answer still too short (typical clipped artifacts), drop
        if len(ans) < min_len:
            dropped += 1
            continue
        row["answer"] = ans
        out.append(row); kept += 1
    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] qa v3a → {path_out} | kept={kept} dropped={dropped}")

In [22]:
# Run
clean_summaries_v3a(summ_v3, summ_v3a)
clean_qa_v3a(qa_v3, qa_v3a)

# Quick peek
import itertools
print("\nSummaries v3a sample:")
for ln in itertools.islice(open(summ_v3a, "r", encoding="utf-8"), 2):
    print(json.loads(ln))
print("\nQA v3a sample:")
for ln in itertools.islice(open(qa_v3a, "r", encoding="utf-8"), 2):
    print(json.loads(ln))


[OK] summaries v3a → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\summaries.v3a.jsonl | kept=40 dropped=0
[OK] qa v3a → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\qa.v3a.jsonl | kept=34 dropped=0

Summaries v3a sample:
{'section': 'Copyright Law United States Copyri > (E) Musical works database .-', 'pages': [98, 134], 'short': 'The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings.', 'medium': '- - The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners, and sound recordings.\n- - This database must be publicly accessible and include matched and unmatched works.\n- - The collective must also provide identifying information for each sound recording and report usage data for musical works used under the blanket l

In [23]:
from src.train.sft_build import build_sft
train_p, dev_p, test_p = build_sft(
    summaries_path=summ_v3a,
    qa_path=qa_v3a,
    out_dir=SFT_DIR,
    train_frac=0.85, dev_frac=0.075, test_frac=0.075
)
print(train_p, dev_p, test_p)

[OK] SFT examples: total=154 | train=130 dev=11 test=13
D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\train.jsonl D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\dev.jsonl D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\test.jsonl


In [24]:
import json, re, itertools

summ_in  = SFT_DIR / "summaries.v3a.jsonl"
qa_in    = SFT_DIR / "qa.v3a.jsonl"
summ_out = SFT_DIR / "summaries.v3b.jsonl"
qa_out   = SFT_DIR / "qa.v3b.jsonl"

_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+')
_STOP_LAST  = {"the","and","or","of","to","for","with","in","on","at","by"}

def _strip_bullet_prefix(s: str) -> str:
    # drop any existing bullet-like prefix and stray quotes
    s = re.sub(r'^\s*[-•·]+[\s\u00A0]+', '', s.strip())
    s = s.lstrip('"\''"“”‘’").strip()
    return s

def _normalize_punct(s: str) -> str:
    s = s.replace('..', '.').replace(',.', '.').replace('",', ',').replace('".', '.')
    s = re.sub(r'\s{2,}', ' ', s)
    return s.strip()

def _end_at_sentence(s: str) -> str:
    # remove any dangling bracketed fragments like "[p."
    s = re.sub(r'\s*\[[^\]]*$', '', s.strip())
    if not s.endswith(('.', '!', '?')):
        s += '.'
    parts = [p.strip() for p in _SENT_SPLIT.split(s) if p.strip()]
    # drop ultra-short last sentence if it’s just a fragment
    if parts and len(parts[-1]) < 8 and len(parts) > 1:
        s = " ".join(parts[:-1]).strip()
        if not s.endswith(('.', '!', '?')):
            s += '.'
    return s

def _bullets_from_text(raw: str, max_items: int = 12, min_len: int = 20):
    # split into sentences, strip pre-existing bullets, re-bulletize cleanly
    sents = [t.strip() for t in _SENT_SPLIT.split(raw or "") if t.strip()]
    out = []
    for t in sents:
        t = _strip_bullet_prefix(_normalize_punct(t))
        t = _end_at_sentence(t)
        if len(t) >= min_len and not t.endswith((':',';')):
            out.append(f"- {t}")
        if len(out) >= max_items: break
    return out

def clean_summaries_v3b(path_in: Path, path_out: Path,
                        med_items=6, long_items=12, min_len_short=60):
    kept = dropped = 0
    out = []
    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)

        short = _normalize_punct(_strip_bullet_prefix(str(row.get("short",""))))
        short = _end_at_sentence(short)

        med_bul = _bullets_from_text(str(row.get("medium","")), max_items=med_items)
        if len(med_bul) < max(3, med_items//2):
            # backfill from long/short if needed
            med_bul = _bullets_from_text(str(row.get("long","")) or short, max_items=med_items)

        long_bul = _bullets_from_text(str(row.get("long","")), max_items=long_items)
        if not long_bul:
            long_bul = med_bul

        row["short"]  = short
        row["medium"] = "\n".join(med_bul)
        row["long"]   = "\n".join(long_bul)

        # sanity
        if len(row["short"]) < min_len_short:
            dropped += 1
            continue
        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] summaries v3b → {path_out} | kept={kept} dropped={dropped}")

def _trim_if_ends_on_stopword(ans: str) -> str:
    # If the last token is a stopword (e.g., “… the.”), trim to previous sentence
    toks = re.findall(r"[A-Za-z]+", ans)
    if toks and toks[-1].lower() in _STOP_LAST:
        m = re.search(r'(?s)(.*[.!?])\s+[^.!?]*$', ans)
        if m:
            return m.group(1).strip()
    return ans

def clean_qa_v3b(path_in: Path, path_out: Path, min_len=60):
    kept = dropped = 0
    out = []
    for line in open(path_in, "r", encoding="utf-8"):
        row = json.loads(line)
        ans = _normalize_punct(str(row.get("answer","")).strip())
        ans = _end_at_sentence(ans)
        ans = _trim_if_ends_on_stopword(ans)
        if len(ans) < min_len:
            dropped += 1
            continue
        row["answer"] = ans
        out.append(row); kept += 1

    with open(path_out, "w", encoding="utf-8") as f:
        for r in out:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"[OK] qa v3b → {path_out} | kept={kept} dropped={dropped}")

# Run
clean_summaries_v3b(summ_in, summ_out)
clean_qa_v3b(qa_in, qa_out)

# Peek
print("\nSummaries v3b sample:")
for ln in itertools.islice(open(summ_out, "r", encoding="utf-8"), 2):
    print(json.loads(ln))
print("\nQA v3b sample:")
for ln in itertools.islice(open(qa_out, "r", encoding="utf-8"), 2):
    print(json.loads(ln))


[OK] summaries v3b → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\summaries.v3b.jsonl | kept=40 dropped=0
[OK] qa v3b → D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\qa.v3b.jsonl | kept=34 dropped=0

Summaries v3b sample:
{'section': 'Copyright Law United States Copyri > (E) Musical works database .-', 'pages': [98, 134], 'short': 'The mechanical licensing collective must establish and maintain a database containing information about musical works, shares of such works, copyright owners, and sound recordings.', 'medium': '- - The mechanical licensing collective is responsible for establishing and maintaining a comprehensive database of musical works, including information on shares, copyright owners, and sound recordings.\n- - This database must be publicly accessible and include matched and unmatched works.\n- - The collective must also provide identifying information for each sound recording and report usage data for musical works used under the blanket l

In [25]:
from src.train.sft_build import build_sft
train_p, dev_p, test_p = build_sft(
    summaries_path=summ_out,
    qa_path=qa_out,
    out_dir=SFT_DIR,
    train_frac=0.85, dev_frac=0.075, test_frac=0.075
)
print(train_p, dev_p, test_p)


[OK] SFT examples: total=154 | train=130 dev=11 test=13
D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\train.jsonl D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\dev.jsonl D:\IIT BBS\Job Resources\Business Optima\pdf-agent\data\sft\test.jsonl
