In [4]:
import json
import hashlib
from pathlib import Path
from webagent.framework.ingestion.loaders import load_pdf
from webagent.framework.ingestion.cleaners import basic_clean
from webagent.framework.ingestion.chunker import chunk_text

# ========= Configuration =========
# Input PDF to ingest.
# NOTE(review): absolute, machine-specific path — it only resolves on the original author's machine.
pdf_path = Path("/Users/juntao/Desktop/SJTU/ptjob/WebAIAgent-202508-withYubo/STIMULIZE-WebAgent/files/STIMULIZE_UserManual_08042025.pdf")
# Output file (one JSON object per line); relative, so it resolves against the current working directory.
out_path = Path("data/processed/manual_chunks_test.jsonl")
# Passed to chunk_text as `size`; the unit (characters vs. tokens) is defined by chunk_text — TODO confirm.
chunk_size = 800
# Passed to chunk_text as `overlap`; presumably the amount shared by consecutive chunks — TODO confirm.
overlap = 150
# Forwarded to load_pdf as `ocr_images`.
ocr_images = True
# Forwarded to load_pdf as `ocr_lang`; looks like Tesseract-style codes (English + Simplified Chinese) — TODO confirm.
ocr_lang = "eng+chi_sim"
# =======================

# Make sure the output directory exists before anything is written into it.
out_path.parent.mkdir(parents=True, exist_ok=True)

def _hash_id(text: str, source: str, page: int, ctype: str, idx: int) -> str:
    h = hashlib.sha1()
    h.update(text.encode("utf-8"))
    h.update(f"|{source}|{page}|{ctype}|{idx}".encode("utf-8"))
    return h.hexdigest()[:16]

def _dedup(chunks):
    seen = set()
    out = []
    for c in chunks:
        key = hashlib.md5(c["text"].encode("utf-8")).hexdigest()
        if key in seen:
            continue
        seen.add(key)
        out.append(c)
    return out

# Load the PDF
data = load_pdf(str(pdf_path), ocr_images=ocr_images, ocr_lang=ocr_lang)

# Each entry: (key in `data`, attribute holding the raw text, chunk type label).
# Page text is processed first, then image OCR text.
sections = (
    ("text_pages", "text", "text"),
    ("image_ocr", "ocr_text", "image_ocr"),
)

all_chunks = []
for data_key, text_attr, chunk_type in sections:
    for item in data.get(data_key, []):
        cleaned = basic_clean(getattr(item, text_attr))
        if not cleaned:
            # Nothing left after cleaning; skip this page/image.
            continue
        parts = chunk_text(cleaned, size=chunk_size, overlap=overlap)
        # Chunk indices are 1-based and restart for every page/image.
        all_chunks.extend(
            {
                "id": _hash_id(part, pdf_path.name, item.page, chunk_type, pos),
                "text": part,
                "metadata": {"source": pdf_path.name, "page": item.page, "type": chunk_type},
            }
            for pos, part in enumerate(parts, 1)
        )

# Remove chunks with duplicate text, remembering the counts for the summary line.
before = len(all_chunks)
all_chunks = _dedup(all_chunks)
after = len(all_chunks)

# Write one JSON object per line, keeping non-ASCII characters unescaped.
with out_path.open("w", encoding="utf-8") as sink:
    sink.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in all_chunks)

print(f"✅ 去重前 {before} 个，去重后 {after} 个 → {out_path}")


✅ 去重前 127 个，去重后 127 个 → data/processed/manual_chunks_test.jsonl
