In [2]:
import os, glob, json, xml.etree.ElementTree as ET
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ── CONFIG ──
# Directory that is searched recursively for *.nxml source files.
NXML_DIR      = "statpearls_NBK430685"
# JSON-lines file holding one extracted section per line; written by the
# extraction step and read back before embedding.
JSONL_PATH    = "rag-dataset.jsonl"
# Destination file for the serialized FAISS index.
FAISS_INDEX   = "rag-index.faiss"
# Destination file for the per-section metadata list (JSON).
META_PATH     = "rag-metadata.json"
# Model identifier handed to SentenceTransformer below.
MODEL_NAME    = "sentence-transformers/all-mpnet-base-v2"
# ────────────

# Loaded once at module level and reused for every embedding batch.
model = SentenceTransformer(MODEL_NAME)

def clean_text(elem):
    """Return the readable text below *elem* as blank-line-separated blocks.

    Descendants are visited in document order. Each ``<p>`` contributes its
    full text and each ``<list-item>`` contributes its full text prefixed
    with ``"• "``. Once a ``<p>`` or ``<list-item>`` has been captured its
    children are not visited again, so a paragraph wrapped in a list item
    (or a list nested inside a paragraph) appears exactly once in the
    output instead of once per matching tag.

    Args:
        elem: An ``xml.etree.ElementTree.Element`` to extract text from.

    Returns:
        The collected blocks joined by ``"\\n\\n"``; ``""`` when no
        non-empty block is found.
    """
    pieces = []
    # Explicit stack instead of recursion; children are pushed reversed so
    # they pop in document order.
    pending = list(elem)[::-1]
    while pending:
        node = pending.pop()
        if node.tag == "p" or node.tag == "list-item":
            text = "".join(node.itertext()).strip()
            if text:
                pieces.append("• " + text if node.tag == "list-item" else text)
            # Do not descend: itertext() already covered the whole subtree.
            continue
        pending.extend(list(node)[::-1])
    return "\n\n".join(pieces)

def _element_text(el):
    """Return the stripped text of *el* including inline children, or ""."""
    if el is None:
        return ""
    # itertext() rather than .text: .text is None (or truncated) when the
    # element starts with or contains inline markup such as <italic>.
    return "".join(el.itertext()).strip()


def extract_sections(path):
    """Yield one record per top-level body section of an NXML document.

    Args:
        path: Filesystem path of the ``.nxml`` file to parse.

    Yields:
        Dicts with the keys ``doc_id``, ``doc_title``, ``section_title`` and
        ``text``. Sections whose title contains one of the excluded phrases
        (case-insensitive) and sections with no extractable text are
        skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML
            (raised on first iteration, before any record is yielded).
    """
    root = ET.parse(path).getroot()
    # Fall back to the file name when the root element has no id attribute.
    doc_id = root.attrib.get("id", os.path.basename(path))
    # Fall back to the id when the title is missing or empty.
    doc_title = _element_text(root.find(".//book-part-meta/title-group/title")) or doc_id

    EXCLUDE = {"review questions", "references", "learning outcome", "continuing education activity"}
    for sec in root.findall(".//body/sec"):
        stitle = _element_text(sec.find("title"))
        lowered = stitle.lower()
        if any(exc in lowered for exc in EXCLUDE):
            continue
        text = clean_text(sec)
        if text:
            yield {
                "doc_id": doc_id,
                "doc_title": doc_title,
                "section_title": stitle,
                "text": text
            }

# Step 1: Extract & save JSONL
# glob.glob returns paths in arbitrary order; sorting makes the record order
# (and therefore the row ids of the index built from it) reproducible.
nxml_paths = sorted(glob.glob(os.path.join(NXML_DIR, "**", "*.nxml"), recursive=True))
skipped = 0
with open(JSONL_PATH, "w", encoding="utf-8") as out:
    for path in nxml_paths:
        try:
            for rec in extract_sections(path):
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
        except ET.ParseError as exc:
            # One malformed file should not abort the whole corpus build.
            # The parse happens before the first record is yielded, so no
            # partial output from this file has been written.
            skipped += 1
            print(f"⚠️ Skipping unparseable file {path}: {exc}")
print("✅ Wrote", JSONL_PATH)
if skipped:
    print(f"⚠️ Skipped {skipped} unparseable file(s)")

# Step 2: Load records
# Read the JSONL back so the following steps operate on exactly what was
# persisted; each line is one JSON object.
with open(JSONL_PATH, "r", encoding="utf-8", errors="ignore") as f:
    records = [json.loads(line) for line in f]
print("🔍 Loaded", len(records), "sections")

# Step 3: Embed in chunks
BATCH_SIZE = 64
texts = [r["text"] for r in records]
if not texts:
    # Without this guard np.vstack([]) below fails with an opaque
    # "need at least one array to concatenate" message.
    raise ValueError(
        f"No sections to embed: {JSONL_PATH!r} contains no records "
        f"(check that {NXML_DIR!r} holds .nxml files)"
    )

# NOTE(review): the model presumably truncates inputs longer than its
# maximum sequence length, so long sections may be only partially embedded;
# confirm, and consider splitting sections into smaller passages.
embeddings = []
for i in range(0, len(texts), BATCH_SIZE):
    chunk = texts[i:i+BATCH_SIZE]
    # normalize_embeddings=True yields unit-length vectors, so an
    # inner-product search over them ranks by cosine similarity.
    embs = model.encode(chunk, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
    embeddings.append(embs)
    print(f"🔹 Embedded {i + len(chunk)} / {len(texts)}")

# One row per record, in the same order as `records`.
emb_array = np.vstack(embeddings)

# Step 4: Create FAISS index
# The index is sized from the embedding width (second axis of emb_array);
# rows are added in array order, so row i of the index is record i.
dim = np.shape(emb_array)[1]
index = faiss.IndexFlatIP(dim)
index.add(emb_array)

# Persist the populated index to disk.
faiss.write_index(index, FAISS_INDEX)
print("✅ FAISS index saved to", FAISS_INDEX)

# Step 5: Save metadata
# Keep only the identifying fields (the section text itself is left out);
# entries stay in the same order as `records`.
meta = [
    {field: rec[field] for field in ("doc_id", "doc_title", "section_title")}
    for rec in records
]
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print("✅ Metadata saved to", META_PATH)

✅ Wrote rag-dataset.jsonl
🔍 Loaded 99056 sections
🔹 Embedded 64 / 99056
🔹 Embedded 128 / 99056
🔹 Embedded 192 / 99056
🔹 Embedded 256 / 99056
🔹 Embedded 320 / 99056
🔹 Embedded 384 / 99056
🔹 Embedded 448 / 99056
🔹 Embedded 512 / 99056
🔹 Embedded 576 / 99056
🔹 Embedded 640 / 99056
🔹 Embedded 704 / 99056
🔹 Embedded 768 / 99056
🔹 Embedded 832 / 99056
🔹 Embedded 896 / 99056
🔹 Embedded 960 / 99056
🔹 Embedded 1024 / 99056
🔹 Embedded 1088 / 99056
🔹 Embedded 1152 / 99056
🔹 Embedded 1216 / 99056
🔹 Embedded 1280 / 99056
🔹 Embedded 1344 / 99056
🔹 Embedded 1408 / 99056
🔹 Embedded 1472 / 99056
🔹 Embedded 1536 / 99056
🔹 Embedded 1600 / 99056
🔹 Embedded 1664 / 99056
🔹 Embedded 1728 / 99056
🔹 Embedded 1792 / 99056
🔹 Embedded 1856 / 99056
🔹 Embedded 1920 / 99056
🔹 Embedded 1984 / 99056
🔹 Embedded 2048 / 99056
🔹 Embedded 2112 / 99056
🔹 Embedded 2176 / 99056
🔹 Embedded 2240 / 99056
🔹 Embedded 2304 / 99056
🔹 Embedded 2368 / 99056
🔹 Embedded 2432 / 99056
🔹 Embedded 2496 / 99056
🔹 Embedded 2560 / 99056
🔹 Embe

🔹 Embedded 20544 / 99056
🔹 Embedded 20608 / 99056
🔹 Embedded 20672 / 99056
🔹 Embedded 20736 / 99056
🔹 Embedded 20800 / 99056
🔹 Embedded 20864 / 99056
🔹 Embedded 20928 / 99056
🔹 Embedded 20992 / 99056
🔹 Embedded 21056 / 99056
🔹 Embedded 21120 / 99056
🔹 Embedded 21184 / 99056
🔹 Embedded 21248 / 99056
🔹 Embedded 21312 / 99056
🔹 Embedded 21376 / 99056
🔹 Embedded 21440 / 99056
🔹 Embedded 21504 / 99056
🔹 Embedded 21568 / 99056
🔹 Embedded 21632 / 99056
🔹 Embedded 21696 / 99056
🔹 Embedded 21760 / 99056
🔹 Embedded 21824 / 99056
🔹 Embedded 21888 / 99056
🔹 Embedded 21952 / 99056
🔹 Embedded 22016 / 99056
🔹 Embedded 22080 / 99056
🔹 Embedded 22144 / 99056
🔹 Embedded 22208 / 99056
🔹 Embedded 22272 / 99056
🔹 Embedded 22336 / 99056
🔹 Embedded 22400 / 99056
🔹 Embedded 22464 / 99056
🔹 Embedded 22528 / 99056
🔹 Embedded 22592 / 99056
🔹 Embedded 22656 / 99056
🔹 Embedded 22720 / 99056
🔹 Embedded 22784 / 99056
🔹 Embedded 22848 / 99056
🔹 Embedded 22912 / 99056
🔹 Embedded 22976 / 99056
🔹 Embedded 23040 / 99056


🔹 Embedded 40768 / 99056
🔹 Embedded 40832 / 99056
🔹 Embedded 40896 / 99056
🔹 Embedded 40960 / 99056
🔹 Embedded 41024 / 99056
🔹 Embedded 41088 / 99056
🔹 Embedded 41152 / 99056
🔹 Embedded 41216 / 99056
🔹 Embedded 41280 / 99056
🔹 Embedded 41344 / 99056
🔹 Embedded 41408 / 99056
🔹 Embedded 41472 / 99056
🔹 Embedded 41536 / 99056
🔹 Embedded 41600 / 99056
🔹 Embedded 41664 / 99056
🔹 Embedded 41728 / 99056
🔹 Embedded 41792 / 99056
🔹 Embedded 41856 / 99056
🔹 Embedded 41920 / 99056
🔹 Embedded 41984 / 99056
🔹 Embedded 42048 / 99056
🔹 Embedded 42112 / 99056
🔹 Embedded 42176 / 99056
🔹 Embedded 42240 / 99056
🔹 Embedded 42304 / 99056
🔹 Embedded 42368 / 99056
🔹 Embedded 42432 / 99056
🔹 Embedded 42496 / 99056
🔹 Embedded 42560 / 99056
🔹 Embedded 42624 / 99056
🔹 Embedded 42688 / 99056
🔹 Embedded 42752 / 99056
🔹 Embedded 42816 / 99056
🔹 Embedded 42880 / 99056
🔹 Embedded 42944 / 99056
🔹 Embedded 43008 / 99056
🔹 Embedded 43072 / 99056
🔹 Embedded 43136 / 99056
🔹 Embedded 43200 / 99056
🔹 Embedded 43264 / 99056


🔹 Embedded 60992 / 99056
🔹 Embedded 61056 / 99056
🔹 Embedded 61120 / 99056
🔹 Embedded 61184 / 99056
🔹 Embedded 61248 / 99056
🔹 Embedded 61312 / 99056
🔹 Embedded 61376 / 99056
🔹 Embedded 61440 / 99056
🔹 Embedded 61504 / 99056
🔹 Embedded 61568 / 99056
🔹 Embedded 61632 / 99056
🔹 Embedded 61696 / 99056
🔹 Embedded 61760 / 99056
🔹 Embedded 61824 / 99056
🔹 Embedded 61888 / 99056
🔹 Embedded 61952 / 99056
🔹 Embedded 62016 / 99056
🔹 Embedded 62080 / 99056
🔹 Embedded 62144 / 99056
🔹 Embedded 62208 / 99056
🔹 Embedded 62272 / 99056
🔹 Embedded 62336 / 99056
🔹 Embedded 62400 / 99056
🔹 Embedded 62464 / 99056
🔹 Embedded 62528 / 99056
🔹 Embedded 62592 / 99056
🔹 Embedded 62656 / 99056
🔹 Embedded 62720 / 99056
🔹 Embedded 62784 / 99056
🔹 Embedded 62848 / 99056
🔹 Embedded 62912 / 99056
🔹 Embedded 62976 / 99056
🔹 Embedded 63040 / 99056
🔹 Embedded 63104 / 99056
🔹 Embedded 63168 / 99056
🔹 Embedded 63232 / 99056
🔹 Embedded 63296 / 99056
🔹 Embedded 63360 / 99056
🔹 Embedded 63424 / 99056
🔹 Embedded 63488 / 99056


🔹 Embedded 81216 / 99056
🔹 Embedded 81280 / 99056
🔹 Embedded 81344 / 99056
🔹 Embedded 81408 / 99056
🔹 Embedded 81472 / 99056
🔹 Embedded 81536 / 99056
🔹 Embedded 81600 / 99056
🔹 Embedded 81664 / 99056
🔹 Embedded 81728 / 99056
🔹 Embedded 81792 / 99056
🔹 Embedded 81856 / 99056
🔹 Embedded 81920 / 99056
🔹 Embedded 81984 / 99056
🔹 Embedded 82048 / 99056
🔹 Embedded 82112 / 99056
🔹 Embedded 82176 / 99056
🔹 Embedded 82240 / 99056
🔹 Embedded 82304 / 99056
🔹 Embedded 82368 / 99056
🔹 Embedded 82432 / 99056
🔹 Embedded 82496 / 99056
🔹 Embedded 82560 / 99056
🔹 Embedded 82624 / 99056
🔹 Embedded 82688 / 99056
🔹 Embedded 82752 / 99056
🔹 Embedded 82816 / 99056
🔹 Embedded 82880 / 99056
🔹 Embedded 82944 / 99056
🔹 Embedded 83008 / 99056
🔹 Embedded 83072 / 99056
🔹 Embedded 83136 / 99056
🔹 Embedded 83200 / 99056
🔹 Embedded 83264 / 99056
🔹 Embedded 83328 / 99056
🔹 Embedded 83392 / 99056
🔹 Embedded 83456 / 99056
🔹 Embedded 83520 / 99056
🔹 Embedded 83584 / 99056
🔹 Embedded 83648 / 99056
🔹 Embedded 83712 / 99056
