In [1]:
! pip install langsmith openai langfuse
! pip install -qU requests bs4 lxml chromadb langchain langchain-text-splitters langchain-openai
! pip install -qU duckduckgo-search langchain-community ddgs

Collecting langfuse
  Downloading langfuse-3.8.1-py3-none-any.whl.metadata (2.4 kB)
Collecting wrapt<2.0,>=1.14 (from langfuse)
  Downloading wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (6.4 kB)
Collecting opentelemetry-exporter-otlp-proto-common==1.37.0 (from opentelemetry-exporter-otlp-proto-http<2.0.0,>=1.33.1->langfuse)
  Downloading opentelemetry_exporter_otlp_proto_common-1.37.0-py3-none-any.whl.metadata (1.8 kB)
Collecting opentelemetry-proto==1.37.0 (from opentelemetry-exporter-otlp-proto-http<2.0.0,>=1.33.1->langfuse)
  Downloading opentelemetry_proto-1.37.0-py3-none-any.whl.metadata (2.3 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.33.1 (from langfuse)
  Downloading opentelemetry_sdk-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-api<2.0.0,>=1.33.1 (from langfuse)
  Downloading opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-semantic-conventions==0.58b0 (from opentele

In [2]:
import os

os.environ["OPENAI_API_KEY"] = ""

In [3]:
# kb_en_to_chroma.py  — minimal & direct
import os, re, time, requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

BASE = "https://www.kapitalbank.az"
START = f"{BASE}/en"
UA = {"User-Agent": "kb-minicrawl/0.2"}
TIMEOUT = 15
MAX_PAGES = 50

def clean_url(u):
    u = urldefrag(u)[0]
    if not u: return None
    if not u.startswith("http"): u = urljoin(BASE, u)
    if not u.startswith(START): return None
    if re.search(r"\.(pdf|jpe?g|png|gif|svg|mp4|zip|docx?|xlsx?)$", u, re.I): return None
    return u

def extract_text(html):
    s = BeautifulSoup(html, "lxml")
    for t in s(["script","style","noscript","svg","footer","nav","header"]): t.decompose()
    n = s.select_one("main") or s.select_one("article") or s.body or s
    return " ".join((n.get_text(" ", strip=True) if n else s.get_text(" ", strip=True)).split())

visited, queue, pages = set(), [START], []
while queue and len(visited) < MAX_PAGES:
    url = queue.pop(0)
    if url in visited: continue
    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
        if r.ok and "text/html" in r.headers.get("Content-Type",""):
            txt = extract_text(r.text)
            if len(txt) > 200:
                pages.append({"url": url, "text": txt})
            s = BeautifulSoup(r.text, "lxml")
            for a in s.find_all("a", href=True):
                u = clean_url(a["href"])
                if u and u not in visited:
                    queue.append(u)
        visited.add(url); time.sleep(0.15)
    except requests.RequestException:
        visited.add(url)

import json

# Save the crawled pages data to a file for later use
pages_outfile = "kapitalbank_pages.json"
with open(pages_outfile, "w", encoding="utf-8") as f:
    json.dump(pages, f, indent=2, ensure_ascii=False)
print(f"Saved {len(pages)} pages to {pages_outfile}")

# Load crawled pages from JSON file to make them available for Chroma processing
with open(pages_outfile, "r", encoding="utf-8") as f:
    pages = json.load(f)
print(f"Loaded {len(pages)} pages from {pages_outfile}")

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ---- LangChain chunking ----
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
docs, metas = [], []
for p in pages:
    for chunk in splitter.split_text(p["text"]):
        docs.append(chunk)
        metas.append({"url": p["url"]})

# ---- OpenAI embeddings -> Chroma ----
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")  # cheap & solid
vs = Chroma.from_texts(
    texts=docs,
    embedding=emb,
    persist_directory=persist_dir,
    collection_name="kapitalbank_en",
    metadatas=metas,
)
vs.persist()
print(f"Indexed pages={len(pages)} chunks={len(docs)} into {persist_dir}/ (collection 'kapitalbank_en')")

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

persist_dir = "chroma_kapitalbank"
collection_name = "kapitalbank_en"
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing/persisted Chroma vector store
vs = Chroma(
    persist_directory=persist_dir,
    embedding_function=emb,
    collection_name=collection_name
)

Saved 39 pages to kapitalbank_pages.json
Loaded 39 pages from kapitalbank_pages.json
Indexed pages=39 chunks=161 into chroma_kapitalbank/ (collection 'kapitalbank_en')


  vs.persist()
  vs = Chroma(


In [4]:
# ========= PARAMS (from your command) =========
MODE = "pages"  # also supports "newsgroups" baseline if you toggle it
PAGES_FILE = pages_outfile
WEBSITE = BASE
PERSIST_DIR_STR = persist_dir
COLLECTION = collection_name
EMBEDDING_MODEL = "text-embedding-3-small"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 120
K_LIST = [1, 3, 5, 10]
CONCURRENCY = [1, 5, 10]
SAMPLE_QUERIES = 300
ACL_ALLOW_PREFIXES = [BASE]

# Security toggles
ENABLE_PII_MASKING = False              # set True to anonymize before ingestion
STORE_RAW_HASH = True                   # store hash of raw chunk in metadata (not the raw text)
DELETE_PREFIX = None                    # e.g. "https://www.hsbc.com/media/" if you want to demo deletion

# Outputs
MAKE_PLOTS = True
MAKE_REPORT = True
LLM_MODEL = "gpt-4o-mini"               # requires OPENAI_API_KEY in env
SHOW_INLINE = False                     # True to display images/HTML inline in notebook

# ========= Imports =========
import json, math, os, random, re, time, hashlib, statistics, base64
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd

# plotting (headless save-to-file; will display inline if SHOW_INLINE=True)
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# (Optional) labeled baseline support
try:
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.utils import Bunch
except Exception:
    fetch_20newsgroups = None
    Bunch = Any  # type: ignore

import os
from dotenv import load_dotenv

load_dotenv(override=True)

False

In [5]:
# ========= Utilities =========
class Timer:
    def __enter__(self): self.t0 = time.perf_counter(); return self
    def __exit__(self, *exc): self.elapsed = time.perf_counter() - self.t0

def percentile(xs: List[float], p: float) -> float:
    if not xs: return float("nan")
    xs = sorted(xs)
    k = (len(xs) - 1) * (p / 100.0)
    f, c = math.floor(k), math.ceil(k)
    if f == c: return xs[int(k)]
    return xs[f] * (c - k) + xs[c] * (k - f)

def dir_size_bytes(p: Path) -> int:
    total = 0
    for root, _, files in os.walk(p):
        for f in files:
            total += os.path.getsize(os.path.join(root, f))
    return total

# ========= Security (PII) =========
PII_PATTERNS = {
    "EMAIL": re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b"),
    "PHONE": re.compile(r"\b(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{2,4}\)?[\s.-]?){2,4}\d{2,4}\b"),
    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{10,30}\b"),
    "CREDIT_CARD": re.compile(r"\b(?:\d[ -]*?){13,19}\b"),
    # conservative generics; tune per locale
    "GEN_ID": re.compile(r"\b[A-Z]{2}\d{6}[A-Z]{2}\b"),
    "PASSPORT": re.compile(r"\b[A-PR-WY][1-9]\d{6}\b"),
}
def _hash16(s: str) -> str: return hashlib.sha256(("salt:"+s).encode()).hexdigest()[:16]

def scan_pii(text: str) -> Dict[str, int]:
    out = {k: 0 for k in PII_PATTERNS}
    for name, rx in PII_PATTERNS.items():
        out[name] = len(list(rx.finditer(text)))
    out["TOTAL"] = sum(out.values())
    return out

def anonymize_text(text: str) -> Tuple[str, Dict[str, int]]:
    counts = {k: 0 for k in PII_PATTERNS}
    out = text
    for name, rx in PII_PATTERNS.items():
        def repl(m):
            counts[name] += 1
            return f"<{name}_{_hash16(m.group(0))}>"
        out = rx.sub(repl, out)
    counts["TOTAL"] = sum(counts.values())
    return out, counts

# ========= Data models =========
@dataclass
class QueryResultPages:
    lat_ms: float
    q_url: str
    q_chunk: int
    hits: List[Dict[str, Any]]

@dataclass
class LabeledCorpus:
    texts: List[str]
    labels: List[int]
    label_names: List[str]

@dataclass
class QueryResultLabeled:
    lat_ms: float
    ranks: List[int]

# ========= Pages (your flow) pipeline =========
def load_pages_json(path: Path) -> List[Dict[str, str]]:
    with path.open("r") as f:
        return json.load(f)

def chunk_pages(
    pages: List[Dict[str, str]],
    chunk_size: int,
    chunk_overlap: int,
    enable_pii_masking: bool,
    store_raw_hash: bool,
) -> Tuple[List[str], List[Dict[str, Any]], pd.DataFrame]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks: List[str] = []; metas: List[Dict[str, Any]] = []; pii_rows: List[Dict[str, Any]] = []
    for p in pages:
        url, text = p.get("url"), p.get("text", "") or ""
        for i, ch in enumerate(splitter.split_text(text)):
            raw = ch
            if enable_pii_masking: ch, pii = anonymize_text(raw)
            else:                  pii = scan_pii(raw)
            meta = {"url": url, "chunk_idx": i}
            if store_raw_hash: meta["raw_hash16"] = _hash16(raw)
            chunks.append(ch); metas.append(meta); pii_rows.append({"url": url, "chunk_idx": i, **pii})
    return chunks, metas, pd.DataFrame(pii_rows)

def build_index_from_texts(
    texts: List[str],
    metadatas: List[Dict[str, Any]],
    persist_dir: Path,
    collection_name: str,
    embedding_model: str,
) -> Chroma:
    emb = OpenAIEmbeddings(model=embedding_model)
    with Timer() as t:
        vs = Chroma.from_texts(
            texts=texts,
            embedding=emb,
            persist_directory=str(persist_dir),
            collection_name=collection_name,
            metadatas=metadatas,
        )
        # Chroma >=0.4 auto-persists; explicit persist() not required
    print(f"Indexed {len(texts)} docs in {t.elapsed:.2f}s ({len(texts)/max(t.elapsed,1e-6):.1f} docs/s)")
    print(f"Disk size: {dir_size_bytes(persist_dir)/(1024*1024):.1f} MB")
    return vs

def single_query_pages(vs: Chroma, query_chunk: str, q_meta: Dict[str, Any], k: int) -> QueryResultPages:
    t0 = time.perf_counter()
    docs = vs.similarity_search_with_score(query_chunk, k=k)
    lat_ms = (time.perf_counter() - t0) * 1000.0
    hits = [{"url": d.metadata.get("url"), "chunk_idx": d.metadata.get("chunk_idx"), "score": s} for d, s in docs]
    return QueryResultPages(lat_ms=lat_ms, q_url=q_meta["url"], q_chunk=int(q_meta["chunk_idx"]), hits=hits)

def compute_pages_quality(results: List[QueryResultPages], ks: List[int]) -> Dict[str, Any]:
    out: Dict[str, Any] = {}
    for k in ks:
        self_hits = 0; url_hits = 0
        for r in results:
            topk = r.hits[:k]
            if any(h["url"] == r.q_url and h["chunk_idx"] == r.q_chunk for h in topk): self_hits += 1
            if any(h["url"] == r.q_url for h in topk): url_hits += 1
        n = len(results) or 1
        out[f"SelfRecall@{k}"] = self_hits / n
        out[f"URLHit@{k}"] = url_hits / n
    return out

def is_allowed_url(url: str, allow_prefixes: List[str]) -> bool:
    return any(url and url.startswith(pref) for pref in allow_prefixes)

def acl_leakage_and_latency(vs: Chroma, sample_query_text: str, allow_prefixes: List[str], k_acl: int = 5) -> Dict[str, Any]:
    with Timer() as t1:
        unfiltered = vs.similarity_search_with_score(sample_query_text, k=k_acl)
    with Timer() as t2:  # app-layer post-filter
        filtered = [pair for pair in unfiltered if is_allowed_url(pair[0].metadata.get("url", ""), allow_prefixes)]
    return {"unfiltered_ms": t1.elapsed * 1000.0, "filtered_ms": (t1.elapsed + t2.elapsed) * 1000.0, "kept": len(filtered)}

# ========= Labeled (optional baseline) =========
def load_newsgroups(train_size: int, test_size: int, seed: int = 42) -> Tuple[LabeledCorpus, LabeledCorpus]:
    if fetch_20newsgroups is None:
        raise RuntimeError("scikit-learn not available; install it or set MODE='pages'")
    random.seed(seed)
    train: Bunch = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
    test:  Bunch = fetch_20newsgroups(subset="test",  remove=("headers", "footers", "quotes"))
    ti = list(range(len(train.data))); xi = list(range(len(test.data)))
    random.shuffle(ti); random.shuffle(xi)
    ti, xi = ti[:5000], xi[:300]
    return (
        LabeledCorpus([train.data[i] for i in ti], [int(train.target[i]) for i in ti], list(train.target_names)),
        LabeledCorpus([test.data[i]  for i in xi], [int(test.target[i])  for i in xi],  list(test.target_names)),
    )

# ========= Plotting (fixed) =========
def _ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True)

def plot_perf(runs: List[Dict[str, Any]], outdir: Path) -> List[Path]:
    _ensure_dir(outdir)
    conc = [r["concurrency"] for r in runs]
    qps  = [r.get("throughput_qps") for r in runs]
    p50  = [r.get("latency_ms_p50") for r in runs]
    p95  = [r.get("latency_ms_p95") for r in runs]

    # QPS
    plt.figure()
    plt.plot(conc, qps, marker="o")
    plt.xlabel("Concurrency"); plt.ylabel("Throughput (QPS)"); plt.title("Throughput vs Concurrency")
    p1 = outdir / "perf_throughput.png"; plt.savefig(p1, bbox_inches="tight"); plt.close()

    # Latency
    plt.figure()
    plt.plot(conc, p50, marker="o", label="p50")
    plt.plot(conc, p95, marker="o", label="p95")
    plt.xlabel("Concurrency"); plt.ylabel("Latency (ms)"); plt.title("Latency vs Concurrency"); plt.legend()
    p2 = outdir / "perf_latency.png"; plt.savefig(p2, bbox_inches="tight"); plt.close()
    return [p1, p2]

def plot_quality(quality: Dict[str, Any], outdir: Path) -> Path:
    _ensure_dir(outdir)
    keys = list(quality.keys()); vals = [quality[k] for k in keys]
    plt.figure(figsize=(max(6, len(keys)*0.8), 4))
    plt.bar(range(len(keys)), vals)
    plt.xticks(range(len(keys)), keys, rotation=45, ha="right")
    plt.ylim(0, 1.05); plt.ylabel("Score"); plt.title("Quality Metrics")
    p = outdir / "quality.png"; plt.savefig(p, bbox_inches="tight"); plt.close(); return p

def plot_security(pii_summary: Dict[str, Any], leakage_rate: Optional[float], outdir: Path) -> List[Path]:
    _ensure_dir(outdir)
    # Keep only numeric PII fields; drop aggregate/non-PII keys
    clean = {name: val for name, val in pii_summary.items()
             if isinstance(val, (int, float)) and name not in ("TOTAL", "chunk_idx")}
    names = list(clean.keys())
    counts = [clean[name] for name in names]

    # PII bar chart
    plt.figure(figsize=(max(6, len(names)*0.8), 4))
    plt.bar(range(len(names)), counts)
    plt.xticks(range(len(names)), names, rotation=45, ha="right")
    plt.ylabel("Count"); plt.title("PII occurrences by type")
    p1 = outdir / "security_pii.png"; plt.savefig(p1, bbox_inches="tight"); plt.close()

    # ACL leakage “card”
    plt.figure(figsize=(4, 1.5))
    if isinstance(leakage_rate, (int, float)):
        txt = f"ACL leakage rate (top-k): {leakage_rate:.4f}"
    else:
        txt = "ACL leakage rate: n/a"
    plt.text(0.01, 0.5, txt, va="center"); plt.axis("off")
    p2 = outdir / "security_acl.png"; plt.savefig(p2, bbox_inches="tight"); plt.close()
    return [p1, p2]

# ========= LLM explanation/report =========
def _call_llm_explain(summary: Dict[str, Any], model: str) -> str:
    prompt = (
        "You are a critical, no-fluff performance engineer. "
        "Explain the following Chroma retrieval benchmark summary for a technical audience. "
        "Be concise, prioritize strategic insights, call out bottlenecks, and give 3–5 concrete next steps. "
        "Return Markdown. Here is the JSON summary:\n\n"
        f"{json.dumps(summary, indent=2)}"
    )
    try:
        from openai import OpenAI  # new SDK
        client = OpenAI()
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": "You are a terse, incisive benchmarking analyst."},
                      {"role": "user", "content": prompt}],
            temperature=0.2,
        )
        return resp.choices[0].message.content.strip()
    except Exception:
        try:
            from langfuse.openai import openai  # shim, if you use it
            resp = openai.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": "You are a terse, incisive benchmarking analyst."},
                          {"role": "user", "content": prompt}],
                temperature=0.2,
            )
            return resp.choices[0].message.content.strip()
        except Exception as e:
            return f"LLM explanation unavailable. Reason: {e}"

def _img_to_base64(path: Path) -> str:
    with path.open("rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

def write_reports(persist_dir: Path, summary: Dict[str, Any], llm_md: str, images: List[Path]) -> tuple[Path, Path]:
    md_path = persist_dir / "benchmark_explanation.md"
    html_path = persist_dir / "benchmark_report.html"
    # Markdown
    md_parts = [
        "# Chroma Benchmark Report", "",
        "## Summary JSON", "```json", json.dumps(summary, indent=2), "```", "",
        "## LLM Explanation", llm_md or "*No explanation available.*", "",
        "## Plots",
    ]
    for img in images:
        md_parts += [f"### {img.name}", f"![{img.name}]({img.name})", ""]
    md_path.write_text("\n".join(md_parts), encoding="utf-8")
    # HTML (inline images)
    img_blocks = []
    for img in images:
        try:
            b64 = _img_to_base64(img)
            img_blocks.append(f'<h3>{img.name}</h3><img alt="{img.name}" src="data:image/png;base64,{b64}" style="max-width: 900px;" />')
        except Exception:
            img_blocks.append(f'<h3>{img.name}</h3><p>(Could not inline; see file on disk)</p>')
    html = f"""<!doctype html>
<html><head><meta charset="utf-8"><title>Chroma Benchmark Report</title></head>
<body>
<h1>Chroma Benchmark Report</h1>
<h2>Summary JSON</h2>
<pre>{json.dumps(summary, indent=2)}</pre>
<h2>LLM Explanation</h2>
<div>{llm_md if llm_md else "<em>No explanation available.</em>"}</div>
<h2>Plots</h2>
{''.join(img_blocks)}
</body></html>"""
    html_path.write_text(html, encoding="utf-8")
    return md_path, html_path

# ========= RUN (pages mode using your parameters) =========
random.seed(42)
persist_dir = Path(PERSIST_DIR_STR); persist_dir.mkdir(parents=True, exist_ok=True)

if MODE == "pages":
    # 1) Load & chunk (your flow)
    pages = load_pages_json(Path(PAGES_FILE))
    chunks, metas, pii_df = chunk_pages(
        pages=pages,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        enable_pii_masking=ENABLE_PII_MASKING,
        store_raw_hash=STORE_RAW_HASH,
    )
    # 2) Build index
    vs = build_index_from_texts(chunks, metas, persist_dir, COLLECTION, EMBEDDING_MODEL)

    # 3) Perf: run at given concurrencies
    all_idx = list(range(len(chunks))); random.shuffle(all_idx)
    q_idx = all_idx[:min(SAMPLE_QUERIES, len(all_idx))]
    K = max(K_LIST)

    runs = []
    for c in CONCURRENCY:
        results = []
        t0 = time.perf_counter()
        if c <= 1:
            for i in q_idx:
                results.append(single_query_pages(vs, chunks[i], metas[i], K))
        else:
            with ThreadPoolExecutor(max_workers=c) as ex:
                futs = [ex.submit(single_query_pages, vs, chunks[i], metas[i], K) for i in q_idx]
                for fut in as_completed(futs):
                    results.append(fut.result())
        total_s = time.perf_counter() - t0
        lats = [r.lat_ms for r in results]
        run_metrics = {
            "concurrency": c,
            "q_count": len(results),
            "total_time_s": total_s,
            "throughput_qps": len(results)/total_s if total_s else None,
            "latency_ms_p50": statistics.median(lats) if lats else None,
            "latency_ms_p95": percentile(lats, 95) if lats else None,
        }
        print(json.dumps(run_metrics, indent=2))
        runs.append(run_metrics)

    # 4) Quality (label-free) on worst concurrency
    worst_c = CONCURRENCY[-1]
    worst_results = []
    if worst_c <= 1:
        for i in q_idx: worst_results.append(single_query_pages(vs, chunks[i], metas[i], K))
    else:
        with ThreadPoolExecutor(max_workers=worst_c) as ex:
            futs = [ex.submit(single_query_pages, vs, chunks[i], metas[i], K) for i in q_idx]
            for fut in as_completed(futs): worst_results.append(fut.result())
    quality = compute_pages_quality(worst_results, K_LIST)
    print("Quality (label-free):", json.dumps(quality, indent=2))

    # 5) Security
    pii_summary = pii_df.sum(numeric_only=True).to_dict()
    K_ACL = min(5, K)
    leaks, total_checked = 0, 0
    for r in worst_results[:200]:
        topk = r.hits[:K_ACL]
        for h in topk:
            total_checked += 1
            if not is_allowed_url(h["url"] or "", ACL_ALLOW_PREFIXES): leaks += 1
    leakage_rate = (leaks / total_checked) if total_checked else None
    print("ACL leakage:", {"K": K_ACL, "checked": total_checked, "leakage_rate": leakage_rate})

    acl_demo = None
    if worst_results:
        acl_demo = acl_leakage_and_latency(vs, chunks[q_idx[0]], ACL_ALLOW_PREFIXES, k_acl=K_ACL)
        print("ACL latency demo:", json.dumps(acl_demo, indent=2))

    # 6) Optional delete demo
    post_delete_offenders = None
    if DELETE_PREFIX:
        targets = {m["url"] for m in metas if (m.get("url") or "").startswith(DELETE_PREFIX)}
        for url in targets:
            vs._collection.delete(where={"url": url})  # NOTE: private attr; demo-only
        offenders = 0
        for r in worst_results[:100]:
            if any((h["url"] or "").startswith(DELETE_PREFIX) for h in r.hits): offenders += 1
        post_delete_offenders = offenders

    # 7) Summary JSON
    summary = {
        "index": {
            "persist_dir": str(persist_dir),
            "collection": COLLECTION,
            "embedding_model": EMBEDDING_MODEL,
            "disk_mb": round(dir_size_bytes(persist_dir)/(1024*1024), 1),
            "docs": len(chunks),
        },
        "runs": runs,
        "quality": quality,
        "security": {
            "pii_total": int(pii_summary.get("TOTAL", 0)) if pii_summary else 0,
            "pii_breakdown": {k:int(v) for k,v in pii_summary.items() if isinstance(v,(int,float))},
            "acl": {"allow_prefixes": ACL_ALLOW_PREFIXES, "leakage_rate": leakage_rate, "latency_demo": acl_demo},
            "delete_prefix": DELETE_PREFIX, "post_delete_offenders": post_delete_offenders,
        }
    }
else:
    # Optional labeled baseline (not used here)
    train, test = load_newsgroups(5000, 300)
    metas = [{"topic": train.label_names[y]} for y in train.labels]
    vs = build_index_from_texts(train.texts, metas, Path(PERSIST_DIR_STR), COLLECTION, EMBEDDING_MODEL)
    summary = {"note": "labeled mode not executed in this cell"}

# 8) Save summary
out_json = Path(PERSIST_DIR_STR) / "benchmark_summary.json"
out_json.write_text(json.dumps(summary, indent=2), encoding="utf-8")
print(f"\nSaved summary → {out_json}")

# 9) Plots & report
plot_paths: List[Path] = []
if MAKE_PLOTS:
    plot_paths += plot_perf(summary["runs"], Path(PERSIST_DIR_STR))
    if "quality" in summary: plot_paths += [plot_quality(summary["quality"], Path(PERSIST_DIR_STR))]
    if "security" in summary:
        piis = summary["security"].get("pii_breakdown", {})
        leak = summary["security"].get("acl", {}).get("leakage_rate", None)
        plot_paths += plot_security(piis, leak, Path(PERSIST_DIR_STR))
    print(f"Saved plots → {[p.name for p in plot_paths]}")

llm_md = None
md_path = html_path = None
if MAKE_REPORT:
    llm_md = _call_llm_explain(summary, model=LLM_MODEL)
    md_path, html_path = write_reports(Path(PERSIST_DIR_STR), summary, llm_md, plot_paths)
    print(f"Saved LLM report → {md_path}")
    print(f"Saved HTML report → {html_path}")

# 10) Inline display (optional)
if SHOW_INLINE:
    from IPython.display import display, Image, HTML, Markdown
    if plot_paths:
        for p in plot_paths:
            display(Image(filename=str(p)))
    if html_path and html_path.exists():
        display(HTML(html_path.read_text()))
    elif llm_md:
        display(Markdown(llm_md))

Indexed 161 docs in 2.21s (72.8 docs/s)
Disk size: 5.5 MB
{
  "concurrency": 1,
  "q_count": 161,
  "total_time_s": 43.62599518100001,
  "throughput_qps": 3.690460225194329,
  "latency_ms_p50": 203.354830999956,
  "latency_ms_p95": 512.1905859999742
}
{
  "concurrency": 5,
  "q_count": 161,
  "total_time_s": 7.746535279999989,
  "throughput_qps": 20.783485026611825,
  "latency_ms_p50": 201.48165899991,
  "latency_ms_p95": 422.68770099985886
}
{
  "concurrency": 10,
  "q_count": 161,
  "total_time_s": 3.699653967000131,
  "throughput_qps": 43.5175833837636,
  "latency_ms_p50": 196.9967090001319,
  "latency_ms_p95": 333.61633399999846
}
Quality (label-free): {
  "SelfRecall@1": 0.2236024844720497,
  "URLHit@1": 0.84472049689441,
  "SelfRecall@3": 0.8695652173913043,
  "URLHit@3": 0.9627329192546584,
  "SelfRecall@5": 0.9254658385093167,
  "URLHit@5": 0.9875776397515528,
  "SelfRecall@10": 0.9751552795031055,
  "URLHit@10": 1.0
}
ACL leakage: {'K': 5, 'checked': 805, 'leakage_rate': 0.0}
