In [1]:
# bootstrap
import os, sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and climbing towards the filesystem root, that contains a
# pyproject.toml. The notebook then runs from that directory and puts it on
# sys.path.
START_DIR = Path.cwd()
ROOT = next(
    (d for d in (START_DIR, *START_DIR.parents) if (d / "pyproject.toml").exists()),
    None,
)
if ROOT is None:
    # No marker anywhere up the tree. Stay where the kernel started rather
    # than silently switching to the filesystem root and adding it to sys.path.
    print(
        f"WARNING: no pyproject.toml found at or above {START_DIR}; "
        "using the current directory as project root.",
        file=sys.stderr,
    )
    ROOT = START_DIR
os.chdir(ROOT)
# Guarded so that re-running this cell does not add duplicate sys.path entries.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
print("Project root:", ROOT)

Project root: d:\IIT BBS\Job Resources\Business Optima\new-pdf-agent


In [2]:
# config + doc
from packages.core_config.config import load_yaml

# Load the provider config together with the generic-legal pipeline config
# (how the two files are combined is decided by load_yaml).
cfg = load_yaml(
    "configs/providers.yaml",
    "configs/pipelines/generic_legal.yaml",
)

# Identifier of the document evaluated by the cells below.
doc_id = "NFS_2019"

# Per-mode evaluation settings; each lookup falls back to {} when the section
# (or the enclosing "eval" section) is absent from the config.
cb, rag = (
    cfg.get("eval", {}).get(section, {})
    for section in ("closed_book", "rag")
)
# Settings for the LLM endpoint used by both evaluations.
oll = cfg.get("eval_ollama", {})

In [3]:
# Closed-book eval
from packages.eval.closed_book import ClosedBookEvalConfig, run_closed_book

# Build the closed-book evaluation config. Every value is read from the loaded
# YAML sections (cb / cfg / oll), coerced to the expected type, and falls back
# to the default given here when the key is missing.
cb_cfg = ClosedBookEvalConfig(
    use_llm=bool(cb.get("use_llm", True)),
    max_questions=int(cb.get("max_questions", 50)),
    datasets_root=str(
        cfg.get("sft", {})
        .get("generation", {})
        .get("datasets_root", "data/datasets")
    ),
    # LLM endpoint settings, table-driven:
    # (config field, key in the eval_ollama section, type coercion, default).
    **{
        field: cast(oll.get(key, fallback))
        for field, key, cast, fallback in (
            ("llm_base_url", "base_url", str, "http://localhost:11434"),
            ("llm_model", "model", str, "llama3.2:latest"),
            ("llm_temperature", "temperature", float, 0.2),
            ("llm_max_new_tokens", "max_new_tokens", int, 256),
            ("connect_timeout", "connect_timeout", int, 30),
            ("read_timeout", "read_timeout", int, 600),
            ("retries", "retries", int, 1),
        )
    },
)

# Run the evaluation for doc_id; the bare last expression displays the metrics.
metrics_cb = run_closed_book(doc_id, cb_cfg)
metrics_cb

{'n': 50, 'f1': 0.0860385562728328, 'rougeL': 0.06953322003772429}

In [4]:
# RAG eval
from packages.eval.rag_eval import RAGEvalConfig, run_rag_eval

# Build the RAG evaluation config. Values come from the loaded YAML sections
# (rag / cfg / oll), are coerced to the expected type, and fall back to the
# defaults given here when a key is missing.
idx_cfg = cfg.get("index", {})  # for bge_use_prompt
rag_cfg = RAGEvalConfig(
    use_llm=bool(rag.get("use_llm", True)),
    max_questions=int(rag.get("max_questions", 50)),
    # retriever
    persist_path="data/artifacts",
    embed_model_or_path=str(cfg.get("embedding", {}).get("model", "BAAI/bge-base-en-v1.5")),
    device=str(cfg.get("embedding", {}).get("device", "cpu")),
    bge_use_prompt=bool(idx_cfg.get("bge_use_prompt", True)),
    # NOTE(review): presumably candidates retrieved -> kept after reranking ->
    # passed on to the LLM; confirm against RAGEvalConfig.
    top_k=int(rag.get("top_k", 12)),
    rerank_top_k=int(rag.get("rerank_top_k", 8)),
    return_top_k=int(rag.get("return_top_k", 6)),
    # LLM
    # The fallback URL must match the closed-book cell (Ollama's default port
    # is 11434) so both evaluations hit the same endpoint when the config has
    # no eval_ollama.base_url. It was previously 11435.
    llm_base_url=str(oll.get("base_url", "http://localhost:11434")),
    llm_model=str(oll.get("model", "llama3.2:latest")),
    llm_temperature=float(oll.get("temperature", 0.2)),
    llm_max_new_tokens=int(oll.get("max_new_tokens", 256)),
    connect_timeout=int(oll.get("connect_timeout", 30)),
    read_timeout=int(oll.get("read_timeout", 600)),
    retries=int(oll.get("retries", 1)),
)

# Run the evaluation for doc_id; the bare last expression displays the metrics.
metrics_rag = run_rag_eval(doc_id, rag_cfg)
metrics_rag

  from tqdm.autonotebook import tqdm, trange


{'n': 50,
 'f1': 0.14187929778946853,
 'rougeL': 0.1314459178645347,
 'retrieval_recall@6': 0.52}

In [5]:
import pandas as pd
# Side-by-side comparison: one row per evaluation mode. Metrics reported by
# only one mode show up as NaN in the other mode's row.
pd.DataFrame(
    [
        {"mode": label, **scores}
        for label, scores in (("closed-book", metrics_cb), ("RAG", metrics_rag))
    ]
)

Unnamed: 0,mode,n,f1,rougeL,retrieval_recall@6
0,closed-book,50,0.086039,0.069533,
1,RAG,50,0.141879,0.131446,0.52
