In [2]:
!nvidia-smi || true
!pip -q install faiss-cpu "transformers>=4.41.0" "accelerate>=0.30.0" langchain langchain-community langchain-huggingface

Sat Dec  6 01:56:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
from pathlib import Path

# Directories holding the retrieval artifacts; both point at the Colab
# working directory.
VECTORDIR = Path("/content")
PIPEOUT = Path("/content")

# Individual artifact files expected inside those directories.
FAISS_PATH = VECTORDIR.joinpath("faiss_index.bin")
EMB_PATH = PIPEOUT.joinpath("embeddings.npy")
CHUNKS_PATH = PIPEOUT.joinpath("chunks_metadata.json")

# Echo the resolved locations, one per line.
print(
    "Paths:",
    f"  FAISS: {FAISS_PATH}",
    f"  EMB:   {EMB_PATH}",
    f"  CHUNKS: {CHUNKS_PATH}",
    sep="\n",
)

# Fail fast, with a message naming the missing file, before anything is loaded.
print(f"Exists: {FAISS_PATH.exists()} {EMB_PATH.exists()} {CHUNKS_PATH.exists()}")
assert FAISS_PATH.exists(), f"Missing FAISS index at {FAISS_PATH}"
assert EMB_PATH.exists(), f"Missing embeddings.npy at {EMB_PATH}"
assert CHUNKS_PATH.exists(), f"Missing chunks_metadata.json at {CHUNKS_PATH}"


Paths:
  FAISS: /content/faiss_index.bin
  EMB:   /content/embeddings.npy
  CHUNKS: /content/chunks_metadata.json
Exists: True True True


In [4]:
import json, numpy as np, faiss

# Load the three retrieval artifacts: the FAISS index, the raw embedding
# matrix it was built from, and the chunk records the vectors describe.
index = faiss.read_index(str(FAISS_PATH))
emb_matrix = np.load(EMB_PATH)  # (N, d)
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    chunks = json.load(f)       # list of {index, text, metadata}

print("Index:", index.ntotal, "vectors; dim:", index.d)
print("Embeddings:", emb_matrix.shape)
print("Chunks:", len(chunks))

# Sanity checks: the index and the embedding matrix must agree in size ...
assert index.ntotal == emb_matrix.shape[0], "Index count != embeddings rows"
assert index.d == emb_matrix.shape[1], "Index dim != embedding dim"
# ... and because the retrievers look chunks up by FAISS row id
# (chunks[idx]), a differing chunk count would mean hits map to the wrong
# text or are silently dropped.
assert len(chunks) == index.ntotal, "Chunk count != index vectors"

Index: 12788 vectors; dim: 768
Embeddings: (12788, 768)
Chunks: 12788


In [5]:
!pip -q install -U transformers accelerate



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ===== AUTHENTICATE & LOAD LLAMA-3.2-3B =====
from huggingface_hub import notebook_login
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 1: Authenticate (opens login prompt)
#notebook_login()

# Step 2: Load Model
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# The Llama architecture ships with transformers itself, so
# `trust_remote_code` is not required. Leaving it off keeps code hosted in the
# model repository from being executed on this machine.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    # Half precision when a GPU is available, full precision on CPU.
    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)

print("Loaded:", MODEL_ID, "| device:", next(model.parameters()).device)



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Loaded: meta-llama/Llama-3.2-3B-Instruct | device: cuda:0


In [9]:
from google.colab import drive
# Mount Google Drive under /content/drive.
# NOTE(review): the artifact paths configured earlier point at /content, not
# at this mount -- confirm whether the mount is still required.
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:

# Smoke test: run one short greedy generation to confirm that the tokenizer
# and model loaded above work together on the model's device.
prompt = "Answer briefly: What is the liver's main function?"
inputs = {
    name: tensor.to(next(model.parameters()).device)
    for name, tensor in tokenizer(prompt, return_tensors="pt").items()
}
out = model.generate(
    **inputs,
    max_new_tokens=24,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
# The decoded sequence includes the prompt followed by the continuation.
print(tokenizer.decode(out[0], skip_special_tokens=True))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer briefly: What is the liver's main function? The liver is a vital organ that performs many functions, but its main function is to detoxify the body by removing toxins


In [11]:
!pip -q install langchain-huggingface
from langchain_huggingface import HuggingFaceEmbeddings
import numpy as np, torch

# Query-side embedding model, placed on the GPU when one is available.
# NOTE(review): presumably this is the same model that produced
# embeddings.npy / the FAISS index -- confirm, since query and chunk vectors
# must come from the same model to be comparable.
embedder = HuggingFaceEmbeddings(
    model_name="pritamdeka/S-BlueBERT-snli-multinli-stsb",
    model_kwargs=dict(device="cuda" if torch.cuda.is_available() else "cpu"),
    encode_kwargs=dict(normalize_embeddings=True, batch_size=32),
)

def embed_query(text: str) -> np.ndarray:
    """Embed a single query string as a one-row float32 matrix.

    Args:
        text: Query text passed to ``embedder.embed_query``.

    Returns:
        Array of shape ``(1, d)`` and dtype float32, the layout that
        ``index.search`` is called with elsewhere in this notebook.
    """
    vector = np.asarray(embedder.embed_query(text), dtype=np.float32)
    return vector.reshape((1, -1))


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
TOP_K = 3  # keep small for speed

def search_topk(query: str, k: int = TOP_K):
    """Return up to ``k`` chunks nearest to ``query`` in the FAISS index.

    Args:
        query: Free-text question; embedded with ``embed_query``.
        k: Number of neighbours requested from ``index.search``.

    Returns:
        List of shallow copies of the matching entries of ``chunks``, in the
        order FAISS returned them, each with an added ``"_score"`` key holding
        the raw value from ``index.search``. Fewer than ``k`` items are
        returned when FAISS has fewer valid neighbours.
        NOTE(review): whether a larger ``_score`` is better depends on the
        index metric, which is not visible here -- confirm.
    """
    q = embed_query(query)
    D, I = index.search(q, k)
    hits = []
    for idx, dist in zip(I[0], D[0]):
        idx = int(idx)
        # FAISS fills unused result slots with label -1. A bare
        # `idx < len(chunks)` check would accept -1 and return chunks[-1]
        # (the last chunk) as if it were a hit, so bound-check both sides.
        if 0 <= idx < len(chunks):
            item = dict(chunks[idx])  # copy so the shared chunk is not mutated
            item["_score"] = float(dist)
            hits.append(item)
    return hits

def render_context(selected, max_chars=900):  # cap for speed
    """Format retrieved chunks into one context string for the prompt.

    Args:
        selected: Iterable of chunk dicts. Each may carry ``"metadata"``
            (with ``"heading"`` / ``"source"``) and its body under ``"text"``
            or, failing that, ``"page_content"``.
        max_chars: Maximum number of body characters kept per chunk.

    Returns:
        One ``[Section: ... | Source: ...]`` header plus truncated body per
        chunk, with chunks separated by a blank line. Missing heading/source
        values are rendered as ``"Unknown"``.
    """
    def _block(record):
        meta = record.get("metadata", {})
        heading = meta.get("heading", "Unknown")
        source = meta.get("source", "Unknown")
        text = record.get("text") or record.get("page_content", "")
        text = text[:max_chars]
        return f"[Section: {heading} | Source: {source}]\n{text}"

    return "\n\n".join(_block(record) for record in selected)


In [13]:
from textwrap import dedent
import time

TEMPERATURE = 0.1  # sampling temperature; ignored while generation runs with do_sample=False
MAX_NEW_TOKENS = 128  # raise to 256–384 if stable

def build_prompt(context: str, question: str) -> str:
    """Build the grounded-answer prompt for the LLM.

    Args:
        context: Retrieved passages, e.g. the output of ``render_context``.
        question: The user's question.

    Returns:
        The instruction block, the context, the question and a trailing
        ``Answer:`` cue, with no leading indentation on the template lines.
    """
    # Dedent the static template BEFORE substituting values. Dedenting after
    # interpolation breaks as soon as `context` spans several lines: its
    # unindented lines reduce the common margin to zero, so every template
    # line would keep its 4-space source indentation in the final prompt.
    template = dedent("""\
    You are a medical information assistant. Answer the question ONLY using the context below.
    If the answer is not present, say: "This information is not available in the provided documents."
    Do not provide diagnosis or treatment advice. Be concise and include sources.

    Context:
    {context}

    Question: {question}

    Answer:
    """)
    # str.format does not re-scan substituted values, so braces inside the
    # context or question are inserted literally.
    return template.format(context=context, question=question)

def generate_answer(question: str):
    """Answer ``question`` from the top-``TOP_K`` retrieved chunks.

    Args:
        question: The user's question.

    Returns:
        Dict with keys:
          - ``"answer"``: generated text, or a fixed "not available" message
            when nothing was retrieved;
          - ``"sources"``: one ``{"heading", "source"}`` dict per chunk used;
          - ``"used_k"``: number of chunks placed in the prompt;
          - ``"timings"``: ``{"retrieval_s", "gen_s"}`` wall-clock seconds.
    """
    t0 = time.time()
    docs = search_topk(question, TOP_K)
    t1 = time.time()
    if not docs:
        return {"answer": "This information is not available in the provided documents.",
                "sources": [], "used_k": 0, "timings": {"retrieval_s": t1 - t0, "gen_s": 0.0}}

    ctx = render_context(docs)
    prompt = build_prompt(ctx, question)

    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in tokenizer(prompt, return_tensors="pt").items()}
    prompt_len = inputs["input_ids"].shape[1]

    t2 = time.time()
    # Greedy decoding: `temperature` is not passed because it is ignored (and
    # only produces a warning) when do_sample=False.
    gen = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    t3 = time.time()

    # The returned sequence is prompt + continuation. Decode only the newly
    # generated tokens rather than splitting the full text on "Answer:",
    # which cuts in the wrong place whenever a retrieved passage or the
    # question itself contains that marker.
    ans = tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True).strip()

    sources = [{"heading": d.get("metadata", {}).get("heading", "Unknown"),
                "source": d.get("metadata", {}).get("source", "Unknown")} for d in docs]
    return {"answer": ans, "sources": sources, "used_k": len(docs), "timings": {"retrieval_s": t1 - t0, "gen_s": t3 - t2}}


In [15]:

# Diagnostic cell: inspect the chunk schema, then exercise search_topk(),
# render_context() and generate_answer() on one sample question.
# NOTE: everything here runs at top level, so the loop variables (key, value,
# k, v, i, doc, src, ...) and the names docs, context and result remain
# defined in the kernel after the cell finishes.
print("\n" + "="*100)
print("CHUNK STRUCTURE ANALYSIS")
print("="*100)

print(f"\nTotal chunks: {len(chunks)}")
print(f"\nFirst chunk:")
print(f"  Keys: {chunks[0].keys()}")
print(f"\n  Full first chunk:")
# Print every field of the first chunk; nested dicts are expanded one level
# and string values longer than 50 characters are shortened to a preview.
for key, value in chunks[0].items():
    if isinstance(value, dict):
        print(f"    {key}:")
        for k, v in value.items():
            if isinstance(v, str) and len(v) > 50:
                print(f"      {k}: {v[:50]}... (len: {len(v)})")
            else:
                print(f"      {k}: {v}")
    else:
        if isinstance(value, str) and len(value) > 50:
            print(f"    {key}: {value[:50]}... (len: {len(value)})")
        else:
            print(f"    {key}: {value}")

# ===== NOW TEST SIMPLE RETRIEVER =====
print("\n" + "="*100)
print("TEST: Simple search_topk()")
print("="*100)

test_question = "How is hepatic fibrosis assessed?"

print(f"\nQuestion: {test_question}")
print(f"Searching with TOP_K={TOP_K}...")

docs = search_topk(test_question, k=TOP_K)

print(f"\nRetrieved {len(docs)} documents")

# Summarise each hit: heading, source, raw FAISS score and a text preview.
for i, doc in enumerate(docs, 1):
    heading = doc.get("metadata", {}).get("heading", "Unknown")
    source = doc.get("metadata", {}).get("source", "Unknown")
    score = doc.get("_score", 0)
    # Same text lookup as render_context(): "text" first, then "page_content".
    text_field = doc.get("text") or doc.get("page_content", "")
    text_len = len(text_field) if text_field else 0

    print(f"\n{i}. {heading}")
    print(f"   Source: {source}")
    print(f"   Score: {score:.4f}")
    print(f"   Text length: {text_len} chars")
    if text_field:
        print(f"   Text preview: {text_field[:100]}...")
    else:
        print(f"   NO TEXT CONTENT - Only has metadata")

# ===== TEST render_context =====
print("\n" + "="*100)
print("TEST: render_context()")
print("="*100)

context = render_context(docs)
print(f"\nRendered context length: {len(context)} chars")
print(f"\nContext preview:")
print(context[:300])

# ===== TEST generate_answer =====
print("\n" + "="*100)
print("TEST: generate_answer()")
print("="*100)

# generate_answer() performs its own retrieval; the docs fetched above are
# not passed in.
result = generate_answer(test_question)

print(f"\nGenerated answer")
print(f"  Used K: {result['used_k']}")
print(f"  Retrieval time: {result['timings']['retrieval_s']:.3f}s")
print(f"  Generation time: {result['timings']['gen_s']:.3f}s")
print(f"\n  Answer:")
print(f"  {result['answer']}")
print(f"\n  Sources:")
for src in result['sources']:
    print(f"    - {src['heading']} ({src['source']})")

print("\n" + "="*100)


CHUNK STRUCTURE ANALYSIS

Total chunks: 12788

First chunk:
  Keys: dict_keys(['index', 'text', 'metadata'])

  Full first chunk:
    index: 0
    text: Chronic liver disease (CLD) leads to liver fibrosi... (len: 422)
    metadata:
      file_id: 0550de0db30051568c6588fc5982cc974167d2a7209d0e68a4... (len: 64)
      page_title: AASLD Practice Guideline on blood-based noninvasiv... (len: 110)
      page_url: https://journals.lww.com/hep/citation/9900/aasld_p... (len: 91)
      source: 0550de0db30051568c6588fc5982cc974167d2a7209d0e68a4... (len: 77)
      section_index: 2
      heading: PURPOSE AND SCOPE
      level: 2
      file_path: cleaned_data/cleaned_data/0550de0db30051568c6588fc... (len: 103)
      char_count: 10594

TEST: Simple search_topk()

Question: How is hepatic fibrosis assessed?
Searching with TOP_K=3...

Retrieved 3 documents

1. Prognosis.
   Source: dedbaae96e4582222667a55c1a3e63917ef3d786e15fc348409c48687724fd08_cleaned.json
   Score: 0.7280
   Text length: 337 chars
 

In [18]:
# ============================================================================
# CELL: Multi-Query v2 — Generate independent questions + retrieve + answer
# ============================================================================

from textwrap import dedent
from typing import List, Dict, Tuple
import numpy as np

def generate_independent_questions(user_question: str, n: int = 3) -> List[str]:
    """
    Ask the LLM for up to ``n`` standalone questions related to
    ``user_question`` (sub-questions, not paraphrases), for use as extra
    retrieval queries.

    Args:
        user_question: The user's original question.
        n: Maximum number of related questions to return.

    Returns:
        Up to ``n`` unique question strings in the order the model produced
        them. List markers (bullets, dashes, "1." / "2)") are removed, lines
        of 10 characters or fewer are discarded, and the original question is
        never returned. May hold fewer than ``n`` items, or be empty.

    Note:
        Generation is sampled (``do_sample=True``), so results differ from
        call to call.
    """
    import re  # local import: only needed to strip list markers below

    if n <= 0:
        return []

    # Dedent the static template before substituting the question, so that a
    # multi-line question cannot defeat the dedent and leave the template
    # lines indented.
    template = dedent("""\
    Given this medical question about AASLD liver disease guidelines:
    "{user_question}"

    What are {n} other related questions that would help fully answer the above?
    Generate standalone, independent questions (not paraphrases).
    Do NOT include the original question.
    List one per line, no numbering, no explanations.

    Related questions:
    """)
    prompt = template.format(user_question=user_question, n=n)

    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in tokenizer(prompt, return_tensors="pt").items()}
    prompt_len = inputs["input_ids"].shape[1]

    out_ids = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.8,      # higher for diversity
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the continuation. Splitting the full decoded text on
    # "Related questions:" would depend on that marker surviving decoding and
    # not appearing inside the user's question.
    completion = tokenizer.decode(out_ids[0][prompt_len:], skip_special_tokens=True)

    # Leading list markers: bullets/dashes/asterisks, or numbering such as
    # "1." / "2)" followed by whitespace ("5-year ..." is left intact).
    marker = re.compile(r"^(?:[•\-*\s]+|\d+[.)]\s+)+")
    original = user_question.strip().casefold()

    result: List[str] = []
    for raw in completion.splitlines():
        ln = marker.sub("", raw.strip()).strip("•- ").strip()
        if len(ln) <= 10:             # filter out garbage
            continue
        if ln.casefold() == original:  # the model echoed the original question
            continue
        if ln in result:
            continue
        result.append(ln)
        if len(result) >= n:
            break

    return result


def search_topk_for_text(qtext: str, k: int) -> Tuple[np.ndarray, np.ndarray]:
    """Run a ``k``-nearest-neighbour FAISS search for one text query.

    Args:
        qtext: Query text, embedded with ``embed_query``.
        k: Number of neighbours requested.

    Returns:
        ``(scores, ids)``: the first (and only) row of each array returned by
        ``index.search``.
    """
    scores, ids = index.search(embed_query(qtext), k)
    return scores[0], ids[0]


def multiquery_retrieve_v2(question: str, k_per_query: int = 3, n_related: int = 3, max_total: int = 8) -> List[Dict]:
    """
    Retrieve chunks for the original question plus LLM-generated related
    questions, then merge the hits into one ranked list.

    Args:
        question: The user's question; always searched first.
        k_per_query: Neighbours requested from FAISS for each query.
        n_related: Number of related questions to ask the LLM for.
        max_total: Maximum number of chunks returned after merging.

    Returns:
        Copies of the selected chunk dicts, sorted by score in descending
        order, each extended with ``"_score"`` and ``"_idx"`` (its position in
        ``chunks``). A chunk hit by several queries keeps the score of the
        first query that found it. Chunks lacking a ``"text"`` key get their
        metadata heading as text.
    """
    related = generate_independent_questions(question, n=n_related)
    print(f"[DEBUG] Generated {len(related)} related questions:")
    for rq in related:
        print(f"  - {rq}")

    # chunk id -> score of its first hit. Dicts preserve insertion order, so
    # ties in the stable sort below resolve in discovery order.
    first_scores: Dict[int, float] = {}
    for query_text in [question, *related]:
        scores, ids = search_topk_for_text(query_text, k_per_query)
        for dist, raw_id in zip(scores, ids):
            chunk_id = int(raw_id)
            # Skip out-of-range ids (including negative ones) and repeats.
            if chunk_id < 0 or chunk_id >= len(chunks) or chunk_id in first_scores:
                continue
            first_scores[chunk_id] = float(dist)

    # Higher score is treated as better.
    ranked = sorted(first_scores.items(), key=lambda pair: pair[1], reverse=True)

    selected = []
    for chunk_id, score in ranked[:max_total]:
        item = dict(chunks[chunk_id])  # copy: never mutate the shared chunk
        item["_score"] = score
        item["_idx"] = chunk_id

        if "text" not in item:
            # No body text stored: fall back to the section heading.
            item["text"] = item.get("metadata", {}).get("heading", "")

        selected.append(item)

    return selected


def generate_answer_multiquery_v2(question: str, k_per_query: int = 3, n_related: int = 3, max_total: int = 8):
    """
    Multi-Query v2: generate related questions, retrieve for all of them,
    then answer the original question from the merged context.

    Args:
        question: The user's question.
        k_per_query: Neighbours retrieved per query.
        n_related: Number of related questions to generate.
        max_total: Maximum number of merged chunks placed in the prompt.

    Returns:
        Dict with keys:
          - ``"answer"``: generated text, or a fixed "not available" message
            when nothing was retrieved;
          - ``"sources"``: per chunk, ``heading``, ``source``, ``text``,
            ``metadata``, ``_score`` and ``_idx``;
          - ``"used_k"``: number of chunks used;
          - ``"timings"``: ``{"retrieval_s", "gen_s"}`` wall-clock seconds.
            ``retrieval_s`` includes the related-question generation.
    """
    import time

    t0 = time.time()
    docs = multiquery_retrieve_v2(question, k_per_query=k_per_query, n_related=n_related, max_total=max_total)
    t1 = time.time()

    if not docs:
        return {
            "answer": "This information is not available in the provided documents.",
            "sources": [],
            "used_k": 0,
            "timings": {"retrieval_s": t1 - t0, "gen_s": 0.0}
        }

    ctx = render_context(docs, max_chars=1000)
    prompt = build_prompt(ctx, question)

    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in tokenizer(prompt, return_tensors="pt").items()}
    prompt_len = inputs["input_ids"].shape[1]

    t2 = time.time()
    # Greedy decoding: `temperature` is not passed because it is ignored (and
    # only produces a warning) when do_sample=False.
    out_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    t3 = time.time()

    # Decode only the continuation. Splitting prompt + continuation on the
    # first "Answer:" cuts in the wrong place whenever a retrieved passage or
    # the question contains that marker.
    ans = tokenizer.decode(out_ids[0][prompt_len:], skip_special_tokens=True).strip()

    sources = [
        {
            "heading": d.get("metadata", {}).get("heading", "Unknown"),
            "source": d.get("metadata", {}).get("source", "Unknown"),
            "text": d.get("text", ""),
            "metadata": d.get("metadata", {}),
            "_score": d.get("_score", 0),
            "_idx": d.get("_idx", -1)
        }
        for d in docs
    ]

    return {
        "answer": ans,
        "sources": sources,
        "used_k": len(docs),
        "timings": {"retrieval_s": t1 - t0, "gen_s": t3 - t2}
    }


In [22]:

# Manual check of generate_answer_multiquery_v2(): run one question with a
# reduced budget (2 hits per query, 2 related questions, at most 5 chunks) and
# confirm that every returned source carries usable text.
test_q = "How is hepatic fibrosis assessed?"

print("\n" + "="*100)
print("TEST: generate_answer_multiquery_v2")
print("="*100)

result = generate_answer_multiquery_v2(test_q, k_per_query=2, n_related=2, max_total=5)

print(f"\nAnswer generated")
print(f"  Used K: {result['used_k']}")
print(f"  Retrieval time: {result['timings']['retrieval_s']:.3f}s")
print(f"  Generation time: {result['timings']['gen_s']:.3f}s")

print(f"\nSources returned: {len(result['sources'])}")

for i, src in enumerate(result['sources'], 1):
    heading = src.get("heading", "Unknown")
    text_len = len(src.get("text", ""))
    score = src.get("_score", 0)

    print(f"\n{i}. {heading}")
    print(f"   ├─ Text length: {text_len} chars")
    print(f"   ├─ Score: {score:.4f}")

    # Heuristic: more than 100 characters counts as real chunk text; anything
    # shorter is reported as too short or missing.
    if text_len > 100:
        print(f"   ├─ Text preview: {src['text'][:100]}...")
        print(f"   └─ GOOD - Text field present")
    else:
        print(f"   └─ WARNING - Text too short or missing")

print(f"\nAnswer preview:")
print(f"  {result['answer'][:200]}...")

print("="*100)



TEST: generate_answer_multiquery_v2
[DEBUG] Generated 2 related questions:
  - What is the purpose of the FIB-4 index?
  - What is the role of transient elastography in liver disease assessment?

Answer generated
  Used K: 5
  Retrieval time: 2.344s
  Generation time: 25.679s

Sources returned: 5

1. Prognosis.
   ├─ Text length: 337 chars
   ├─ Score: 0.7280
   ├─ Text preview: A further important use of liver biopsy is in assessing disease severity, notably fibrosis, which, a...
   └─ GOOD - Text field present

2. US-based elastography
   ├─ Text length: 426 chars
   ├─ Score: 0.7150
   ├─ Text preview: estimation of fibrosis Hepatic infiltration47LSMAmyloid or tumoral infiltration results in increased...
   └─ GOOD - Text field present

3. Abstract
   ├─ Text length: 442 chars
   ├─ Score: 0.6830
   ├─ Text preview: Background and Aims:Transient elastography (TE), shear wave elastography, and/or magnetic resonance ...
   └─ GOOD - Text field present

4. Background and Aims:
   ├─ T

In [25]:
# ============================================================================
# CELL: Re-Ranker — Cross-Encoder re-scoring of retrieved chunks
# ============================================================================

# Install cross-encoder model (lightweight, runs fast on CPU or GPU)
!pip -q install sentence-transformers

from sentence_transformers import CrossEncoder
import torch
from typing import List, Dict
import time

# Load the cross-encoder that rerank_docs() uses to score (question, chunk)
# pairs. The author chose "cross-encoder/ms-marco-MiniLM-L-6-v2" as a
# lightweight model for query-document relevance ranking.
# NOTE(review): max_length=512 presumably caps the tokenized length of each
# pair -- confirm against the sentence-transformers CrossEncoder docs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", max_length=512)
print("Re-Ranker model loaded.")


def rerank_docs(question: str, docs: List[Dict], top_k: int = 5) -> List[Dict]:
    """
    Re-score retrieved chunks against the question with the cross-encoder and
    keep the best ``top_k``.

    Args:
        question: The user's question.
        docs: Retrieved chunk dicts. The text scored for each chunk is its
            ``"text"``, else ``"page_content"``, else its metadata heading.
        top_k: Number of chunks to keep.

    Returns:
        Copies of the ``top_k`` highest-scoring chunks, best first, each with
        an added ``"rerank_score"`` float. Empty list for empty ``docs``.

    Note:
        ``rerank_score`` is the raw value from ``reranker.predict``. It is not
        bounded to [0, 1] (the recorded run shows scores above 4), so use it
        for ordering only.
    """
    if not docs:
        return []

    def _passage(d):
        # First non-empty of: text, page_content, metadata heading.
        return (
            d.get("text", "")
            or d.get("page_content", "")
            or d.get("metadata", {}).get("heading", "")
        )

    # One (question, passage) pair per candidate chunk.
    pairs = [[question, _passage(d)] for d in docs]
    scores = reranker.predict(pairs, show_progress_bar=False)

    # Copy each chunk with its score attached; the stable sort keeps the
    # incoming order among equal scores.
    scored_docs = sorted(
        ({**doc, "rerank_score": float(score)} for doc, score in zip(docs, scores)),
        key=lambda d: d["rerank_score"],
        reverse=True,
    )
    best = scored_docs[:top_k]

    print(f"   Re-ranked {len(docs)} docs → top {min(top_k, len(scored_docs))}")
    for i, d in enumerate(best):
        heading = d.get("metadata", {}).get("heading", "Unknown")
        score = d.get("rerank_score", 0)
        print(f"     {i+1}. {heading[:50]}... (score: {score:.3f})")

    return best

print("rerank_docs() ")


def generate_answer_with_rerank(question: str, k_per_query: int = 3, n_related: int = 3,
                                max_retrieved: int = 10, rerank_top_k: int = 5):
    """
    Full pipeline: multi-query retrieval, cross-encoder re-ranking, answer
    generation. Progress is printed for each step.

    Args:
        question: The user's question.
        k_per_query: Neighbours retrieved per query.
        n_related: Number of related questions to generate.
        max_retrieved: Maximum number of merged candidates sent to the re-ranker.
        rerank_top_k: Number of re-ranked chunks placed in the prompt.

    Returns:
        Dict with keys:
          - ``"answer"``: generated text, or a fixed "not available" message
            when retrieval or re-ranking yields nothing;
          - ``"sources"``: per chunk, ``heading``, ``source``,
            ``rerank_score``, ``text`` and ``metadata``;
          - ``"used_k"``: number of chunks used;
          - ``"timings"``: ``{"retrieval_s", "rerank_s", "gen_s"}`` seconds.
    """

    # Step 1: Multi-Query retrieval (get more candidates)
    print(f"\n[STEP 1] Multi-Query Retrieval...")
    t0 = time.time()
    docs = multiquery_retrieve_v2(question, k_per_query=k_per_query, n_related=n_related, max_total=max_retrieved)
    t_retrieve = time.time() - t0

    if not docs:
        return {
            "answer": "This information is not available in the provided documents.",
            "sources": [],
            "used_k": 0,
            "timings": {"retrieval_s": t_retrieve, "rerank_s": 0.0, "gen_s": 0.0}
        }

    print(f"  Retrieved {len(docs)} candidates in {t_retrieve:.3f}s")

    # Step 2: Re-rank to get top_k most relevant
    print(f"\n[STEP 2] Re-Ranking with Cross-Encoder...")
    t1 = time.time()
    docs_reranked = rerank_docs(question, docs, top_k=rerank_top_k)
    t_rerank = time.time() - t1

    print(f"  Re-ranking completed in {t_rerank:.3f}s")

    if not docs_reranked:
        return {
            "answer": "This information is not available in the provided documents.",
            "sources": [],
            "used_k": 0,
            "timings": {"retrieval_s": t_retrieve, "rerank_s": t_rerank, "gen_s": 0.0}
        }

    # Step 3: Build context from re-ranked docs and generate
    print(f"\n[STEP 3] Generating Answer...")
    ctx = render_context(docs_reranked, max_chars=1000)
    prompt = build_prompt(ctx, question)

    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in tokenizer(prompt, return_tensors="pt").items()}
    prompt_len = inputs["input_ids"].shape[1]

    t2 = time.time()
    # Greedy decoding: `temperature` is not passed because it is ignored (and
    # only produces a warning) when do_sample=False.
    out_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    t_gen = time.time() - t2

    print(f"  Answer generated in {t_gen:.3f}s")

    # Decode only the continuation. Splitting prompt + continuation on the
    # first "Answer:" cuts in the wrong place whenever a retrieved passage or
    # the question contains that marker.
    ans = tokenizer.decode(out_ids[0][prompt_len:], skip_special_tokens=True).strip()

    sources = [
        {
            "heading": d.get("metadata", {}).get("heading", "Unknown"),
            "source": d.get("metadata", {}).get("source", "Unknown"),
            "rerank_score": d.get("rerank_score", "N/A"),
            "text": d.get("text", ""),
            "metadata": d.get("metadata", {})
        }
        for d in docs_reranked
    ]

    return {
        "answer": ans,
        "sources": sources,
        "used_k": len(docs_reranked),
        "timings": {"retrieval_s": t_retrieve, "rerank_s": t_rerank, "gen_s": t_gen}
    }

print("generate_answer_with_rerank()")
print("\nRe-Ranker pipeline ready.")


Re-Ranker model loaded.
rerank_docs() 
generate_answer_with_rerank()

Re-Ranker pipeline ready.


In [26]:
# Manual check of the full pipeline (multi-query retrieval -> cross-encoder
# re-rank -> generation) on one sample question.
test_q = "How is hepatic fibrosis assessed?"

print("\n" + "="*100)
print("TEST: generate_answer_with_rerank()")
print("="*100)

result = generate_answer_with_rerank(test_q, k_per_query=3, n_related=3, max_retrieved=10, rerank_top_k=5)

print(f"\nAnswer generated")
print(f"Answer: {result['answer'][:200]}...")
print(f"\nSources (with rerank scores and text):")
for i, src in enumerate(result['sources'], 1):
    print(f"{i}. {src['heading']}")
    # The :.3f format requires a numeric rerank_score; the pipeline falls back
    # to the string "N/A" when a source has no score, which would raise here.
    print(f"   Rerank score: {src['rerank_score']:.3f}")
    print(f"   Text length: {len(src['text'])} chars")

print(f"\nTiming:")
print(f"  Retrieval: {result['timings']['retrieval_s']:.3f}s")
print(f"  Reranking: {result['timings']['rerank_s']:.3f}s")
print(f"  Generation: {result['timings']['gen_s']:.3f}s")

print("="*100)



TEST: generate_answer_with_rerank()

[STEP 1] Multi-Query Retrieval...
[DEBUG] Generated 3 related questions:
  - What are the staging criteria for hepatitis C virus (HCV) infection?
  - How is liver biopsy interpreted in the context of the American Association for the Study of Liver Diseases (AASLD) staging system?
  - What is the role of non-invasive assessment in the diagnosis and management of liver fibrosis?
  Retrieved 10 candidates in 5.523s

[STEP 2] Re-Ranking with Cross-Encoder...
   Re-ranked 10 docs → top 5
     1. BACKGROUND... (score: 4.176)
     2. Introduction... (score: 3.025)
     3. Prognosis.... (score: 2.496)
     4. Conclusions:... (score: 2.294)
     5. US-based elastography... (score: 2.175)
  Re-ranking completed in 0.072s

[STEP 3] Generating Answer...
  Answer generated in 21.040s

Answer generated
Answer: Hepatic fibrosis assessment is typically done using noninvasive techniques such as imaging, blood tests, or liver stiffness measurement by fibroscan. Fibr

In [27]:
# ======================================
# PRECISION@K METRIC - for Multi-Query
# ======================================
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def metric_precision_multiquery(question: str, k=5, relevance_threshold=0.65):
    """
    PRECISION@K: Out of top-K retrieved documents, how many are relevant?

    For Multi-Query Retriever ONLY
    - Gets docs directly from multiquery_retrieve_v2()
    - Uses "text" field (actual chunk content)
    - A doc counts as relevant when the cosine similarity between the
      question embedding and the doc-text embedding exceeds the threshold

    Args:
        question: Query string
        k: Number of top documents to evaluate
        relevance_threshold: Similarity > threshold = relevant (default 0.65)

    Returns:
        Dict with keys 'metric_name', 'precision_at_k', 'relevant_docs',
        'total_k', 'threshold', 'doc_headings', 'doc_sources',
        'doc_text_lengths' and 'similarities'. On failure the same keys are
        present (zero / empty values) plus 'error' (message) and 'k'.

    Note:
        The denominator is always ``k``, even when fewer than ``k`` documents
        were retrieved. Relevance is judged with the same embedder that
        drives retrieval, so this is a self-consistency score rather than a
        ground-truth precision.
    """
    metric_name = 'Precision@K (Multi-Query)'

    def _failure(message, headings=(), sources=(), text_lengths=()):
        # Mirror the success schema so callers can read 'relevant_docs',
        # 'total_k', 'similarities', ... without hitting a KeyError that would
        # mask the real error message.
        return {
            'metric_name': metric_name,
            'precision_at_k': 0.0,
            'error': message,
            'k': k,
            'relevant_docs': 0,
            'total_k': k,
            'threshold': relevance_threshold,
            'doc_headings': list(headings),
            'doc_sources': list(sources),
            'doc_text_lengths': list(text_lengths),
            'similarities': [],
        }

    docs = multiquery_retrieve_v2(question, k_per_query=3, n_related=3, max_total=k)

    if not docs:
        return _failure('No docs retrieved')

    # Extract text content and metadata from the top-k docs
    top_docs = docs[:k]
    doc_texts = [doc.get("text", "") for doc in top_docs]
    doc_headings = [doc.get("metadata", {}).get("heading", "Unknown") for doc in top_docs]
    doc_sources = [doc.get("metadata", {}).get("source", "Unknown") for doc in top_docs]

    # Verify we have text content
    text_lengths = [len(text) for text in doc_texts]
    print(f"\n   Retrieved {len(docs[:k])} documents from multi-query")
    print(f"   Text lengths: {text_lengths} chars")

    if not any(doc_texts):
        return _failure('No text content in retrieved docs',
                        doc_headings, doc_sources, text_lengths)

    # Embed question and documents
    try:
        question_embedding = embedder.embed_query(question)
        doc_embeddings = embedder.embed_documents(doc_texts)
        doc_embeddings = np.array(doc_embeddings, dtype=np.float32)

        # Calculate semantic similarity between question and doc text
        similarities = cosine_similarity([question_embedding], doc_embeddings)[0]

    except Exception as e:
        # Deliberate best-effort: report the failure in the result instead of
        # aborting an evaluation run.
        return _failure(f'Embedding error: {str(e)}',
                        doc_headings, doc_sources, text_lengths)

    # Count relevant docs (similarity > threshold)
    relevant_count = sum(1 for sim in similarities if sim > relevance_threshold)
    precision = relevant_count / k if k > 0 else 0.0

    return {
        'metric_name': metric_name,
        'precision_at_k': float(precision),
        'relevant_docs': relevant_count,
        'total_k': k,
        'threshold': relevance_threshold,
        'doc_headings': doc_headings,
        'doc_sources': doc_sources,
        'doc_text_lengths': text_lengths,
        'similarities': [float(s) for s in similarities]
    }

print("METRIC: Precision@K (Multi-Query)")


METRIC: Precision@K (Multi-Query)


In [28]:
# TEST: Precision with Multi-Query Retriever
# Runs metric_precision_multiquery() on one sample question, prints the
# per-document similarities and maps the precision onto a qualitative label.
test_q = "How is hepatic fibrosis assessed?"

print("\n" + "="*100)
print("PRECISION@K - Multi-Query Retriever (CORRECTED)")
print("="*100)
print(f"Question: {test_q}\n")

result = metric_precision_multiquery(test_q, k=5, relevance_threshold=0.65)

print(f"\n" + "-"*100)
print("RESULTS")
print("-"*100)
# The "@5" in the label is hard-coded and matches the k=5 passed above.
print(f"Precision@5: {result['precision_at_k']:.4f}")
print(f"Relevant docs: {result['relevant_docs']}/{result['total_k']}")

print(f"\n" + "-"*100)
print("DETAILED BREAKDOWN")
print("-"*100)

# One entry per evaluated document, in the order the metric returned them.
for i, (heading, text_len, sim) in enumerate(zip(
    result['doc_headings'],
    result['doc_text_lengths'],
    result['similarities']
), 1):
    is_relevant = "RELEVANT" if sim > result['threshold'] else "NOT RELEVANT"
    print(f"{i}. {heading}")
    print(f"   ├─ Text length: {text_len} chars")
    print(f"   ├─ Similarity: {sim:.4f}")
    print(f"   └─ {is_relevant}")

print(f"\n" + "-"*100)
print("ASSESSMENT")
print("-"*100)

# Qualitative bands: > 0.8 excellent, > 0.6 good, > 0.4 fair, otherwise poor.
if result['precision_at_k'] > 0.8:
    print(f"EXCELLENT - {result['precision_at_k']*100:.1f}% relevant")
elif result['precision_at_k'] > 0.6:
    print(f"GOOD - {result['precision_at_k']*100:.1f}% relevant")
elif result['precision_at_k'] > 0.4:
    print(f"FAIR - {result['precision_at_k']*100:.1f}% relevant")
else:
    print(f"POOR - {result['precision_at_k']*100:.1f}% relevant")

print("="*100 + "\n")



PRECISION@K - Multi-Query Retriever (CORRECTED)
Question: How is hepatic fibrosis assessed?

[DEBUG] Generated 3 related questions:
  - What are the limitations of transient elastography for assessing hepatic fibrosis?
  - How is the diagnosis of hepatic fibrosis confirmed?
  - What is the role of liver biopsy in the assessment of hepatic fibrosis?

   Retrieved 5 documents from multi-query
   Text lengths: [76, 150, 337, 426, 375] chars

----------------------------------------------------------------------------------------------------
RESULTS
----------------------------------------------------------------------------------------------------
Precision@5: 1.0000
Relevant docs: 5/5

----------------------------------------------------------------------------------------------------
DETAILED BREAKDOWN
----------------------------------------------------------------------------------------------------
1. Step 1: Determine the stage of fibrosis
   ├─ Text length: 76 chars
   ├─ Similari

In [29]:
def metric_precision_reranker(question: str, k=5, relevance_threshold=0.65):
    """
    PRECISION@K: Out of top-K retrieved documents, how many are relevant?

    For Re-Ranker Pipeline ONLY
    - Gets docs directly from generate_answer_with_rerank()
    - Uses "text" field (actual chunk content)
    - A doc counts as relevant when the cosine similarity between its
      embedding and the question embedding is strictly above the threshold

    The denominator is always k, so retrieving fewer than k documents lowers
    the score.

    Every return path carries the same core keys ('precision_at_k',
    'relevant_docs', 'total_k', 'threshold', the per-document lists and
    'timings'), so callers can read them without first checking for 'error'.

    Args:
        question: Query string
        k: Number of top documents to evaluate
        relevance_threshold: Similarity > threshold = relevant (default 0.65)

    Returns:
        Dict with precision score and detailed breakdown. Failure results
        additionally contain 'error' (message) and 'k', report a precision of
        0.0 and have empty per-document lists.
    """

    def _failure(message, timings=None):
        # Mirror the success schema so downstream reporting code that indexes
        # these keys does not raise KeyError when the metric could not run.
        return {
            'metric_name': 'Precision@K (Re-Ranker)',
            'precision_at_k': 0.0,
            'relevant_docs': 0,
            'total_k': k,
            'threshold': relevance_threshold,
            'doc_headings': [],
            'doc_sources': [],
            'doc_text_lengths': [],
            'rerank_scores': [],
            'semantic_similarities': [],
            'timings': timings or {},
            'error': message,
            'k': k
        }

    def _as_float(score):
        # Rerank scores are formatted with ':.3f' by the reporting cell; a
        # missing or non-numeric score becomes NaN so formatting still works.
        try:
            return float(score)
        except (TypeError, ValueError):
            return float('nan')

    # ===== Get docs from re-ranker pipeline =====
    result = generate_answer_with_rerank(question, k_per_query=3, n_related=3,
                                         max_retrieved=10, rerank_top_k=k)

    sources = result.get("sources", [])
    timings = result.get("timings", {})

    if not sources:
        return _failure('No docs retrieved', timings)

    # Extract text content and metadata from the top-k sources
    top_sources = sources[:k]
    doc_texts = [src.get("text", "") for src in top_sources]
    doc_headings = [src.get("heading", "Unknown") for src in top_sources]
    doc_sources = [src.get("source", "Unknown") for src in top_sources]
    rerank_scores = [_as_float(src.get("rerank_score")) for src in top_sources]

    # Verify we have text content
    text_lengths = [len(text) for text in doc_texts]
    print(f"\n   Retrieved {len(top_sources)} re-ranked documents")
    print(f"   Text lengths: {text_lengths} chars")

    if not any(doc_texts):
        return _failure('No text content in retrieved docs', timings)

    # Embed question and documents
    try:
        question_embedding = embedder.embed_query(question)
        doc_embeddings = embedder.embed_documents(doc_texts)
        doc_embeddings = np.array(doc_embeddings, dtype=np.float32)

        # Calculate semantic similarity (one row: question vs. each doc)
        similarities = cosine_similarity([question_embedding], doc_embeddings)[0]

    except Exception as e:
        # Best-effort metric: report the failure instead of aborting the run.
        return _failure(f'Embedding error: {str(e)}', timings)

    # Count relevant docs
    relevant_count = sum(1 for sim in similarities if sim > relevance_threshold)
    precision = relevant_count / k if k > 0 else 0.0

    return {
        'metric_name': 'Precision@K (Re-Ranker)',
        'precision_at_k': float(precision),
        'relevant_docs': relevant_count,
        'total_k': k,
        'threshold': relevance_threshold,
        'doc_headings': doc_headings,
        'doc_sources': doc_sources,
        'doc_text_lengths': text_lengths,
        'rerank_scores': rerank_scores,
        'semantic_similarities': [float(s) for s in similarities],
        'timings': timings
    }

print("METRIC: Precision@K (Re-Ranker) - READY")

METRIC: Precision@K (Re-Ranker) - READY


In [30]:
# TEST: Precision with Re-Ranker
# Runs the re-ranker precision metric on one question and prints the score,
# a per-document breakdown, pipeline timings and a qualitative assessment.
test_q = "How is hepatic fibrosis assessed?"

print("\n" + "="*100)
print("PRECISION@K - Re-Ranker Version")
print("="*100)
print(f"Question: {test_q}\n")

result = metric_precision_reranker(test_q, k=5, relevance_threshold=0.65)

if 'error' in result:
    # Failure results are not guaranteed to carry the per-document fields
    # read below, so report the error instead of indexing into them.
    print(f"Error: {result['error']}")
else:
    print("\n" + "-"*100)
    print("RESULTS")
    print("-"*100)
    print(f"Precision@5: {result['precision_at_k']:.4f}")
    print(f"Relevant docs: {result['relevant_docs']}/{result['total_k']}")

    print("\n" + "-"*100)
    print("DETAILED BREAKDOWN (With Rerank Scores)")
    print("-"*100)

    for i, (heading, rerank_score, semantic_sim, text_len) in enumerate(zip(
        result['doc_headings'],
        result['rerank_scores'],
        result['semantic_similarities'],
        result['doc_text_lengths']
    ), 1):
        is_relevant = "RELEVANT" if semantic_sim > result['threshold'] else "NOT RELEVANT"

        # The metric may hand back a non-numeric placeholder when a source has
        # no rerank score; fall back to its string form rather than crashing.
        try:
            rerank_label = f"{float(rerank_score):.3f}"
        except (TypeError, ValueError):
            rerank_label = str(rerank_score)

        print(f"\n{i}. {heading}")
        print(f"   ├─ Rerank score: {rerank_label} (Cross-Encoder)")
        print(f"   ├─ Semantic similarity: {semantic_sim:.4f} (Question-Doc)")
        print(f"   ├─ Text length: {text_len} chars")
        print(f"   └─ {is_relevant}")

    print("\n" + "-"*100)
    print("TIMING BREAKDOWN")
    print("-"*100)
    print(f"Retrieval: {result['timings'].get('retrieval_s', 0):.3f}s")
    print(f"Reranking: {result['timings'].get('rerank_s', 0):.3f}s")
    print(f"Generation: {result['timings'].get('gen_s', 0):.3f}s")
    total_time = (result['timings'].get('retrieval_s', 0) +
                  result['timings'].get('rerank_s', 0) +
                  result['timings'].get('gen_s', 0))
    print(f"Total: {total_time:.3f}s")

    print("\n" + "-"*100)
    print("ASSESSMENT")
    print("-"*100)

    if result['precision_at_k'] > 0.8:
        print(f"EXCELLENT - {result['precision_at_k']*100:.1f}% of docs are relevant")
    elif result['precision_at_k'] > 0.6:
        print(f"GOOD - {result['precision_at_k']*100:.1f}% of docs are relevant")
    elif result['precision_at_k'] > 0.4:
        print(f"FAIR - {result['precision_at_k']*100:.1f}% of docs are relevant")
    else:
        print(f"POOR - {result['precision_at_k']*100:.1f}% of docs are relevant")

print("="*100 + "\n")


PRECISION@K - Re-Ranker Version
Question: How is hepatic fibrosis assessed?


[STEP 1] Multi-Query Retrieval...
[DEBUG] Generated 3 related questions:
  - How is the diagnosis of cirrhosis determined?
  - What is the role of elastography in liver disease management?
  - What are the key factors influencing the development and progression of hepatic fibrosis?
  Retrieved 10 candidates in 2.032s

[STEP 2] Re-Ranking with Cross-Encoder...
   Re-ranked 10 docs → top 5
     1. Prognosis.... (score: 2.496)
     2. Conclusions:... (score: 2.294)
     3. US-based elastography... (score: 2.175)
     4. Background and Aims:... (score: 0.145)
     5. Conclusions:... (score: -0.127)
  Re-ranking completed in 0.021s

[STEP 3] Generating Answer...
  Answer generated in 7.623s

   Retrieved 5 re-ranked documents
   Text lengths: [337, 375, 426, 423, 379] chars

----------------------------------------------------------------------------------------------------
RESULTS
-------------------------------

In [31]:
# ======================================
# METRIC 2 (MULTIQUERY): RECALL@K
# ======================================
def metric_2_recall_multiquery(question: str, all_chunks: list, k=5, relevance_threshold=0.65):
    """
    RECALL@K: Out of ALL relevant docs in database, how many did we find in top-K?
    Uses multiquery_retrieve_v2 (WITHOUT re-ranker)

    A chunk counts as relevant when the cosine similarity between its
    embedding and the question embedding is strictly above
    relevance_threshold. Every chunk in all_chunks is embedded on each call,
    so the cost grows with the size of the corpus.

    Args:
        question: Query string
        all_chunks: All chunk dicts in the database; text is read from the
            'text', 'page_content' or 'content' key (first one present)
        k: Number of top retrieved documents to evaluate
        relevance_threshold: Similarity > threshold = relevant (default 0.65)

    Returns:
        Dict with 'recall_at_k', 'found', 'total_in_db', 'missed', 'k',
        'threshold' and 'retrieved_k' on every path. Failures add 'error';
        the "nothing relevant in the database" case adds 'note' instead.
    """

    def _chunk_text(chunk):
        # Chunks and retrieved docs may store their body under different keys.
        return chunk.get('text', chunk.get('page_content', chunk.get('content', '')))

    def _zero_recall(total_in_db=0, retrieved_k=0, **extra):
        # Shared shape for every non-success result so callers can always read
        # 'k', 'threshold' and 'retrieved_k' without a KeyError.
        payload = {
            'recall_at_k': 0.0,
            'found': 0,
            'total_in_db': int(total_in_db),
            'missed': int(total_in_db),
            'k': k,
            'threshold': relevance_threshold,
            'retrieved_k': retrieved_k
        }
        payload.update(extra)
        return payload

    # Validate the corpus first: nothing can be measured without it, so skip
    # the retrieval call entirely.
    if not all_chunks:
        return _zero_recall(error='all_chunks is empty')

    # Get docs from multiquery retriever
    retrieved_docs = multiquery_retrieve_v2(question, k_per_query=3, n_related=3, max_total=k)

    if not retrieved_docs:
        return _zero_recall(error='No docs retrieved')

    top_k_docs = retrieved_docs[:k]
    retrieved_k = len(top_k_docs)

    try:
        question_embedding = embedder.embed_query(question)
    except Exception as e:
        return _zero_recall(retrieved_k=retrieved_k, error=str(e))

    # ===== Find ALL relevant docs in ENTIRE database =====
    print(f"   Scanning all {len(all_chunks)} chunks...")

    all_doc_texts = [_chunk_text(chunk) for chunk in all_chunks]

    try:
        all_doc_embeddings = embedder.embed_documents(all_doc_texts)
        all_doc_embeddings = np.array(all_doc_embeddings, dtype=np.float32)
        all_similarities = cosine_similarity([question_embedding], all_doc_embeddings)[0]

        # Count total relevant in entire database
        total_relevant_in_db = sum(1 for sim in all_similarities if sim > relevance_threshold)

    except Exception as e:
        return _zero_recall(retrieved_k=retrieved_k, error=f'Embedding error: {str(e)}')

    if total_relevant_in_db == 0:
        # Recall is undefined without any relevant doc; report 0.0 with a note.
        return _zero_recall(retrieved_k=retrieved_k,
                            note='No relevant docs in database at threshold')

    # ===== Count relevant docs in TOP-K retrieved =====
    top_k_texts = [_chunk_text(doc) for doc in top_k_docs]

    try:
        top_k_embeddings = embedder.embed_documents(top_k_texts)
        top_k_embeddings = np.array(top_k_embeddings, dtype=np.float32)
        top_k_similarities = cosine_similarity([question_embedding], top_k_embeddings)[0]

        relevant_in_top_k = sum(1 for sim in top_k_similarities if sim > relevance_threshold)

    except Exception as e:
        return _zero_recall(total_in_db=total_relevant_in_db, retrieved_k=retrieved_k,
                            error=f'Top-K embedding error: {str(e)}')

    # RECALL = found / total_relevant (total_relevant_in_db is > 0 here)
    recall = relevant_in_top_k / total_relevant_in_db

    return {
        'recall_at_k': float(recall),
        'found': int(relevant_in_top_k),
        'total_in_db': int(total_relevant_in_db),
        'missed': int(total_relevant_in_db - relevant_in_top_k),
        'k': k,
        'threshold': relevance_threshold,
        'retrieved_k': retrieved_k
    }

print("METRIC 2 (RECALL@K - MultiQuery) loaded")


METRIC 2 (RECALL@K - MultiQuery) loaded


In [32]:
# TEST: Recall with Multi-Query Retriever
# Runs the recall metric against the full chunk list and prints a summary.
test_q = "How is hepatic fibrosis assessed according to AASLD?"
r_mq = metric_2_recall_multiquery(test_q, chunks, k=5, relevance_threshold=0.65)

print("\n" + "="*70)
print("RECALL@5 (MULTI-QUERY)")
print("="*70)

# Check for errors first
if 'error' in r_mq:
    print(f"Error: {r_mq['error']}")
elif 'note' in r_mq:
    # The metric returns a 'note' (not an 'error') when no chunk in the
    # database clears the threshold; that result may lack the detail keys
    # printed in the success branch, so handle it separately.
    print(f"Recall@5:       {r_mq['recall_at_k']:.4f}")
    print(f"   Note: {r_mq['note']}")
else:
    print(f"Recall@5:       {r_mq['recall_at_k']:.4f}")
    print(f"Found:          {r_mq['found']} / {r_mq['total_in_db']} relevant docs")
    print(f"Missed:         {r_mq['missed']} docs")
    print(f"Retrieved:      {r_mq['retrieved_k']} docs (k={r_mq['k']})")
    print(f"Threshold:      {r_mq['threshold']}")

    if r_mq['recall_at_k'] > 0.6:
        print("Status:         GOOD")
    elif r_mq['recall_at_k'] > 0.4:
        print("Status:         FAIR")
    elif r_mq['recall_at_k'] > 0.0:
        print("Status:         LOW")
    else:
        print("Status:         ZERO (no relevant docs found in top-k)")

print("="*70)


[DEBUG] Generated 3 related questions:
  - How are non-alcoholic steatohepatitis (NASH) and alcoholic liver disease (ALD) distinguished in clinical practice?
  - What are the diagnostic criteria for the assessment of fibrosis and cirrhosis in chronic liver disease?
  - How is the diagnosis of fibrosis and cirrhosis determined in patients with liver disease caused by viral hepatitis?
   Scanning all 12788 chunks...

RECALL@5 (MULTI-QUERY)
Recall@5:       0.1290
Found:          4 / 31 relevant docs
Missed:         27 docs
Retrieved:      5 docs (k=5)
Threshold:      0.65
Status:         LOW


In [33]:
# ======================================
# METRIC 3: MRR@K (MEAN RECIPROCAL RANK) - RERANKER
# ======================================
def metric_3_mrr_at_k_reranker(question: str, k=5, relevance_threshold=0.65):
    """
    MRR@K (reciprocal rank) for the re-ranker pipeline.

    Fetches sources via generate_answer_with_rerank, embeds the question and
    the top-k source texts, and returns 1 / rank of the first source whose
    cosine similarity to the question is strictly above relevance_threshold.
    The result is 0.0 when no source qualifies or when a step fails.

    Returns:
        Dict with 'mrr_at_k', 'rank_of_first', 'first_sim', 'k', 'threshold'
        and either a 'note' (hit / no hit) or an 'error' message.
    """

    def _empty(label, message):
        # Shared zero-score payload; `label` is 'error' or 'note'.
        return {
            'mrr_at_k': 0.0,
            'rank_of_first': None,
            'first_sim': None,
            label: message,
            'k': k,
            'threshold': relevance_threshold
        }

    # Run the re-ranked pipeline and take its sources as the ranking
    pipeline_out = generate_answer_with_rerank(
        question, k_per_query=3, n_related=3, max_retrieved=10, rerank_top_k=k
    )
    ranked = pipeline_out.get("sources", [])
    if not ranked:
        return _empty('error', 'No docs retrieved')

    try:
        query_vec = embedder.embed_query(question)
    except Exception as e:
        return _empty('error', f'Embedding error: {str(e)}')

    # Text to score: chunk text, else heading, else page_content
    passages = [
        entry.get('text', entry.get('heading', entry.get('page_content', '')))
        for entry in ranked[:k]
    ]

    try:
        passage_vecs = np.array(embedder.embed_documents(passages), dtype=np.float32)
        similarities = cosine_similarity([query_vec], passage_vecs)[0]
    except Exception as e:
        return _empty('error', f'Similarity computation error: {str(e)}')

    # First position (1-based) whose similarity clears the threshold, if any
    first_hit = next(
        ((position, score)
         for position, score in enumerate(similarities, start=1)
         if score > relevance_threshold),
        None,
    )
    if first_hit is None:
        return _empty('note', 'No relevant doc in top-K')

    position, score = first_hit
    return {
        'mrr_at_k': float(1.0 / position),
        'rank_of_first': int(position),
        'first_sim': float(score),
        'k': k,
        'threshold': relevance_threshold,
        'note': 'Relevant doc found'
    }

print("METRIC 3 (MRR@K - ReRanker) loaded")


METRIC 3 (MRR@K - ReRanker) loaded


In [34]:
# TEST: MRR with Re-Ranker
# Prints the reciprocal-rank result for one question, or the error message.
test_q = "How is hepatic fibrosis assessed according to AASLD?"
m_rr = metric_3_mrr_at_k_reranker(test_q, k=5, relevance_threshold=0.65)

print("\n" + "="*70)
print("MRR@5 (RE-RANKER)")
print("="*70)

if 'error' in m_rr:
    print(f"Error: {m_rr['error']}")
elif m_rr['rank_of_first'] is None:
    # Metric ran but nothing in the top-K cleared the threshold
    print(f"MRR@5:                  {m_rr['mrr_at_k']:.4f}")
    print("First relevant doc:     NOT FOUND")
    print(f"Status:                 {m_rr.get('note', 'No relevant docs in top-K')}")
else:
    print(f"MRR@5:                  {m_rr['mrr_at_k']:.4f}")
    print(f"First relevant doc at:  Rank {m_rr['rank_of_first']}")
    print(f"Similarity score:       {m_rr['first_sim']:.4f}")
    print(f"Threshold:              {m_rr['threshold']}")

    # Verdict depends only on where the first relevant doc landed
    print(
        "Status:                 PERFECT (relevant at rank 1)" if m_rr['rank_of_first'] == 1
        else "Status:                 GOOD (relevant in top-3)" if m_rr['rank_of_first'] <= 3
        else "Status:                 FAIR (relevant but not top-3)" if m_rr['rank_of_first'] <= 5
        else "Status:                 LOW"
    )

print("="*70)



[STEP 1] Multi-Query Retrieval...
[DEBUG] Generated 3 related questions:
  - How do AASLD fibrosis scores incorporate histological features?
  - What is the role of transient elastography in assessing hepatic fibrosis?
  - How do AASLD fibrosis scores account for non-alcoholic steatohepatitis (NASH) and its complications?
  Retrieved 10 candidates in 2.777s

[STEP 2] Re-Ranking with Cross-Encoder...
   Re-ranked 10 docs → top 5
     1. Supplemental Digital Content... (score: 5.610)
     2. INTRODUCTION... (score: 4.743)
     3. Conclusions:... (score: 3.750)
     4. BACKGROUND... (score: 1.724)
     5. Conclusions:... (score: -3.631)
  Re-ranking completed in 0.021s

[STEP 3] Generating Answer...
  Answer generated in 4.896s

MRR@5 (RE-RANKER)
MRR@5:                  1.0000
First relevant doc at:  Rank 1
Similarity score:       0.7162
Threshold:              0.65
Status:                 PERFECT (relevant at rank 1)


In [35]:
# ======================================
# METRIC 4: NDCG@K - MULTIQUERY
# ======================================
def metric_4_ndcg_multiquery(question: str, k=5):
    """
    NDCG@K for the multi-query retriever (no re-ranker).

    Each of the top-k docs from multiquery_retrieve_v2 is given a gain equal
    to the cosine similarity between its embedding and the question
    embedding. DCG discounts gains by log2(rank + 1); the ideal DCG uses the
    same gains sorted in descending order, so the score reflects only how
    well the retrieved docs are ordered among themselves.

    Returns:
        Dict with 'ndcg_at_k', 'dcg', 'idcg', 'quality', 'relevances',
        'ideal_relevances' and 'k'; on failure the score fields are 0.0,
        'quality' is 'Error' and an 'error' message is included.
    """

    def _failed(reason):
        # Zero-score payload used by every failure path
        return {
            'ndcg_at_k': 0.0,
            'dcg': 0.0,
            'idcg': 0.0,
            'quality': 'Error',
            'error': reason,
            'k': k
        }

    def _discounted_sum(gains):
        # Accumulate gain / log2(rank + 1) with ranks starting at 1
        total = 0.0
        for position, gain in enumerate(gains, start=1):
            total += gain / np.log2(position + 1)
        return total

    hits = multiquery_retrieve_v2(question, k_per_query=3, n_related=3, max_total=k)
    if not hits:
        return _failed('No docs retrieved')

    passages = [
        hit.get('text', hit.get('page_content', hit.get('content', '')))
        for hit in hits[:k]
    ]

    try:
        query_vec = embedder.embed_query(question)
        passage_vecs = np.array(embedder.embed_documents(passages), dtype=np.float32)
        # Similarity scores double as graded relevance gains
        relevances = cosine_similarity([query_vec], passage_vecs)[0]
    except Exception as e:
        return _failed(f'Embedding error: {str(e)}')

    ideal_relevances = np.sort(relevances)[::-1]  # best possible ordering
    dcg = _discounted_sum(relevances)
    idcg = _discounted_sum(ideal_relevances)
    ndcg = dcg / idcg if idcg > 0 else 0.0

    # Map the score onto a coarse label; first cutoff exceeded wins
    quality = "Poor"
    for cutoff, label in ((0.9, "Excellent"), (0.7, "Good"), (0.5, "Fair")):
        if ndcg > cutoff:
            quality = label
            break

    return {
        'ndcg_at_k': float(ndcg),
        'dcg': float(dcg),
        'idcg': float(idcg),
        'quality': quality,
        'relevances': [float(r) for r in relevances],
        'ideal_relevances': [float(r) for r in ideal_relevances],
        'k': k
    }

print("METRIC 4 (NDCG@K - MultiQuery) loaded")


METRIC 4 (NDCG@K - MultiQuery) loaded


In [36]:
# TEST: NDCG with Multi-Query Retriever
# Prints the NDCG summary and per-rank gains for one question.
test_q = "How is hepatic fibrosis assessed according to AASLD?"
n_mq = metric_4_ndcg_multiquery(test_q, k=5)

print("\n" + "="*70)
print("NDCG@5 (MULTI-QUERY)")
print("="*70)

if 'error' not in n_mq:
    print("\n".join([
        f"NDCG@5:                 {n_mq['ndcg_at_k']:.4f}",
        f"Quality:                {n_mq['quality']}",
        f"DCG:                    {n_mq['dcg']:.4f}",
        f"IDCG (Ideal):           {n_mq['idcg']:.4f}",
    ]))

    print("\n└─ Relevance scores (current ranking):")
    for rank, rel in enumerate(n_mq['relevances'], 1):
        print(f"   Rank {rank}: {rel:.4f}")

    # Same cutoffs the metric uses for its 'quality' label
    print(
        "\nStatus:                 EXCELLENT ranking quality" if n_mq['ndcg_at_k'] > 0.9
        else "\nStatus:                 GOOD ranking quality" if n_mq['ndcg_at_k'] > 0.7
        else "\nStatus:                 FAIR ranking quality" if n_mq['ndcg_at_k'] > 0.5
        else "\nStatus:                 POOR ranking quality"
    )
else:
    print(f"Error: {n_mq['error']}")

print("="*70)


[DEBUG] Generated 3 related questions:
  - What is the purpose of the Ishak score in liver fibrosis assessment?
  - How does the METAVIR scoring system differ from the Ishak score?
  - What is the role of transient elastography in liver fibrosis staging?

NDCG@5 (MULTI-QUERY)
NDCG@5:                 1.0000
Quality:                Excellent
DCG:                    1.9772
IDCG (Ideal):           1.9772

└─ Relevance scores (current ranking):
   Rank 1: 0.7162
   Rank 2: 0.7128
   Rank 3: 0.7097
   Rank 4: 0.6041
   Rank 5: 0.5073

Status:                 EXCELLENT ranking quality


In [37]:
# ======================================
# METRIC 4: NDCG@K - RERANKER
# ======================================
def metric_4_ndcg_reranker(question: str, k=5):
    """
    NDCG@K for the re-ranker pipeline.

    Takes the sources returned by generate_answer_with_rerank, uses the
    cosine similarity between each top-k source and the question as its gain,
    and compares the DCG of the returned order with the DCG of the same gains
    sorted in descending order.

    Returns:
        Dict with 'ndcg_at_k', 'dcg', 'idcg', 'quality', 'relevances',
        'ideal_relevances' and 'k'; failures return zeros, quality 'Error'
        and an 'error' message.
    """

    def _error_result(message):
        return {
            'ndcg_at_k': 0.0,
            'dcg': 0.0,
            'idcg': 0.0,
            'quality': 'Error',
            'error': message,
            'k': k
        }

    def _dcg(scores):
        # Sequential sum of score / log2(rank + 1), rank counted from 1
        return sum(
            (score / np.log2(rank + 1) for rank, score in enumerate(scores, start=1)),
            0.0,
        )

    # Sources come back already ordered by the re-ranker
    pipeline_out = generate_answer_with_rerank(
        question, k_per_query=3, n_related=3, max_retrieved=10, rerank_top_k=k
    )
    ranked_sources = pipeline_out.get("sources", [])
    if not ranked_sources:
        return _error_result('No docs retrieved')

    # Prefer the chunk text; fall back to heading, then page_content
    source_texts = []
    for source in ranked_sources[:k]:
        source_texts.append(
            source.get('text', source.get('heading', source.get('page_content', '')))
        )

    try:
        question_vec = embedder.embed_query(question)
        source_vecs = np.array(embedder.embed_documents(source_texts), dtype=np.float32)
        relevances = cosine_similarity([question_vec], source_vecs)[0]
    except Exception as e:
        return _error_result(f'Embedding error: {str(e)}')

    dcg = _dcg(relevances)
    ideal_relevances = np.sort(relevances)[::-1]  # gains in the best order
    idcg = _dcg(ideal_relevances)

    if idcg > 0:
        ndcg = dcg / idcg
    else:
        ndcg = 0.0

    # Coarse quality label, checked from the strictest cutoff down
    if ndcg > 0.9:
        quality = "Excellent"
    elif ndcg > 0.7:
        quality = "Good"
    else:
        quality = "Fair" if ndcg > 0.5 else "Poor"

    return {
        'ndcg_at_k': float(ndcg),
        'dcg': float(dcg),
        'idcg': float(idcg),
        'quality': quality,
        'relevances': [float(r) for r in relevances],
        'ideal_relevances': [float(r) for r in ideal_relevances],
        'k': k
    }

print("METRIC 4 (NDCG@K - ReRanker) loaded")


METRIC 4 (NDCG@K - ReRanker) loaded


In [38]:
# TEST: NDCG with Re-Ranker
# Prints the re-ranker NDCG summary, then compares it with `n_mq`, which must
# already exist from the multi-query NDCG test cell.
test_q = "How is hepatic fibrosis assessed according to AASLD?"
n_rr = metric_4_ndcg_reranker(test_q, k=5)

print("\n" + "="*70)
print("NDCG@5 (RE-RANKER)")
print("="*70)

if 'error' not in n_rr:
    print("\n".join([
        f"NDCG@5:                 {n_rr['ndcg_at_k']:.4f}",
        f"Quality:                {n_rr['quality']}",
        f"DCG:                    {n_rr['dcg']:.4f}",
        f"IDCG (Ideal):           {n_rr['idcg']:.4f}",
    ]))

    print("\n└─ Relevance scores (current ranking):")
    for rank, rel in enumerate(n_rr['relevances'], 1):
        print(f"   Rank {rank}: {rel:.4f}")

    print(
        "\nStatus:                 EXCELLENT ranking quality" if n_rr['ndcg_at_k'] > 0.9
        else "\nStatus:                 GOOD ranking quality" if n_rr['ndcg_at_k'] > 0.7
        else "\nStatus:                 FAIR ranking quality" if n_rr['ndcg_at_k'] > 0.5
        else "\nStatus:                 POOR ranking quality"
    )
else:
    print(f"Error: {n_rr['error']}")

print("\n" + "-"*70)
print("COMPARISON: MultiQuery vs ReRanker (NDCG@5)")
print("-"*70)
print(f"MultiQuery NDCG@5:      {n_mq['ndcg_at_k']:.4f} ({n_mq['quality']})")
print(f"ReRanker NDCG@5:        {n_rr['ndcg_at_k']:.4f} ({n_rr['quality']})")

# Relative change is only defined when the multi-query baseline is positive
if not (n_mq['ndcg_at_k'] > 0):
    print("Improvement:            ReRanker outperformed (MultiQuery was 0)")
else:
    improvement_pct = ((n_rr['ndcg_at_k'] - n_mq['ndcg_at_k']) / n_mq['ndcg_at_k']) * 100
    print(f"Improvement:            {improvement_pct:.1f}%")

print("="*70)



[STEP 1] Multi-Query Retrieval...
[DEBUG] Generated 3 related questions:
  - How are liver biopsy results scored according to AASLD?
  - What are the different types of liver biopsy scoring systems?
  - What is the role of transient elastography in assessing liver fibrosis?
  Retrieved 10 candidates in 1.867s

[STEP 2] Re-Ranking with Cross-Encoder...
   Re-ranked 10 docs → top 5
     1. Supplemental Digital Content... (score: 5.610)
     2. INTRODUCTION... (score: 4.743)
     3. Conclusions:... (score: 3.750)
     4. SUPPORTING INFORMATION... (score: -0.823)
     5. Conclusions:... (score: -3.906)
  Re-ranking completed in 0.016s

[STEP 3] Generating Answer...
  Answer generated in 14.790s

NDCG@5 (RE-RANKER)
NDCG@5:                 0.9998
Quality:                Excellent
DCG:                    2.0251
IDCG (Ideal):           2.0255

└─ Relevance scores (current ranking):
   Rank 1: 0.7162
   Rank 2: 0.7097
   Rank 3: 0.7128
   Rank 4: 0.6210
   Rank 5: 0.6133

Status:              

In [39]:
 !pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4df4c60d148a9be7a12813aafeed7edb3fb57ec9a67c854b757bcb0bfc7a704a
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [40]:
# ======================================
# METRIC 5: ROUGE - MULTIQUERY (OPTIMIZED)
# ======================================
def metric_5_rouge_multiquery(generated_answer: str, sources: list, question: str = ""):
    """
    ROUGE overlap between a generated answer and its retrieved context.

    The reference text is the first 300 characters of each of the first two
    sources that have text, joined by a space; when neither has text the
    question is used as the reference instead.

    Args:
        generated_answer: The answer text (already generated)
        sources: Retrieved source documents (dicts with 'text' or 'page_content')
        question: Original question (fallback for reference)

    Returns:
        Dict with 'rouge1', 'rouge2', 'rougeL' F-measures, their mean under
        'avg', and 'answer_length'. Failure paths return zeros plus 'error'.
    """

    def _zeros(reason, length):
        # All-zero scores with an explanation of why nothing was computed
        return {
            'rouge1': 0.0,
            'rouge2': 0.0,
            'rougeL': 0.0,
            'avg': 0.0,
            'error': reason,
            'answer_length': length
        }

    if not generated_answer:
        return _zeros('Missing answer', 0)

    answer_length = len(generated_answer)

    # Reference = leading 300 chars of each of the top-2 chunks that have text
    bodies = (doc.get('text', doc.get('page_content', '')) for doc in sources[:2])
    snippets = [body[:300] for body in bodies if body]
    reference_answer = ' '.join(snippets) if snippets else question

    if not reference_answer:
        return _zeros('No reference text', answer_length)

    try:
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference_answer, generated_answer)

        r1 = scores['rouge1'].fmeasure
        r2 = scores['rouge2'].fmeasure
        rl = scores['rougeL'].fmeasure

        return {
            'rouge1': float(r1),
            'rouge2': float(r2),
            'rougeL': float(rl),
            'avg': float((r1 + r2 + rl) / 3),
            'answer_length': answer_length
        }

    except ImportError:
        return _zeros('rouge_score not installed', answer_length)
    except Exception as e:
        return _zeros(f'ROUGE error: {str(e)}', answer_length)

print("METRIC 5 (ROUGE - MultiQuery) loaded - OPTIMIZED")


METRIC 5 (ROUGE - MultiQuery) loaded - OPTIMIZED


In [41]:
# TEST: ROUGE with Multi-Query Retriever
# Generates one answer, scores it against its own top-2 source chunks, and
# prints the scores, the reference excerpts and the answer.
test_q = "How is hepatic fibrosis assessed?"

print("\n" + "="*100)
print("ROUGE (MULTI-QUERY)")
print("="*100)
print(f"Question: {test_q}\n")

# Single generation call; its answer and sources feed the metric
result_mq = generate_answer_multiquery_v2(test_q, k_per_query=3, n_related=3, max_total=5)
answer_mq = result_mq.get("answer", "")
sources_mq = result_mq.get("sources", [])

ro_mq = metric_5_rouge_multiquery(answer_mq, sources_mq, test_q)

print("\n".join([
    f"ROUGE-1 (unigram):      {ro_mq.get('rouge1', 0.0):.4f}",
    f"ROUGE-2 (bigram):       {ro_mq.get('rouge2', 0.0):.4f}",
    f"ROUGE-L (sequence):     {ro_mq.get('rougeL', 0.0):.4f}",
    f"Average ROUGE:          {ro_mq.get('avg', 0.0):.4f}",
    f"Answer length:          {ro_mq.get('answer_length', 0)} chars",
]))

if ro_mq.get('error'):
    print(f"\nNote: {ro_mq['error']}")

# Qualitative verdict from the average score
avg_score = ro_mq.get('avg', 0.0)
print(
    "\nStatus:                 GOOD (High overlap with reference)" if avg_score > 0.5
    else "\nStatus:                 FAIR (Some overlap)" if avg_score > 0.2
    else "\nStatus:                 LOW (Minimal overlap - likely hallucination)"
)

print("\n" + "-"*100)
print("REFERENCE (from top-2 chunks)")
print("-"*100)
# Show a 200-char excerpt of each chunk the reference is drawn from
ref_texts = []
for i, doc in enumerate(sources_mq[:2], 1):
    text = doc.get('text', doc.get('page_content', ''))[:200]
    print(f"\n[Chunk {i}]:\n{text}...")
    ref_texts.append(text)

print("\n" + "-"*100)
print("GENERATED ANSWER")
print("-"*100)
print(f"\n{answer_mq}")

print("\n" + "="*100)



ROUGE (MULTI-QUERY)
Question: How is hepatic fibrosis assessed?

[DEBUG] Generated 3 related questions:
  - How does the Child-Turcotte-Pugh score assess liver function?
  - What are the key differences between liver biopsy and non-invasive assessment of liver fibrosis?
  - How is liver fibrosis staged in patients with cirrhosis?
ROUGE-1 (unigram):      0.5139
ROUGE-2 (bigram):       0.3662
ROUGE-L (sequence):     0.3889
Average ROUGE:          0.4230
Answer length:          507 chars

Status:                 FAIR (Some overlap)

----------------------------------------------------------------------------------------------------
REFERENCE (from top-2 chunks)
----------------------------------------------------------------------------------------------------

[Chunk 1]:
Histologically, the degree of fibrosis in chronic liver disease can be evaluated semiquantitatively in liver biopsy, with stages 0–2 defining early fibrosis stages, F3 bridging (advanced) fibrosis, an...

[Chunk 2]:
. S

In [42]:
# ======================================
# METRIC 5: ROUGE - RERANKER (OPTIMIZED)
# ======================================
def _zero_rouge_result(error: str, answer_length: int) -> dict:
    """Build the all-zero ROUGE result returned on every failure path."""
    return {
        'rouge1': 0.0,
        'rouge2': 0.0,
        'rougeL': 0.0,
        'avg': 0.0,
        'error': error,
        'answer_length': answer_length
    }


def metric_5_rouge_reranker(generated_answer: str, sources: list, question: str = "",
                            top_n: int = 2, max_ref_chars: int = 300):
    """
    ROUGE: Answer quality assessment using re-ranker

    The generated answer is scored against a pseudo-reference made by joining
    the first ``max_ref_chars`` characters of each of the ``top_n`` leading
    source chunks. This measures lexical overlap with the retrieved context,
    not agreement with a human-written gold answer.

    Args:
        generated_answer: The answer text (already generated)
        sources: Re-ranked source documents (dicts carrying 'text' or
            'page_content'); ``None`` is treated as an empty list
        question: Original question (fallback for reference)
        top_n: Number of leading chunks used to build the reference
        max_ref_chars: Number of characters kept from each chunk

    Returns:
        dict with float F-measures under 'rouge1', 'rouge2', 'rougeL', their
        mean under 'avg', and 'answer_length' (characters). On any failure the
        scores are 0.0 and an 'error' key explains why.
    """

    if not generated_answer:
        return _zero_rouge_result('Missing answer', 0)

    # Build reference from the top chunks. `or` chaining (instead of
    # dict.get defaults) also falls through when 'text' is present but
    # None/empty, so 'page_content' is still picked up in that case.
    ref_texts = []
    for doc in (sources or [])[:top_n]:
        text = doc.get('text') or doc.get('page_content') or ''
        if text:
            ref_texts.append(str(text)[:max_ref_chars])
    reference_answer = ' '.join(ref_texts) if ref_texts else question

    if not reference_answer:
        return _zero_rouge_result('No reference text', len(generated_answer))

    try:
        # Imported lazily so the notebook still runs (with zero scores) when
        # the optional rouge_score package is not installed.
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference_answer, generated_answer)

        f_measures = {
            name: float(scores[name].fmeasure)
            for name in ('rouge1', 'rouge2', 'rougeL')
        }
        return {
            **f_measures,
            'avg': float(sum(f_measures.values()) / 3),
            'answer_length': len(generated_answer)
        }

    except ImportError:
        return _zero_rouge_result('rouge_score not installed', len(generated_answer))
    except Exception as e:
        # Best-effort metric: report the failure in the result instead of
        # aborting the surrounding evaluation run.
        return _zero_rouge_result(f'ROUGE error: {str(e)}', len(generated_answer))

# Confirmation line so the cell output shows that the metric function was (re)defined.
print("METRIC 5 (ROUGE - ReRanker) loaded - OPTIMIZED")


METRIC 5 (ROUGE - ReRanker) loaded - OPTIMIZED


In [43]:
# Single-question check: ROUGE of the re-ranker pipeline, compared with the
# MultiQuery result (`ro_mq`) produced by the previous test cell.
test_q = "How is hepatic fibrosis assessed?"
heavy_rule = "=" * 100
light_rule = "-" * 100

print("\n" + heavy_rule)
print("ROUGE (RE-RANKER)")
print(heavy_rule)
print(f"Question: {test_q}\n")

# One generation pass; the answer and its sources are reused for everything below
result_rr = generate_answer_with_rerank(
    test_q,
    k_per_query=3,
    n_related=3,
    max_retrieved=10,
    rerank_top_k=5,
)
answer_rr = result_rr.get("answer", "")
sources_rr = result_rr.get("sources", [])

# Score the answer against its own top-ranked chunks
ro_rr = metric_5_rouge_reranker(answer_rr, sources_rr, test_q)

for score_label, score_key in (
    ("ROUGE-1 (unigram):      ", 'rouge1'),
    ("ROUGE-2 (bigram):       ", 'rouge2'),
    ("ROUGE-L (sequence):     ", 'rougeL'),
    ("Average ROUGE:          ", 'avg'),
):
    print(f"{score_label}{ro_rr.get(score_key, 0.0):.4f}")
print(f"Answer length:          {ro_rr.get('answer_length', 0)} chars")

error_note = ro_rr.get('error')
if error_note:
    print(f"\nNote: {error_note}")

# Quality band: first threshold the average exceeds wins, otherwise LOW
avg_score = ro_rr.get('avg', 0.0)
status_bands = (
    (0.5, "\nStatus:                 GOOD (High overlap with reference)"),
    (0.2, "\nStatus:                 FAIR (Some overlap)"),
)
print(next(
    (message for floor, message in status_bands if avg_score > floor),
    "\nStatus:                 LOW (Minimal overlap - likely hallucination)",
))

print("\n" + light_rule)
print("REFERENCE (from top-2 chunks)")
print(light_rule)
ref_texts = []
for rank, src in enumerate(sources_rr[:2], start=1):
    snippet = src.get('text', src.get('page_content', ''))
    snippet = snippet[:200]  # preview only
    print(f"\n[Chunk {rank}]:\n{snippet}...")
    ref_texts.append(snippet)

print("\n" + light_rule)
print("GENERATED ANSWER")
print(light_rule)
print(f"\n{answer_rr}")

print("\n" + light_rule)
print("COMPARISON: MultiQuery vs ReRanker (ROUGE)")
print(light_rule)
baseline_avg = ro_mq.get('avg', 0.0)
rerank_avg = ro_rr.get('avg', 0.0)
print(f"MultiQuery ROUGE Avg:   {baseline_avg:.4f}")
print(f"ReRanker ROUGE Avg:     {rerank_avg:.4f}")

# Relative change is only defined for a positive baseline
if baseline_avg > 0:
    improvement = ((rerank_avg - baseline_avg) / baseline_avg) * 100
    print(f"Improvement:            {improvement:+.1f}%")
else:
    print("Improvement:            Cannot compute (MultiQuery is 0)")

print("\n" + heavy_rule)



ROUGE (RE-RANKER)
Question: How is hepatic fibrosis assessed?


[STEP 1] Multi-Query Retrieval...
[DEBUG] Generated 3 related questions:
  - What are the benefits of using a transient elastography device?
  - How is the role of alpha-1 antitrypsin deficiency in liver disease progression assessed?
  - What are the diagnostic criteria for cirrhosis based on histopathology?
  Retrieved 10 candidates in 3.467s

[STEP 2] Re-Ranking with Cross-Encoder...
   Re-ranked 10 docs → top 5
     1. Prognosis.... (score: 2.496)
     2. Conclusions:... (score: 2.294)
     3. US-based elastography... (score: 2.175)
     4. Conclusions:... (score: -3.403)
     5. Imaging techniques... (score: -6.805)
  Re-ranking completed in 0.018s

[STEP 3] Generating Answer...
  Answer generated in 12.381s
ROUGE-1 (unigram):      0.4675
ROUGE-2 (bigram):       0.2183
ROUGE-L (sequence):     0.3117
Average ROUGE:          0.3325
Answer length:          1075 chars

Status:                 FAIR (Some overlap)

--------

In [44]:
# ======================================
# FINAL BATCH EVALUATION REPORT
# 5 BALANCED QUESTIONS × 7 COLUMNS
# ======================================
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

TAU = 0.65  # a chunk counts as relevant when its cosine similarity to the query exceeds this
K = 5       # rank cutoff for Precision@K / NDCG@K and number of chunks requested per pipeline

test_questions = [
    {'num': 1, 'question': "How is Wilson disease diagnosed and what are the diagnostic tests?"},
    {'num': 2, 'question': "What are the stages of liver fibrosis and their clinical significance?"},
    {'num': 3, 'question': "What are the diagnostic features and complications of cirrhosis?"},
    {'num': 4, 'question': "What factors determine eligibility for liver transplantation?"},
    {'num': 5, 'question': "What are the recommended screening methods for hepatocellular carcinoma?"}
]

METRIC_COLUMNS = [
    'Precision_MQ', 'Precision_RR',
    'NDCG_MQ', 'NDCG_RR',
    'ROUGE_MQ', 'ROUGE_RR',
    'ROUGE_Improvement_%',
]


def _source_text(doc):
    """Return a source chunk's text, trying 'text', then 'page_content', then 'heading'."""
    return doc.get('text') or doc.get('page_content') or doc.get('heading') or ''


def _similarities_to_query(query_vec, docs, k):
    """Cosine similarity between the query embedding and each of the first k source texts."""
    texts = [_source_text(doc) for doc in docs[:k]]
    if not texts:
        return np.empty(0, dtype=np.float32)
    doc_embs = np.array(embedder.embed_documents(texts), dtype=np.float32)
    return cosine_similarity([query_vec], doc_embs)[0]


def _precision_at_k(sims, k, tau):
    """Fraction of the k slots whose similarity exceeds tau (missing slots count as misses)."""
    return sum(1 for sim in sims if sim > tau) / k


def _ndcg_from_gains(gains):
    """NDCG of the given ranking, using the similarities themselves as graded gains."""
    if len(gains) == 0:
        return 0.0
    dcg = sum(g / np.log2(i + 2) for i, g in enumerate(gains))
    ideal = np.sort(gains)[::-1]
    idcg = sum(g / np.log2(i + 2) for i, g in enumerate(ideal))
    return float(dcg / idcg) if idcg > 0 else 0.0


def _pct_change(new, base):
    """Relative change in percent; NaN when the baseline is not positive (undefined)."""
    return (new - base) / base * 100 if base > 0 else float('nan')


results_data = []
failed_questions = []

print("\n" + "="*130)
print(f"FINAL BATCH EVALUATION: {len(test_questions)} BALANCED QUESTIONS × 7 METRICS")
print("="*130)
print("Columns: Precision (MQ/RR) | NDCG (MQ/RR) | ROUGE (MQ/RR) | ROUGE Improvement %")
print("="*130)

# Run evaluation
for q_data in test_questions:
    question = q_data['question']
    q_num = q_data['num']

    print(f"\n{'─'*130}")
    print(f"Q{q_num}: {question}")
    print(f"{'─'*130}")

    # Metrics start as NaN: if this question fails it is excluded from the
    # averages below instead of being counted as a row of zeros.
    result_row = {'Q': q_num, **dict.fromkeys(METRIC_COLUMNS, np.nan)}

    try:
        # Generate answer ONCE for MultiQuery
        result_mq = generate_answer_multiquery_v2(question, k_per_query=3, n_related=3, max_total=K)
        answer_mq = result_mq.get("answer", "")
        sources_mq = result_mq.get("sources", []) or []

        # Generate answer ONCE for ReRanker
        result_rr = generate_answer_with_rerank(question, k_per_query=3, n_related=3, max_retrieved=10, rerank_top_k=K)
        answer_rr = result_rr.get("answer", "")
        sources_rr = result_rr.get("sources", []) or []

        # Query/chunk similarities feed both Precision and NDCG
        qv = embedder.embed_query(question)
        sims_mq = _similarities_to_query(qv, sources_mq, K)
        sims_rr = _similarities_to_query(qv, sources_rr, K)

        # ===== PRECISION =====
        p_mq_score = _precision_at_k(sims_mq, K, TAU)
        p_rr_score = _precision_at_k(sims_rr, K, TAU)

        # ===== NDCG =====
        ndcg_mq_score = _ndcg_from_gains(sims_mq)
        ndcg_rr_score = _ndcg_from_gains(sims_rr)

        # ===== ROUGE =====
        rouge_mq = metric_5_rouge_multiquery(answer_mq, sources_mq, question)
        r_mq_score = rouge_mq.get('avg', 0.0)
        rouge_rr = metric_5_rouge_reranker(answer_rr, sources_rr, question)
        r_rr_score = rouge_rr.get('avg', 0.0)

        # ROUGE Improvement % (NaN when the MultiQuery baseline is 0, so one
        # undefined ratio cannot dominate the mean)
        rouge_improvement = _pct_change(r_rr_score, r_mq_score)

        # Commit the row only once every metric has been computed
        result_row.update({
            'Precision_MQ': p_mq_score,
            'Precision_RR': p_rr_score,
            'NDCG_MQ': ndcg_mq_score,
            'NDCG_RR': ndcg_rr_score,
            'ROUGE_MQ': r_mq_score,
            'ROUGE_RR': r_rr_score,
            'ROUGE_Improvement_%': rouge_improvement,
        })

        if np.isnan(rouge_improvement):
            improvement_txt = "n/a (MultiQuery is 0)"
        else:
            arrow = "↑" if rouge_improvement >= 0 else "↓"
            improvement_txt = f"{rouge_improvement:+.1f}% {arrow}"

        print(f"PRECISION:  {p_mq_score:.4f} (MQ) | {p_rr_score:.4f} (RR)")
        print(f"NDCG:       {ndcg_mq_score:.4f} (MQ) | {ndcg_rr_score:.4f} (RR)")
        print(f"ROUGE:      {r_mq_score:.4f} (MQ) | {r_rr_score:.4f} (RR) | {improvement_txt}")

    except Exception as e:
        # Keep the batch going, but make the failure visible and leave the row as NaN
        print(f"Error: {type(e).__name__}: {str(e)[:60]}")
        failed_questions.append(q_num)

    results_data.append(result_row)

# Create DataFrame
df_results = pd.DataFrame(results_data)

# Print final results table
print("\n\n" + "="*130)
print("FINAL RESULTS TABLE (7 Columns)")
print("="*130)

final_table = df_results[['Q'] + METRIC_COLUMNS].copy()

print(final_table.to_string(index=False))

# Overall summary (pandas .mean() skips NaN, i.e. failed or undefined entries)
print("\n\n" + "="*130)
print(f"OVERALL SUMMARY ({len(test_questions)} Questions Average)")
print("="*130)

print(f"\n{'Metric':<25} {'MultiQuery':<20} {'ReRanker':<20}")
print("─" * 65)
print(f"{'Precision@5':<25} {df_results['Precision_MQ'].mean():<20.4f} {df_results['Precision_RR'].mean():<20.4f}")
print(f"{'NDCG@5':<25} {df_results['NDCG_MQ'].mean():<20.4f} {df_results['NDCG_RR'].mean():<20.4f}")
print(f"{'ROUGE (avg)':<25} {df_results['ROUGE_MQ'].mean():<20.4f} {df_results['ROUGE_RR'].mean():<20.4f}")
print(f"{'ROUGE Improvement':<25} {'':<20} {df_results['ROUGE_Improvement_%'].mean():>19.1f}%")

if failed_questions:
    print(f"\nNote: {len(failed_questions)} question(s) failed and are excluded from the averages: {failed_questions}")

# Save to CSV
csv_filename = "final_evaluation_Llama.csv"
df_results.to_csv(csv_filename, index=False)
print(f"\n\nResults exported to: {csv_filename}")

print("\n" + "="*130)
print("FINAL EVALUATION COMPLETE")
print("="*130)



FINAL BATCH EVALUATION: 5 BALANCED QUESTIONS × 7 METRICS
Columns: Precision (MQ/RR) | NDCG (MQ/RR) | ROUGE (MQ/RR) | ROUGE Improvement %

──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Q1: How is Wilson disease diagnosed and what are the diagnostic tests?
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
[DEBUG] Generated 3 related questions:
  - What are the genetic mutations associated with Wilson disease?
  - How does the diagnosis of Wilson disease differ in patients with hereditary hemochromatosis?
  - What is the role of 24-hour urinary copper excretion in the diagnosis of Wilson disease?

[STEP 1] Multi-Query Retrieval...
[DEBUG] Generated 3 related questions:
  - How is Wilson disease confirmed after initial diagnosis?
  - What are the differences between genetic testing and liver function tests in diagnosing Wilso