# Agentic RAG for Deep Document Analysis

## 1. Setting up the RAG environment

In [None]:
# !pip install openai pypdf nltk tqdm requests pandas transformers huggingface_hub



In [None]:
import os
import json
import re
import time
from io import BytesIO # to handle in-memory binary streams
from transformers import AutoTokenizer
from openai import OpenAI
from pypdf import PdfReader
from typing import List, Dict, Any
import nltk
import nltk
nltk.download('punkt_tab')
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm
import requests

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# LLM Configuration

# SECURITY: the API key must never be hardcoded in the notebook. It is read from
# the NEBIUS_API_KEY environment variable instead. Any key that was previously
# committed in this cell should be treated as leaked and revoked.
API_KEY = os.environ.get("NEBIUS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "NEBIUS_API_KEY is not set. Export it (or set os.environ['NEBIUS_API_KEY']) "
        "before running this notebook."
    )
BASE_URL = "https://api.studio.nebius.com/v1/"

# One model per pipeline stage: routing, answer synthesis, LLM-as-judge evaluation.
ROUTER_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
SYNTHESIS_MODEL = "meta-llama/Llama-3.3-70B-Instruct"
EVALUATION_MODEL = "deepseek-ai/DeepSeek-V3"

# OpenAI-compatible client pointed at the configured base URL.
client = OpenAI(
    api_key=API_KEY,
    base_url=BASE_URL
)

metrics_log = [] # to store performance metrics

In [None]:
# Load the reference tokenizer used for all token estimates in this notebook.
print(f"Initializing tokenizer for '{ROUTER_MODEL}'...")
# NOTE(review): this import lives mid-notebook rather than in the imports cell at the top.
from huggingface_hub import login
# Interactive Hugging Face authentication; runs before the tokenizer download below.
# NOTE(review): presumably required because the model repo is gated - confirm.
login()  # This will prompt you for your Hugging Face token

# Use the tokenizer of the designated router model for consistent token counting.
tokenizer = AutoTokenizer.from_pretrained(ROUTER_MODEL)

def count_tokens(text: str) -> int:
  """Return how many token ids the reference tokenizer produces for ``text``.

  Any value that is not a ``str`` counts as zero tokens.
  """
  if isinstance(text, str):
    token_ids = tokenizer.encode(text)
    return len(token_ids)
  return 0

Initializing tokenizer for 'meta-llama/Meta-Llama-3.1-8B-Instruct'...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

## 2. Document Loading

In [None]:
def load_pdf_from_url(url: str, max_pages: int=920, timeout: float=60.0) -> str:
  """Download a PDF and return the extracted text of its first pages.

  Args:
    url: Location of the PDF to download.
    max_pages: Upper bound on the number of pages to extract.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` can block indefinitely on an unresponsive host.

  Returns:
    The text of the processed pages, each followed by a newline, or an empty
    string if the download fails or the payload cannot be parsed as a PDF.
  """
  # Local import keeps this cell self-contained; pypdf is already a notebook dependency.
  from pypdf.errors import PdfReadError

  print(f"Downloading a document from {url}...")
  try:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
  except requests.exceptions.RequestException as e:
    print(f"Error downloading the document: {e}")
    return ""

  # A successful HTTP response is not necessarily a valid PDF (e.g. an HTML error page).
  try:
    pdf_reader = PdfReader(BytesIO(response.content))
    total_pages = len(pdf_reader.pages)
  except PdfReadError as e:
    print(f"Error reading the PDF: {e}")
    return ""

  # Clamp at zero so a negative max_pages cannot turn into a "drop last N" slice.
  num_pages_to_process = max(0, min(max_pages, total_pages))
  print(f"Extracting text from {num_pages_to_process} pages...")

  # Collect page texts and join once instead of repeated string concatenation.
  page_texts = []
  # tqdm for progress bar
  for page in tqdm(pdf_reader.pages[:num_pages_to_process], desc="Extracting pages"):
    page_text = page.extract_text()
    if page_text:
      page_texts.append(page_text + "\n")

  return "".join(page_texts)

In [None]:
# Alternative source document (EU AI Act); kept for experimentation, not loaded below.
tbmp_url_1 = "https://eur-lex.europa.eu/resource.html?uri=cellar:e0649735-a372-11eb-9585-01aa75ed71a1.0001.02/DOC_1&format=PDF"
tbmp_url = "https://www.uspto.gov/sites/default/files/documents/tbmp-Master-June2024.pdf"

document_text = load_pdf_from_url(tbmp_url)

# load_pdf_from_url returns "" on failure; stop here rather than report a
# "successful" load and run the rest of the pipeline on an empty document.
if not document_text:
  raise RuntimeError(f"No text could be loaded from {tbmp_url}")

char_count = len(document_text)
# Newline count measures lines of extracted text, not pages: each page contains
# many line breaks, so this figure must not be reported as a page estimate.
line_count = document_text.count("\n") + 1
token_count = count_tokens(document_text)


print(f"\nDocument loaded successfully.")
print(f"- Total Characters: {char_count:,}")
print(f"- Estimated Tokens: {token_count:,}")
print(f"- Total Lines: {line_count:,}")

print("\n--- Document Preview (first 500 characters) ---")
print(document_text[:500])
print("---------------------------------------------")

Downloading a document from https://www.uspto.gov/sites/default/files/documents/tbmp-Master-June2024.pdf...
Extracting text from 920 pages...


Extracting pages:   0%|          | 0/920 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (945084 > 131072). Running this sequence through the model will result in indexing errors



Document loaded successfully.
- Total Characters: 3,459,491
- Estimated Tokens: 945,084
- Estimated Pages: 39,022

--- Document Preview (first 500 characters) ---
TRADEMARK TRIAL AND
APPEAL BOARD MANUAL
OF PROCEDURE (TBMP)
 June 2024
June   2024
United States Patent and Trademark Office
PREFACE TO THE JUNE 2024 REVISION
The June 2024 revision of the Trademark Trial and Appeal Board Manual of Procedure is an update of the
June 2023 edition. This update is moderate in nature and incorporates relevant case law issued between March
3, 2023 and March 1, 2024.
The title of the manual is abbreviated as “TBMP.” A citation to a section of the manual may be written
---------------------------------------------


## 3. Hierarchical Chunking

In [None]:
def split_text_into_chunks(text: str, num_chunks: int=20) -> List[Dict[str, Any]]:
  """Split ``text`` into roughly equal chunks that respect sentence boundaries.

  Sentences are grouped into runs of ``ceil(n_sentences / num_chunks)``, so the
  result contains at most ``num_chunks`` chunks (fewer when the text is short).

  Args:
    text: The text to split.
    num_chunks: Target number of chunks; must be a positive integer.

  Returns:
    A list of ``{"id": int, "text": str}`` dicts with ids numbered from 0, or
    an empty list when no sentences are found.

  Raises:
    ValueError: If ``num_chunks`` is not positive (previously this surfaced as
      a ZeroDivisionError or an invalid range step deeper in the function).
  """
  if num_chunks <= 0:
    raise ValueError(f"num_chunks must be a positive integer, got {num_chunks}")

  sentences = sent_tokenize(text)
  if not sentences:
    return []

  sentences_per_chunk = (len(sentences) + num_chunks - 1) // num_chunks # ceil division
  chunks = []
  # Only show the progress bar for long inputs. Passing desc=None does not hide
  # the bar, so the `disable` flag is used instead.
  show_progress = len(sentences) > 500
  for i in tqdm(range(0, len(sentences), sentences_per_chunk), desc='Creating chunks', disable=not show_progress):
    chunk_text = " ".join(sentences[i:i+sentences_per_chunk])
    chunks.append({
        "id": len(chunks),
        "text": chunk_text
    })

  print(f"Document split into {len(chunks)} chunks.")
  return chunks

In [None]:
# Top-level split of the whole document into 20 chunks.
document_chunks = split_text_into_chunks(document_text, num_chunks=20)

# Sanity check: report the token size of the first three chunks.
for preview_chunk in document_chunks[:3]:
  n_tokens = count_tokens(preview_chunk["text"])
  print(f"Chunk {preview_chunk['id']}: {n_tokens} tokens")

Creating chunks:   0%|          | 0/20 [00:00<?, ?it/s]

Document split into 20 chunks.
Chunk 0: 42822 tokens
Chunk 1: 42367 tokens
Chunk 2: 42516 tokens


## 4. Agentic Navigation Workflow

In [None]:
# function for JSON parsing

def parse_json_from_response(text:str) -> Dict[str, Any]:
  """Extract a JSON object from an LLM response.

  Candidates are tried in order until one decodes to a JSON object:
    1. the contents of a fenced code block (with or without a ``json`` tag),
    2. the span from the first ``{`` to the last ``}``,
    3. the raw text.

  Args:
    text: Raw model output. Non-string values (e.g. ``None`` when the model
      returned no content) are treated as unparsable.

  Returns:
    The decoded dict, or ``{}`` (after printing a warning) when nothing
    decodes to a JSON object. A dict is always returned so callers can use
    ``.get`` safely.
  """
  if not isinstance(text, str):
    print(f"Warning: Failed to parse JSON from response. Raw text: '{text}'")
    return {}

  candidates = []
  match = re.search(r'```(?:json)?\s*({.*?})\s*```', text, re.S)
  if match:
    candidates.append(match.group(1))

  start = text.find('{')
  end = text.rfind('}')
  # Require the closing brace to come after the opening one.
  if start != -1 and end > start:
    candidates.append(text[start:end+1])

  candidates.append(text)

  for json_str in candidates:
    try:
      parsed = json.loads(json_str)
    except json.JSONDecodeError:
      continue
    # Valid JSON that is not an object (list, number, ...) is of no use to callers.
    if isinstance(parsed, dict):
      return parsed

  print(f"Warning: Failed to parse JSON from response. Raw text: '{text}'")
  return {}

In [None]:
# Router Agent

def route_to_chunks(question: str, chunks: List[Dict[str, Any]], scratchpad: str, depth: int) -> Dict[str, Any]:
    """Ask the router model which of ``chunks`` are relevant to ``question``.

    Two LLM calls are made: the first produces free-text reasoning over a
    1000-character preview of every chunk, the second turns that reasoning
    into a JSON list of chunk ids. Latency and token usage of both calls are
    appended to the module-level ``metrics_log``.

    Args:
        question: The user question being answered.
        chunks: Candidate chunks, each a dict with ``id`` and ``text``.
        scratchpad: Reasoning accumulated at previous depths.
        depth: Current navigation depth (used for logging and metric names).

    Returns:
        ``{"selected_ids": [...], "scratchpad": str}``. ``selected_ids`` only
        contains ids that exist in ``chunks``, de-duplicated, in the order the
        model returned them.
    """

    print(f"\n--- Routing at Depth {depth}: Evaluating {len(chunks)} chunks ---")

    chunks_formatted = "\n\n".join([f"CHUNK {chunk['id']}:\n{chunk['text'][:1000]}..." for chunk in chunks])
    reasoning_prompt = f"""
    You are an expert document analyst. Your goal is to find information to answer the user's question:
    '{question}'

    Here is your reasoning so far:
    {scratchpad}

    Review the following new text chunks. Briefly explain which chunks seem relevant to the question and why. This is your internal monologue.

    TEXT CHUNKS:
    {chunks_formatted}

    Your Reasoning:
    """

    start_time = time.time()
    reasoning_response = client.chat.completions.create(model=ROUTER_MODEL, messages=[{"role":"user", "content": reasoning_prompt}], temperature=0.0)
    latency_1 = time.time() - start_time

    # The message content can be None; fall back to an empty string.
    new_reasoning = reasoning_response.choices[0].message.content or ""
    updated_scratchpad = scratchpad + f"\n[Depth {depth} Reasoning]: {new_reasoning}"
    print(f"LLM Reasoning: {new_reasoning}")

    p_tokens_1, c_tokens_1 = reasoning_response.usage.prompt_tokens, reasoning_response.usage.completion_tokens
    metrics_log.append({"step": f"route_depth_{depth}_reason", "model": ROUTER_MODEL, "latency_s": latency_1, "prompt_tokens": p_tokens_1, "completion_tokens": c_tokens_1, "total_tokens": p_tokens_1 + c_tokens_1})

    selection_prompt = f"""
    Based on your reasoning below, select the chunk IDs that are most likely to contain the answer to the question: '{question}'.

    Your Reasoning:
    {new_reasoning}

    TEXT CHUNKS:
    {chunks_formatted}

    Respond with ONLY a valid JSON object with a single key 'selected_chunk_ids', which is a list of integers. Example: {{"selected_chunk_ids": [1, 5, 8]}}
    """

    start_time = time.time()
    selection_response = client.chat.completions.create(model=ROUTER_MODEL, messages=[{"role": "user", "content": selection_prompt}], temperature=0.0)
    latency_2 = time.time() - start_time

    response_text = selection_response.choices[0].message.content
    parsed_output = parse_json_from_response(response_text)
    if not isinstance(parsed_output, dict):
        parsed_output = {}

    # Model output is untrusted: it may be a scalar, contain ids as strings or
    # floats, repeat ids, or name chunks that do not exist. Normalise it to a
    # list of real chunk ids so the caller can rely on it.
    raw_ids = parsed_output.get('selected_chunk_ids', [])
    if not isinstance(raw_ids, (list, tuple)):
        raw_ids = [raw_ids]
    ids_by_label = {str(chunk['id']): chunk['id'] for chunk in chunks}
    selected_ids = []
    for raw_id in raw_ids:
        if isinstance(raw_id, float) and raw_id.is_integer():
            raw_id = int(raw_id)
        chunk_id = ids_by_label.get(str(raw_id).strip())
        if chunk_id is not None and chunk_id not in selected_ids:
            selected_ids.append(chunk_id)
    print(f"Selected chunk IDs: {selected_ids}")

    p_tokens_2, c_tokens_2 = selection_response.usage.prompt_tokens, selection_response.usage.completion_tokens
    metrics_log.append({"step": f"route_depth_{depth}_select", "model": ROUTER_MODEL, "latency_s": latency_2, "prompt_tokens": p_tokens_2, "completion_tokens": c_tokens_2, "total_tokens": p_tokens_2 + c_tokens_2})

    return {"selected_ids": selected_ids, "scratchpad": updated_scratchpad}

In [None]:
# Recursive Navigator

def navigate_document(question: str, initial_chunks: List[Dict[str, Any]], max_depth: int=2) -> Dict[str, Any]:
  """Drill down through the document by repeatedly routing and re-splitting.

  At each depth the router picks chunks, every picked chunk is split into up
  to 10 sub-chunks, and those sub-chunks become the candidates for the next
  depth. Each returned paragraph gets a ``display_id`` describing its lineage
  (e.g. ``"5.3.7"`` = sub-chunk 7 of sub-chunk 3 of top-level chunk 5).

  Args:
    question: The user question guiding the navigation.
    initial_chunks: Top-level chunks, each a dict with ``id`` and ``text``.
    max_depth: Number of route-then-split rounds to perform.

  Returns:
    ``{"paragraphs": [...], "scratchpad": str}``.
  """
  scratchpad = ""
  current_chunks = initial_chunks
  final_paragraphs = []

  # Lineage path of every chunk at the CURRENT level only. Sub-chunk ids
  # restart at 0 on every level, so sharing one dict across levels would let
  # new ids overwrite parent entries that are still being read.
  chunk_paths = {chunk["id"]:str(chunk["id"]) for chunk in initial_chunks}

  for depth in tqdm(range(max_depth), desc="Navigating document"):
    result = route_to_chunks(question, current_chunks, scratchpad, depth)
    scratchpad = result["scratchpad"]
    selected_ids = result["selected_ids"]
    # Tolerate a scalar or missing selection from the router.
    if not isinstance(selected_ids, (list, tuple, set)):
      selected_ids = [selected_ids]

    selected_chunks = [c for c in current_chunks if c["id"] in selected_ids]

    # Covers both an empty selection and ids that match no current chunk;
    # the latter would otherwise lead to routing over an empty chunk list.
    if not selected_chunks:
      print("\nNavigation stopped: No relevant chunks selected.")
      final_paragraphs = current_chunks
      break

    next_level_chunks = []
    next_level_paths = {}
    for chunk in selected_chunks:
      parent_path = chunk_paths[chunk["id"]]
      sub_chunks = split_text_into_chunks(chunk['text'], num_chunks=10)

      for i, sub_chunk in enumerate(sub_chunks):
        new_id = len(next_level_chunks)
        sub_chunk["id"] = new_id
        next_level_paths[new_id] = f"{parent_path}.{i}"
        next_level_chunks.append(sub_chunk)

    # Nothing could be split further: keep the selected chunks as the result.
    if not next_level_chunks:
      final_paragraphs = selected_chunks
      break

    current_chunks = next_level_chunks
    chunk_paths = next_level_paths
    final_paragraphs = current_chunks

  print(f"\nNavigation finished. Returning {len(final_paragraphs)} retrieved paragraphs.")
  for chunk in final_paragraphs:
    if chunk['id'] in chunk_paths:
      chunk['display_id'] = chunk_paths[chunk['id']]

  return {"paragraphs": final_paragraphs, "scratchpad": scratchpad}

In [None]:
# Run full navigation process

# Two sample questions; only `sample_question` (TBMP) is used in this run.
sample_question_1 = "How does the EU AI Act address the situation where an AI system initially classified as “limited risk” evolves — through retraining or integration — into a “high-risk” system, and what specific obligations fall on the deployer versus the provider in this transition?"
sample_question = "What are the requirements for filing a motion to compel discovery, including formatting and signatures?"

# Start from an empty metrics log so this run's numbers are not mixed with earlier ones.
metrics_log = []

navigation_result = navigate_document(sample_question, document_chunks, max_depth=2)
retrieved_paragraphs = navigation_result['paragraphs']

print(f"\n--- Navigation Complete ---")
print(f"Retrieved {len(retrieved_paragraphs)} paragraphs for synthesis.")

# Preview the first retrieved paragraph, if any.
if retrieved_paragraphs:
    first_para = retrieved_paragraphs[0]
    preview_label = first_para.get('display_id', 'N/A')
    print(f"\n--- Preview of Retrieved Paragraph {preview_label} ---")
    print(first_para['text'][:500] + "...")
    print("---------------------------------------")

Navigating document:   0%|          | 0/2 [00:00<?, ?it/s]


--- Routing at Depth 0: Evaluating 20 chunks ---
LLM Reasoning: After reviewing the provided text chunks, I've identified the following relevant chunks related to the question "What are the requirements for filing a motion to compel discovery, including formatting and signatures?":

* CHUNK 5: This chunk discusses the filing and service of answers, which is related to discovery. It mentions that an answer must be filed through ESTTA, and if ESTTA is unavailable, a paper form answer may be filed with a Petition to the Director.
* CHUNK 6: This chunk mentions the discovery conference, which is a requirement for parties in Board proceedings. It cites Federal Rule of Civil Procedure 26(f) and notes that a discovery conference is necessary to discuss the scope of discovery.
* CHUNK 8: This chunk discusses the requirements for discovery, including the need to identify the information to be disclosed and the method of service. It cites 37 C.F.R. § 2.120(d) and notes that the responding party

Creating chunks:   0%|          | 0/10 [00:00<?, ?it/s]

Document split into 10 chunks.


Creating chunks:   0%|          | 0/10 [00:00<?, ?it/s]

Document split into 10 chunks.


Creating chunks:   0%|          | 0/10 [00:00<?, ?it/s]

Document split into 10 chunks.

--- Routing at Depth 1: Evaluating 30 chunks ---
LLM Reasoning: After reviewing the new text chunks, I've identified the following relevant chunks related to the question "What are the requirements for filing a motion to compel discovery, including formatting and signatures?":

* CHUNK 10: This chunk discusses the requirements for initial and expert disclosures, which is related to discovery. It cites 37 C.F.R. § 2.120(d) and notes that the responding party must provide the information in a specified format.
* CHUNK 11: This chunk discusses the requirements for discovery conferences, which is related to discovery. It cites 37 C.F.R. § 2.120(a)(2)(i) and Fed. R. Civ. P. 26(f).
* CHUNK 12: This chunk discusses the requirements for responses to requests for production, which is related to discovery. It cites Fed. R. Civ. P. 34(b)(2)(A) and notes that the party to whom the request is directed must respond in writing within 30 days.
* CHUNK 13: This chunk dis

  0%|          | 0/10 [00:00<?, ?it/s]

Document split into 10 chunks.


  0%|          | 0/10 [00:00<?, ?it/s]

Document split into 10 chunks.

Navigation finished. Returning 20 retrieved paragraphs.

--- Navigation Complete ---
Retrieved 20 paragraphs for synthesis.

--- Preview of Retrieved Paragraph 5.8.3.0 ---
10. Fed. R. Civ. P. 26(f)(3)(C). See also Fed. R. Civ. P. 34 Committee Notes on Rules – 2006 Amendment. 11. Frito-Lay North America Inc. v. Princeton Vanguard LLC, 100 USPQ2d 1904, 1905 (TTAB 2011). 12. See generally Zubulake v. UBS Warburg LLC, 220 F.R.D. 212, 216-18 (S.D.N.Y . 2003) (discussing
the obligation to preserve electronically stored information);  Busy Beauty, Inc. v. JPB Group, LLC, 2019
USPQ2d 338392, at *4 (TTAB 2019) (same). 13. Leon v. IDX Systems Corp., 464 F.3d 951, 956 (9th C...
---------------------------------------


## 5. Answer Synthesis

In [None]:
def generate_answer(question: str, paragraphs: List[Dict[str, Any]]) -> Dict[str, Any]:
  """Synthesize a cited answer to ``question`` from the retrieved paragraphs.

  Each paragraph is presented to the synthesis model under its ``display_id``
  (falling back to ``id``). Latency and token usage are appended to the
  module-level ``metrics_log``.

  Args:
    question: The user question.
    paragraphs: Retrieved paragraphs, each a dict with ``id``/``display_id``
      and ``text``.

  Returns:
    ``{"answer": str, "citations": [str, ...]}``. Citations are normalised to
    unique, sorted strings so they can be compared against ``display_id``
    values regardless of how the model typed them.
  """
  print("\n--- Generating final Answer ---")

  if not paragraphs:
    return {"answer": "I could not find relevant information to answer the question.", "citations": []}

  context = "\n\n".join([f"PARAGRAPH {p.get('display_id', p['id'])}:\n{p['text']}" for p in paragraphs])
  system_prompt = """
  You are a legal research assistant. Your task is to answer the user's question based *only* on the provided paragraphs from a legal manual.
  - Synthesize the information from the paragraphs into a clear and concise answer.
  - For every statement you make, you MUST cite the paragraph ID(s) it is based on in parentheses, like (ID: 1.2.5).
  - If the provided paragraphs do not contain enough information, state that clearly.
  - Do not use any external knowledge.
  - Respond with a JSON object containing 'answer' and 'citations' (a list of all unique IDs you cited).
  """
  user_prompt = f"""
  USER QUESTION: "{question}"

  SOURCE PARAGRAPHS:
  {context}

  Please provide your answer in the required JSON format.
  """

  messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt},
    ]


  start_time = time.time()
  response = client.chat.completions.create(model=SYNTHESIS_MODEL, messages=messages, temperature=0.0)
  latency = time.time() - start_time

  response_text = response.choices[0].message.content
  p_tokens, c_tokens = response.usage.prompt_tokens, response.usage.completion_tokens
  metrics_log.append({"step": "synthesis", "model": SYNTHESIS_MODEL, "latency_s": latency, "prompt_tokens": p_tokens, "completion_tokens": c_tokens, "total_tokens": p_tokens + c_tokens})

  parsed_output = parse_json_from_response(response_text)
  if not isinstance(parsed_output, dict):
    parsed_output = {}

  # The model may return a single id, mixed int/str ids or nested values.
  # Normalise to strings so set() and sorted() cannot raise on unhashable or
  # mutually unorderable items.
  raw_citations = parsed_output.get("citations", [])
  if isinstance(raw_citations, (str, int, float)):
    raw_citations = [raw_citations]
  elif not isinstance(raw_citations, (list, tuple, set)):
    raw_citations = []
  citations = sorted({str(c).strip() for c in raw_citations if c is not None})

  return {
    "answer": parsed_output.get("answer", "Failed to generate a valid answer."),
    "citations": citations
  }


In [None]:
# Synthesize the answer from the paragraphs retrieved by the navigator.
final_answer_result = generate_answer(sample_question, navigation_result['paragraphs'])

# Print the answer and its citations, each under its own heading.
for heading, result_key in (
    ("\n--- GENERATED ANSWER ---", 'answer'),
    ("\n--- CITATIONS ---", 'citations'),
):
    print(heading)
    print(final_answer_result[result_key])


--- Generating final Answer ---

--- GENERATED ANSWER ---
To file a motion to compel discovery, the moving party must make a good faith effort to resolve the discovery dispute prior to seeking Board intervention (5.8.7.7, 5.8.7.8). The motion must be supported by a showing that such a good faith effort was made by conference or correspondence (5.8.7.7, 5.8.7.8). The motion to compel must be signed by at least one attorney of record or the party personally, and must state the signer's address, email address, and telephone number (5.8.7.6). The signature constitutes a certification that the motion is warranted, consistent with the Federal Rules of Civil Procedure, and not unreasonable or unduly burdensome (5.8.7.7). If issues raised in the motion are subsequently resolved by agreement of the parties, the moving party should inform the Board in writing (5.8.7.7). In terms of formatting, the motion should include a copy of the set(s) of requests which together are said to exceed the limit

## 6. Qualitative Evaluation by LLM Judge

In [None]:
def evaluate_faithfulness(question: str, answer: str, citations: List[str], paragraphs: List[Dict[str, Any]]) -> Dict[str, Any]:
  """Ask the evaluation model whether ``answer`` is supported by its cited paragraphs.

  Only paragraphs whose label (``display_id``, falling back to ``id``, which is
  the same label ``generate_answer`` shows the model) appears in ``citations``
  are sent to the judge. Latency and token usage are appended to the
  module-level ``metrics_log``.

  Args:
    question: The original user question.
    answer: The generated answer to verify.
    citations: Paragraph ids the answer claims to rely on.
    paragraphs: All retrieved paragraphs.

  Returns:
    ``{"is_faithful": bool, "explanation": str}``. The verdict is coerced to a
    real boolean so that a model replying with the string ``"false"`` is not
    treated as truthy by callers.
  """
  print("\n--- Evaluating Faithfulness ---")

  if not citations or not answer:
    return {"is_faithful": False, "explanation": "No answer or citations provided."}

  # Compare labels as strings so int/str differences cannot hide a match.
  cited_labels = {str(c) for c in citations}
  cited_paragraphs = [p for p in paragraphs if str(p.get('display_id', p.get('id'))) in cited_labels]
  if not cited_paragraphs:
    return {"is_faithful": False, "explanation": f"Cited IDs {citations} not found."}

  context = "\n\n".join([f"Paragraph {p.get('display_id', p.get('id'))}:\n{p['text']}" for p in cited_paragraphs])

  prompt = f"""
  You are a meticulous fact-checker. Determine if the 'ANSWER' is fully supported by the 'SOURCE PARAGRAPHS'.
  The answer is 'faithful' only if every single piece of information it contains is directly stated or logically derived from the source paragraphs.

  QUESTION: "{question}"
  ANSWER TO VERIFY: "{answer}"
  SOURCE PARAGRAPHS:
  {context}

  Respond with a JSON object: {{"is_faithful": boolean, "explanation": "brief reasoning"}}.
  """

  start_time = time.time()
  response = client.chat.completions.create(model=EVALUATION_MODEL, messages=[{"role": "user", "content": prompt}], temperature=0.0)
  latency = time.time()-start_time

  response_text = response.choices[0].message.content
  p_tokens, c_tokens = response.usage.prompt_tokens, response.usage.completion_tokens
  metrics_log.append({"step": "eval_faithfulness", "model": EVALUATION_MODEL, "latency_s": latency, "prompt_tokens": p_tokens, "completion_tokens": c_tokens, "total_tokens": p_tokens + c_tokens})

  parsed = parse_json_from_response(response_text)
  if not isinstance(parsed, dict):
    parsed = {}

  verdict = parsed.get("is_faithful", False)
  if isinstance(verdict, str):
    verdict = verdict.strip().lower() == "true"

  # Same normalised shape as the other evaluators: fixed keys, typed values.
  return {"is_faithful": bool(verdict), "explanation": parsed.get("explanation", "")}

In [None]:
def evaluate_answer_relevance(question:str, answer:str) -> Dict[str, Any]:
  """Ask the evaluation model how well ``answer`` addresses ``question``.

  Latency and token usage are appended to the module-level ``metrics_log``.

  Args:
    question: The original user question.
    answer: The generated answer to score.

  Returns:
    ``{"score": float, "justification": str}`` with ``score`` in [0.0, 1.0].
    A missing, null or non-numeric score becomes 0.0 so that downstream
    numeric formatting and arithmetic cannot fail.
  """
  print("\n--- Evaluating Answer Relevance ---")
  prompt = f"""
  Score how well the 'ANSWER' addresses the 'ORIGINAL QUESTION' on a scale from 0.0 to 1.0.
  - A score of 1.0 means the answer completely and directly answers the question.
  - A score of 0.0 means the answer is completely irrelevant.

  ORIGINAL QUESTION: "{question}"
  ANSWER: "{answer}"

  Respond with a JSON object: {{"score": float, "justification": "brief reasoning"}}.
  """

  start_time = time.time()
  response = client.chat.completions.create(model=EVALUATION_MODEL, messages=[{"role": "user", "content": prompt}], temperature=0.0)
  latency = time.time() - start_time

  response_text = response.choices[0].message.content
  p_tokens, c_tokens = response.usage.prompt_tokens, response.usage.completion_tokens
  metrics_log.append({"step": "eval_answer_relevance", "model": EVALUATION_MODEL, "latency_s": latency, "prompt_tokens": p_tokens, "completion_tokens": c_tokens, "total_tokens": p_tokens + c_tokens})

  parsed = parse_json_from_response(response_text)
  if not isinstance(parsed, dict):
    parsed = {}

  # The model may return the score as a string ("0.9") or null.
  try:
    score = float(parsed.get("score", 0.0))
  except (TypeError, ValueError):
    score = 0.0
  if score != score:  # NaN is the only value not equal to itself
    score = 0.0
  score = min(1.0, max(0.0, score))  # keep within the requested 0.0-1.0 scale

  return {"score": score, "justification": parsed.get("justification", "")}


In [None]:
def evaluate_retrieval_relevance(question: str, paragraphs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Scores the relevance of the retrieved documents to the question.

    Only the first 500 characters of each paragraph are shown to the
    evaluation model. Latency and token usage are appended to the
    module-level ``metrics_log``.

    Args:
        question: The original user question.
        paragraphs: Retrieved paragraphs, each a dict with ``id``/``display_id``
            and ``text``.

    Returns:
        ``{"score": float, "justification": str}`` with ``score`` in
        [0.0, 1.0]. A missing, null or non-numeric score becomes 0.0 so that
        downstream numeric formatting and arithmetic cannot fail.
    """
    print("\n--- Evaluating Retrieval Relevance ---")
    context = "\n\n".join([f"PARAGRAPH {p.get('display_id', p['id'])}:\n{p['text'][:500]}..." for p in paragraphs])

    prompt = f"""
    Score how relevant the provided 'RETRIEVED PARAGRAPHS' are for answering the 'ORIGINAL QUESTION' on a scale from 0.0 to 1.0.
    - A score of 1.0 means the paragraphs contain all the necessary information.
    - A score of 0.0 means the paragraphs are completely irrelevant.

    ORIGINAL QUESTION: "{question}"
    RETRIEVED PARAGRAPHS:
    {context}

    Respond with a JSON object: {{"score": float, "justification": "brief reasoning"}}.
    """

    start_time = time.time()
    response = client.chat.completions.create(model=EVALUATION_MODEL, messages=[{"role": "user", "content": prompt}], temperature=0.0)
    latency = time.time() - start_time

    response_text = response.choices[0].message.content
    p_tokens, c_tokens = response.usage.prompt_tokens, response.usage.completion_tokens
    metrics_log.append({"step": "eval_retrieval_relevance", "model": EVALUATION_MODEL, "latency_s": latency, "prompt_tokens": p_tokens, "completion_tokens": c_tokens, "total_tokens": p_tokens + c_tokens})

    parsed = parse_json_from_response(response_text)
    if not isinstance(parsed, dict):
        parsed = {}

    # The model may return the score as a string ("0.2") or null.
    try:
        score = float(parsed.get("score", 0.0))
    except (TypeError, ValueError):
        score = 0.0
    if score != score:  # NaN is the only value not equal to itself
        score = 0.0
    score = min(1.0, max(0.0, score))  # keep within the requested 0.0-1.0 scale

    return {"score": score, "justification": parsed.get("justification", "")}

In [None]:
# Run all qualitative evaluations
faithfulness_result = evaluate_faithfulness(
    sample_question,
    final_answer_result['answer'],
    final_answer_result['citations'],
    navigation_result['paragraphs']
)

answer_relevance_result = evaluate_answer_relevance(
    sample_question,
    final_answer_result['answer']
)

retrieval_relevance_result = evaluate_retrieval_relevance(
    sample_question,
    navigation_result['paragraphs']
)

print("\n--- QUALITATIVE EVALUATION SUMMARY ---")
print(f"Faithfulness Check: {'PASSED' if faithfulness_result.get('is_faithful') else 'FAILED'}")
print(f"  -> Explanation: {faithfulness_result.get('explanation')}")
print(f"Answer Relevance Score: {answer_relevance_result.get('score'):.2f}")
print(f"  -> Justification: {answer_relevance_result.get('justification')}")
print(f"Retrieval Relevance Score: {retrieval_relevance_result.get('score'):.2f}")
print(f"  -> Justification: {retrieval_relevance_result.get('justification')}")


--- Evaluating Faithfulness ---

--- Evaluating Answer Relevance ---

--- Evaluating Retrieval Relevance ---

--- QUALITATIVE EVALUATION SUMMARY ---
Faithfulness Check: PASSED
  -> Explanation: The ANSWER accurately reflects the requirements for filing a motion to compel discovery as stated in the SOURCE PARAGRAPHS, including the need for a good faith effort to resolve disputes, the signature and certification requirements, and the formatting details such as including a copy of the requests and setting out the counting method.
Answer Relevance Score: 0.90
  -> Justification: The answer thoroughly addresses the requirements for filing a motion to compel discovery, including the need for a good faith effort, signatures, and specific formatting details. However, it references specific sections (e.g., 5.8.7.7, 5.8.3.5) without clarifying their source or jurisdiction, which could slightly reduce clarity for some readers.
Retrieval Relevance Score: 0.20
  -> Justification: The retrieved par

## 7. Final Analysis and Summary

In [None]:
# Price table used by the cost analysis, keyed by model id.
# For each model, "input" is the rate applied to prompt tokens and "output" the
# rate applied to completion tokens, per one million tokens; the resulting
# figure is stored in the `cost_usd` column.
# NOTE(review): rates are hard-coded - confirm against the provider's current pricing.
model_prices_per_million_tokens = {
    "meta-llama/Meta-Llama-3.1-8B-Instruct": {
        "input": 0.02,
        "output": 0.06
    },
    "meta-llama/Llama-3.3-70B-Instruct": {
        "input": 0.13,
        "output": 0.40
    },
    "deepseek-ai/DeepSeek-V3": {
        "input": 0.50,
        "output": 1.50
    }
}

In [None]:
if not metrics_log:
    print("No metrics were logged.")
else:
    # One row per logged LLM call.
    df_metrics = pd.DataFrame(metrics_log)

    def calculate_cost(row):
        """Return the cost of one logged call from its token counts and model rates.

        Models missing from the price table are costed at zero.
        """
        rates = model_prices_per_million_tokens.get(row['model'], {"input": 0, "output": 0})
        prompt_cost = (row['prompt_tokens'] / 1_000_000) * rates['input']
        completion_cost = (row['completion_tokens'] / 1_000_000) * rates['output']
        return prompt_cost + completion_cost

    df_metrics['cost_usd'] = df_metrics.apply(calculate_cost, axis=1)

    print("--- Per-Step Performance and Cost Analysis ---")
    print(df_metrics.to_string())

--- Per-Step Performance and Cost Analysis ---
                       step                                  model  latency_s  prompt_tokens  completion_tokens  total_tokens  cost_usd
0      route_depth_0_reason  meta-llama/Meta-Llama-3.1-8B-Instruct   9.868701           6139                380          6519  0.000146
1      route_depth_0_select  meta-llama/Meta-Llama-3.1-8B-Instruct   1.168443           6523                 18          6541  0.000132
2      route_depth_1_reason  meta-llama/Meta-Llama-3.1-8B-Instruct   9.128371           9503                461          9964  0.000218
3      route_depth_1_select  meta-llama/Meta-Llama-3.1-8B-Instruct   1.521100           9581                 15          9596  0.000193
4                 synthesis      meta-llama/Llama-3.3-70B-Instruct  11.045084           8508                333          8841  0.001239
5         eval_faithfulness                deepseek-ai/DeepSeek-V3   4.381199           3028                 80          3108  0.001634
6

In [None]:
if not metrics_log:
    print("Cannot generate summary as no metrics were logged.")
else:
    # Totals across every logged call in the per-step metrics frame.
    total_latency = df_metrics['latency_s'].sum()
    total_cost = df_metrics['cost_usd'].sum()
    total_tokens = df_metrics['total_tokens'].sum()

    # Judge verdicts; faithfulness is mapped to 1.0 / 0.0.
    faithfulness_score = 1.0 if faithfulness_result.get('is_faithful') else 0.0
    answer_relevance_score = answer_relevance_result.get('score', 0.0)
    retrieval_relevance_score = retrieval_relevance_result.get('score', 0.0)

    # Simple overall confidence: the product of the three scores.
    overall_confidence = faithfulness_score * answer_relevance_score * retrieval_relevance_score

    faithfulness_label = 'PASSED' if faithfulness_score == 1.0 else 'FAILED'

    # Single-row summary; each value is wrapped in a list to form one DataFrame row.
    summary_data = {
        'question': [sample_question],
        'total_latency_s': [total_latency],
        'total_cost_usd': [total_cost],
        'total_tokens': [total_tokens],
        'faithfulness_check': [faithfulness_label],
        'answer_relevance_score': [answer_relevance_score],
        'retrieval_relevance_score': [retrieval_relevance_score],
        'overall_confidence_score': [overall_confidence],
    }
    df_summary = pd.DataFrame(summary_data)

    print("--- Final Query Summary ---")
    # Transposed so the single row reads as a vertical key/value table.
    summary_table = df_summary.T.rename(columns={0: 'Result'})
    print(summary_table)

--- Final Query Summary ---
                                                                      Result
question                   What are the requirements for filing a motion ...
total_latency_s                                                    45.526567
total_cost_usd                                                      0.005589
total_tokens                                                           48254
faithfulness_check                                                    PASSED
answer_relevance_score                                                   0.9
retrieval_relevance_score                                                0.2
overall_confidence_score                                                0.18
