### Setup

In [1]:
import json
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import Any, Optional
from tqdm import tqdm

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model

from trulens.core import TruSession, Select, Feedback, Provider
from trulens.core.instruments import instrument
from trulens.apps.app import TruApp
from trulens.dashboard.run import run_dashboard

from ragapp.rag.med_agent_graph import MedTechAgent, DeviceEnum
from constants import (
    TOGETHER_META_LLAMA_70B_FREE,
    GROQ_LLAMA_90B,
    GROQ_GEMMA_9B,
    GROUND_TRUTH_ACTUAL_JSON,
    EMBEDDINGS_MODEL_NAME,
    LP_PLUMBER_CACHE_DIR,
    LP_DOCLING_CACHE_DIR,
    LP_PLUMBER_COLLECTION_NAME,
    LP_DOCLING_COLLECTION_NAME,
)

In [2]:
# Load variables from a local .env file into the process environment
# (presumably the API keys of the LLM providers used below — confirm).
load_dotenv()

True

### Load Data, Storages, and Agents

In [3]:
# Ground-truth samples; later cells read each entry's "question" and "ground_truth".
with open(GROUND_TRUTH_ACTUAL_JSON, encoding="utf-8") as ground_truth_file:
    ground_truth_actual_data = json.loads(ground_truth_file.read())

In [4]:
# A single embedding model is shared by both vector stores.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)

# Open the two persisted Chroma collections (Docling first, then Plumber); they differ
# only in collection name and cache directory.
docling_storage, plumber_storage = (
    Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=cache_dir,
    )
    for collection_name, cache_dir in (
        (LP_DOCLING_COLLECTION_NAME, LP_DOCLING_CACHE_DIR),
        (LP_PLUMBER_COLLECTION_NAME, LP_PLUMBER_CACHE_DIR),
    )
)

In [5]:
# Model identifiers shared by both agents.
chat_mode_name = TOGETHER_META_LLAMA_70B_FREE
device_model_name = GROQ_GEMMA_9B

# One agent per vector store (Plumber first, then Docling); apart from the storage
# they are configured identically.
plumber_med_agent, docling_med_agent = (
    MedTechAgent(
        vector_storage=storage,
        rag_model_name=chat_mode_name,
        device_model_name=device_model_name,
    )
    for storage in (plumber_storage, docling_storage)
)

### Specify Eval Samples

In [6]:
def explore_sample(storage, sample, device=None, search_pattern: Optional[str] = None):
    """Print a ground-truth sample next to the stored chunks that match it.

    Args:
        storage: Vector store exposing a Chroma-style ``get(where=..., where_document=...)``
            whose result is a dict with ``"ids"``, ``"documents"`` and ``"metadatas"``;
            each metadata entry must carry ``"pdf_title"`` and ``"page"``.
        sample: Mapping with at least ``"question"`` and ``"ground_truth"``.
        device: Value for the ``device`` metadata filter; ``None`` disables the filter.
        search_pattern: Substring the matching chunks must contain; ``None`` disables
            the document filter and the pattern-position report.

    Returns:
        None. Everything is printed.
    """
    print("Question:", sample["question"], "\n\n")
    print("Ground Truth:\n", sample["ground_truth"], "\n\n\n")

    # Only send the filters that were actually requested: the declared defaults are
    # None, and a None-valued filter is not a meaningful query.
    filters = {}
    if search_pattern is not None:
        filters["where_document"] = {"$contains": search_pattern}
    if device is not None:
        filters["where"] = {"device": device}
    get_res = storage.get(**filters)

    n_docs = len(get_res["ids"])
    print("#" * 20, f"DOCUMENTS FOUND ({n_docs})", "#" * 20)
    print(f"\t search pattern: '{search_pattern}'")
    print("\t device filter:", device)
    for i, doc in enumerate(get_res["documents"], start=1):
        print("#" * 20, f"Doc {i}", "#" * 20)
        print(doc)

    print("#" * 20, "END DOCUMENTS:", "#" * 20)

    print("\n\n\nSEARCH PATTERN EDGES IN GROUND TRUTH:")
    if search_pattern is None:
        print("\t No search pattern given")
    else:
        # rfind reports the last occurrence and returns -1 when there is none; do not
        # present that sentinel as if it were a real slice.
        start_point = sample["ground_truth"].rfind(search_pattern)
        if start_point < 0:
            print("\t Pattern edges: not found in ground truth")
        else:
            print(
                f"\t Pattern edges: {start_point}:{start_point + len(search_pattern)}"
            )
        print(f"\t Pattern: '{search_pattern}'")

    print("\nDOCS: [ pdf title | page ]")
    for doc_metadata in get_res["metadatas"]:
        print("\t", doc_metadata["pdf_title"], "|", doc_metadata["page"])

In [7]:
# Manual exploration: run one sample at a time to see which stored chunks
# (pdf title / page) contain a distinctive phrase of its ground truth.
# eval_ids = [0, 1, 10, 11, 8]

## sample 10
# The [89:111] slice of the ground truth is the phrase echoed as "search pattern"
# in the cell output.
explore_sample(
    storage=docling_storage,
    sample=ground_truth_actual_data[10],
    device=DeviceEnum.lifepak_20,
    search_pattern=ground_truth_actual_data[10]["ground_truth"][89:111],
)

## sample 0
# Kept commented out; uncomment to explore sample 0 the same way.
# explore_sample(
#     storage=docling_storage,
#     sample=ground_truth_actual_data[0],
#     device=DeviceEnum.lifepak_15,
#     search_pattern=ground_truth_actual_data[0]["ground_truth"][26:118],
# )

Question: How do I troubleshoot low volume on Lifepak 20? 


Ground Truth:
 Possible Cause and Corrective Action 
A Possible cause of low speaker volume is moisture in speaker grill holes. To resolve, wipe moisture from speaker grill and allow device to dry. 



#################### DOCUMENTS FOUND (1) ####################
	 search pattern: 'in speaker grill holes'
	 device filter: DeviceEnum.lifepak_20
#################### Doc 1 ####################
GENERAL TROUBLESHOOTING TIPS
Table 7-2 General Troubleshooting Tips
GLYPH<129> Refer to Section 4, page 4-20.. 9 Problems with pacing., Possible Cause = . 9 Problems with pacing., Corrective Action = GLYPH<129> Refer to Section 4, page 4-22.. 10 Displayed time is incorrect., Possible Cause = Time is incorrectly set.. 10 Displayed time is incorrect., Corrective Action = GLYPH<129> Change the time setting. Refer to Section 2, page 2-7.. 11 Date printed on report is incorrect., Possible Cause = Date is incorrectly set.. 11 Date printed on rep

In [8]:
# Evaluation cases. Each spec row is:
#   (index into ground_truth_actual_data,
#    (start, end) slice of the ground truth used as search pattern,
#    device,
#    [(pdf title, pages holding the ground truth), ...])
test = [
    {
        "question": ground_truth_actual_data[sample_idx]["question"],
        "ground_truth": ground_truth_actual_data[sample_idx]["ground_truth"],
        "search_pattern": ground_truth_actual_data[sample_idx]["ground_truth"][
            pattern_start:pattern_end
        ],
        "device": device,
        "docs": [
            {"pdf_title": pdf_title, "pages": pages} for pdf_title, pages in doc_specs
        ],
    }
    for sample_idx, (pattern_start, pattern_end), device, doc_specs in (
        (
            0,
            (26, 118),
            DeviceEnum.lifepak_15,
            [
                (
                    "Stryker_Physio_Control_LIFEPAK_15_Monitor_Defibrillator_3314911_036_202105.pdf",
                    [37],
                ),
            ],
        ),
        (
            1,
            (34, 57),
            DeviceEnum.lifepak_15,
            [
                (
                    "Stryker_Physio_Control_LIFEPAK_15_Monitor_Defibrillator_3314911_036_202105.pdf",
                    [90],
                ),
            ],
        ),
        (
            10,
            (89, 111),
            DeviceEnum.lifepak_20,
            [
                (
                    "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3200750_039_201404_eq00.pdf",
                    [110],
                ),
                (
                    "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3313180_008_201508.pdf",
                    [123],
                ),
            ],
        ),
        (
            11,
            (239, 267),
            DeviceEnum.lifepak_20,
            [
                (
                    "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3200750_039_201404_eq00.pdf",
                    [65],
                ),
                (
                    "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3313180_008_201508.pdf",
                    [74],
                ),
            ],
        ),
    )
]

In [9]:
## Asserts that the theoretical document metadata matches the metadata retrieved from vector storage.
## Every (device, pdf title, page) listed in `test` must hold exactly one stored chunk
## that contains the sample's search pattern.

for sample in test:
    for expected_doc in sample["docs"]:
        for page in expected_doc["pages"]:
            where_filter = {
                "$and": [
                    {"device": sample["device"].value},
                    {"pdf_title": expected_doc["pdf_title"]},
                    {"page": page},
                ]
            }
            # todo: check also `docling_storage`
            res = plumber_storage.get(
                where_document={"$contains": sample["search_pattern"]},
                where=where_filter,
            )
            n_matches = len(res["ids"])
            # Name the offending case so a failure can be fixed without re-running
            # the loop by hand.
            assert n_matches == 1, (
                f"Expected exactly 1 chunk containing {sample['search_pattern']!r} in "
                f"{expected_doc['pdf_title']} (page {page}, "
                f"device {sample['device'].value}), found {n_matches}."
            )

### Run Evaluation

In [10]:
# Start a TruLens session and reset its database — presumably so that records from
# earlier runs do not mix into this evaluation's leaderboard (confirm).
session = TruSession()
session.reset_database()

🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `TruSession` to prevent this.


Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


In [11]:
# Register MedTechAgent.run with TruLens instrumentation; the feedbacks defined later
# select this call's return value via Select.RecordCalls.run.rets.
instrument.method(MedTechAgent, "run")

In [12]:
# System prompt for the answer-relevance judge: the answer is scored against the
# ground truth on a 0-3 rubric (StandAlone.answer_relevance divides the score by 3).
system_prompt_answer_relevance = """
You are an expert evaluator for medical device technical documentation. Your task is to assess the accuracy and completeness of an AI-generated response compared to the provided ground truth answer. You will assign a score from 0 to 3 based on the given criteria.

### **Input Variables:**
- **Question**: The user's query asking about a medical device's technical details or instructions.
- **Ground Truth Answer**: The expected correct response, usually a structured set of instructions with bullet points or numbered steps.
- **Answer**: The AI-generated response that needs to be evaluated.

### **Scoring Criteria:**
- **0 (Incorrect Answer)**: The **Answer** is mostly or entirely incorrect. It may be unrelated to the **Question**, reference the wrong instructions, or address a different process.
- **1 (Partially Correct)**: The **Answer** contains some correct points but is incomplete or includes incorrect elements. It may also be **completely unstructured**, making it difficult to follow.
- **2 (Mostly Correct but Unstructured)**: The **Answer** is factually correct and includes all required points from the **Ground Truth Answer**, but it is not formatted properly (e.g., lacks bullet points, numbering, or section headers).
- **3 (Fully Correct and Well-Structured)**: The **Answer** is entirely correct, contains all required points, and follows the same structured format as the **Ground Truth Answer** (e.g., numbered steps, bullet points, section headers).

### **Evaluation Steps:**
1. Compare the **content accuracy** of the **Answer** to the **Ground Truth Answer**.
2. Check whether all key points from the **Ground Truth Answer** are included in the **Answer**.
3. Assess the **structure and formatting** (e.g., bullet points, numbered lists, sections).
4. Assign a score from 0 to 3 based on the above criteria.

### **Output Format:**
Provide your evaluation as follows:
- **Score:** [0-3]
"""
# todo: add support for returning reasoning
# - **Reasoning:** Explain why you assigned this score, highlighting missing, incorrect, or unstructured elements.

# System prompt for the hallucination judge: the answer is scored against the
# retrieved context on a 0-2 rubric (StandAlone.hallucination divides the score by 2).
system_prompt_hallucination = """
You are an expert evaluator of technical documentation for medical devices. Your task is to assess whether an AI-generated response hallucinates information—that is, whether it introduces content not supported by the provided context. Assign a score from 0 to 2 based on the criteria below.

### Input Variables:
- **Relevant Context**: Retrieved content from the documentation database that the AI had access to when generating its response.
- **Answer**: The AI-generated response to be evaluated for hallucination.

### Scoring Criteria:
- **0 — Hallucinated Answer**: The **Answer** is not based on the **Relevant Context** and includes fabricated or unsupported content.
- **1 — Partially Grounded**: The **Answer** is partially based on the **Relevant Context**, but includes additional details, reworded instructions, or differs in structure.
- **2 — Fully Grounded**: The **Answer** is entirely consistent with the **Relevant Context**, both in content and structure.

### Special Cases:
- If no **Relevant Context** is provided and the AI clearly states that it cannot answer due to a lack of information, score **2**.
- If no **Relevant Context** is provided and the AI still attempts to answer, score **0**.

### **Output Format:**
Provide your evaluation as follows:
- **Score:** [0-2]
"""

# User-message template for the hallucination judge; filled with str.format
# (placeholders: context, answer).
user_prompt_hallucination = """
Relevant Context: {context}

Answer: {answer}
"""

# User-message template for the answer-relevance judge; filled with str.format
# (placeholders: question, ground_truth_answer, answer).
user_prompt_answer_relevance = """
Question: {question}

Ground Truth Answer: {ground_truth_answer}


Answer: {answer}
"""

In [13]:
class StandAlone(Provider):
    """Custom TruLens provider implementing the three metrics of this evaluation.

    Every feedback method takes one ``response`` dict and reads these keys from it:

    * ``"question"`` - the user question;
    * ``"retrieval_result"["answer"]`` - the generated answer;
    * ``"context_documents"`` - retrieved chunks, each a dict with ``"metadata"``
      (``pdf_title``, ``page``, ``device``) and ``"page_content"``;
    * ``"state_kwargs"`` - evaluation extras: ``"ground_truth"``, ``"device"`` and
      ``"docs"`` (expected ``pdf_title`` / ``pages`` entries).

    All metrics are returned on a 0-1 scale.
    """

    class DocMeta(BaseModel):
        """Doc metadata used for matching AI responses with test docs.

        Identity (hash and equality) is the ``(pdf_title, page, device)`` triple only;
        ``content`` is deliberately ignored so that a retrieved chunk can be matched
        against an expected page that carries no content.
        """

        pdf_title: str = Field(description="PDF title.")
        page: int = Field(description="Page number.")
        device: DeviceEnum = Field(description="Device type.")
        content: Optional[str] = Field(default=None)

        def __hash__(self):
            return hash((self.pdf_title, self.page, self.device))

        def __eq__(self, other):
            # Let Python fall back to its default handling for foreign types instead
            # of raising AttributeError on a missing field.
            if not isinstance(other, StandAlone.DocMeta):
                return NotImplemented
            return (
                self.pdf_title == other.pdf_title
                and self.page == other.page
                and self.device == other.device
            )

    class EvaluatorLLMSchema(BaseModel):
        # Structured-output schema of the judge LLM. `score` defaults to None, so a
        # reply without a score has to be handled by the caller.
        score: float = Field(default=None, description="The metric score of answer.")
        # reasoning: str = Field(description="The reasoning of score.")  # todo: add support for returning reasoning

    # Judge LLM returning `EvaluatorLLMSchema`; despite its name it is used for both
    # the answer-relevance and the hallucination metric.
    answer_relevance_llm: Any = Field(default=None)
    # When True, expected page numbers are shifted by one before matching.
    is_docling_storage: bool = Field(default=False)

    def __init__(self, answer_relevance_llm, is_docling_storage=False):
        super().__init__()
        self.answer_relevance_llm = answer_relevance_llm
        self.is_docling_storage = is_docling_storage

    def context_provided(self, response: dict) -> float:
        """Return 1.0 if any retrieved chunk comes from an expected page, else 0.0."""
        retrieved_docs = self._get_docs_meta(response)
        expected_docs = self._get_actual_doc_meta(response["state_kwargs"])
        # Explicit float so the value matches the annotation.
        return float(not retrieved_docs.isdisjoint(expected_docs))

    def hallucination(self, response: dict) -> float:
        """Judge how well the answer is grounded in the relevant retrieved context.

        The relevant context is the text of every retrieved chunk that comes from an
        expected page, in retrieval order. The judge's 0-2 score is mapped to 0-1.

        Raises:
            ValueError: If the judge returns no score.
        """
        expected_docs = self._get_actual_doc_meta(response["state_kwargs"])

        # Walk the full chunk list rather than the de-duplicated set: DocMeta equality
        # ignores content, so the set keeps only one chunk per page and would hide the
        # text of the others from the judge (and has no stable order).
        relevant_chunks = [
            doc_meta.content
            for doc_meta in self._list_docs_meta(response)
            if doc_meta in expected_docs and doc_meta.content
        ]
        # dict.fromkeys drops verbatim duplicates while keeping retrieval order.
        relevant_content = "\n".join(dict.fromkeys(relevant_chunks)) or "None"

        user_prompt_formatted = user_prompt_hallucination.format(
            context=relevant_content,
            answer=response["retrieval_result"]["answer"],
        )

        llm_inputs = [
            {"role": "system", "content": system_prompt_hallucination},
            {"role": "user", "content": user_prompt_formatted},
        ]
        return self._normalized_score(llm_inputs, max_score=2)

    def answer_relevance(self, response: dict) -> float:
        """Judge the answer against the ground truth; the 0-3 score is mapped to 0-1.

        Raises:
            ValueError: If the judge returns no score.
        """
        user_prompt_formatted = user_prompt_answer_relevance.format(
            question=response["question"],
            ground_truth_answer=response["state_kwargs"]["ground_truth"],
            answer=response["retrieval_result"]["answer"],
        )

        llm_inputs = [
            {"role": "system", "content": system_prompt_answer_relevance},
            {"role": "user", "content": user_prompt_formatted},
        ]
        return self._normalized_score(llm_inputs, max_score=3)

    def _normalized_score(self, llm_inputs: list, max_score: float) -> float:
        """Invoke the judge LLM and map its rubric score onto the 0-1 range."""
        raw_score = self.answer_relevance_llm.invoke(llm_inputs).score
        if raw_score is None:
            raise ValueError("Evaluator LLM returned no score.")
        # The judge is not guaranteed to respect the rubric bounds; clamp so a stray
        # value cannot push a metric outside 0-1.
        return min(max(raw_score / max_score, 0.0), 1.0)

    def _list_docs_meta(self, response: dict) -> list[DocMeta]:
        """Return one DocMeta per retrieved chunk, in retrieval order."""
        docs_metadata = []
        for doc in response["context_documents"]:
            dm = doc["metadata"]
            docs_metadata.append(
                self.DocMeta(
                    pdf_title=dm["pdf_title"],
                    page=int(dm["page"]),
                    device=dm["device"],
                    content=doc["page_content"],
                )
            )
        return docs_metadata

    def _get_docs_meta(self, response: dict) -> set[DocMeta]:
        """Return the distinct (pdf_title, page, device) entries among retrieved chunks."""
        return set(self._list_docs_meta(response))

    def _get_actual_doc_meta(self, state_kwargs) -> set[DocMeta]:
        """Return the expected (pdf_title, page, device) entries, without content."""
        actual_docs_metadata = set()
        for doc in state_kwargs["docs"]:
            for p in doc["pages"]:
                doc_page = int(p)
                if self.is_docling_storage:
                    doc_page += 1  # issue with Docling page offset
                device_title_page = self.DocMeta(
                    pdf_title=doc["pdf_title"],
                    page=doc_page,
                    device=state_kwargs["device"],
                )
                actual_docs_metadata.add(device_title_page)
        return actual_docs_metadata

In [14]:
# Judge model for the LLM-based metrics; temperature 0.0 to keep scoring as
# repeatable as possible.
eval_llm = init_chat_model(model=GROQ_LLAMA_90B, temperature=0.0)
# Constrain the judge's reply to EvaluatorLLMSchema so the metrics can read `.score`.
eval_llm_structured = eval_llm.with_structured_output(StandAlone.EvaluatorLLMSchema)

In [15]:
### Test Answer Relevance Metric ###

# user_prompt_formatted = user_prompt_answer_relevance.format(
#     question=test[0]["question"],
#     ground_truth_answer=test[0]["ground_truth"],
#     answer="To replace the battery on the Lifepak 15, first confirm that the new battery is fully charged. Then, inspect the battery pins and contacts in the battery wells for signs of damage. Next, align the new battery so the battery clip is over the pins in the battery well, insert the end of the battery opposite the clip into the well, and press the clip end into the well until it clicks into place. It is recommended to replace batteries approximately every two years or when they show signs of damage or reduced capacity."
# )
#
#
# llm_inputs = [
#     {"role": "system", "content": system_prompt_answer_relevance},
#     {"role": "user", "content": user_prompt_formatted},
# ]
#
# res = eval_llm_structured.invoke(llm_inputs)
# res

In [16]:
def form_feedbacks(provider: StandAlone):
    """Build the TruLens feedbacks computed for every recorded ``run`` call.

    Each feedback receives the return value of the instrumented ``run`` method.

    Args:
        provider: Provider whose metric methods back the feedbacks.

    Returns:
        The feedbacks in the order: "Context Relevance", "Answer Relevance",
        "Hallucination".
    """
    metric_specs = (
        (provider.context_provided, "Context Relevance"),
        (provider.answer_relevance, "Answer Relevance"),
        (provider.hallucination, "Hallucination"),
    )
    return [
        Feedback(metric_fn, name=metric_name).on(Select.RecordCalls.run.rets)
        for metric_fn, metric_name in metric_specs
    ]

In [17]:
def evaluate_agent(
    agent: MedTechAgent,
    eval_recorder_kwargs: dict,
    samples: Optional[list] = None,
    k: int = 3,
    min_seconds_per_sample: float = 0.0,
):
    """Run the agent on every evaluation sample while TruLens records the calls.

    Args:
        agent: Agent under evaluation.
        eval_recorder_kwargs: Keyword arguments forwarded to ``TruApp``
            (e.g. ``app_name``, ``app_version``, ``feedbacks``).
        samples: Samples to run, each with ``"question"``, ``"ground_truth"``,
            ``"device"`` and ``"docs"``. Defaults to the notebook-level ``test`` list.
        k: Value passed to ``agent.run`` as ``k``.
        min_seconds_per_sample: Minimum wall-clock time spent per sample; the
            remainder is slept away to respect API rate limits. ``0.0`` (the
            default) disables throttling.

    Returns:
        None. Results are stored by the TruLens recorder.
    """
    if samples is None:
        samples = test

    tru_recorder = TruApp(agent, **eval_recorder_kwargs)

    for sample in tqdm(samples):
        inputs = {
            "question": sample["question"],
            "k": k,
            "ground_truth": sample["ground_truth"],
            "device": sample["device"],
            "docs": sample["docs"],
        }

        started_at = time.monotonic()

        # Only the recording side effect matters; the return value is not used here.
        with tru_recorder:
            agent.run(**inputs)

        ## Respect API rate limits
        remaining = min_seconds_per_sample - (time.monotonic() - started_at)
        if remaining > 0:
            time.sleep(remaining)

In [18]:
# One provider per storage backend. Both share the same judge LLM; the Docling one
# sets is_docling_storage=True so that expected page numbers get the +1 offset
# applied in StandAlone._get_actual_doc_meta.
plumber_provider = StandAlone(eval_llm_structured)

# Keyword arguments that evaluate_agent forwards to TruApp; app_name is what tells
# the two pipelines apart in the leaderboard.
plumber_recorder_kwargs = {
    "app_name": "RAG App Plumber",
    "app_version": "0.1.0",
    "feedbacks": form_feedbacks(plumber_provider),
}
docling_recorder_kwargs = {
    "app_name": "RAG App Docling",
    "app_version": "0.1.0",
    "feedbacks": form_feedbacks(
        StandAlone(eval_llm_structured, is_docling_storage=True)
    ),
}

✅ In Context Relevance, input response will be set to __record__.app.run.rets .
✅ In Answer Relevance, input response will be set to __record__.app.run.rets .
✅ In Hallucination, input response will be set to __record__.app.run.rets .
✅ In Context Relevance, input response will be set to __record__.app.run.rets .
✅ In Answer Relevance, input response will be set to __record__.app.run.rets .
✅ In Hallucination, input response will be set to __record__.app.run.rets .


In [19]:
# Evaluate the agent backed by the Plumber vector store.
evaluate_agent(plumber_med_agent, plumber_recorder_kwargs)

instrumenting <class 'ragapp.rag.med_agent_graph.MedTechAgent'> for base <class 'ragapp.rag.med_agent_graph.MedTechAgent'>
	instrumenting run


100%|██████████| 4/4 [00:38<00:00,  9.64s/it]


In [20]:
# Evaluate the agent backed by the Docling vector store.
evaluate_agent(docling_med_agent, docling_recorder_kwargs)

instrumenting <class 'ragapp.rag.med_agent_graph.MedTechAgent'> for base <class 'ragapp.rag.med_agent_graph.MedTechAgent'>
	instrumenting run


100%|██████████| 4/4 [00:30<00:00,  7.70s/it]


In [21]:
# Per-app summary of the feedback scores, latency and cost recorded above.
session.get_leaderboard()

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,Context Relevance,Hallucination,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RAG App Docling,0.1.0,0.888889,0.75,1.0,7.604631,0.0
RAG App Plumber,0.1.0,0.5,0.5,0.375,9.535199,0.0
