### Setup

In [1]:
import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.getLogger("trulens").setLevel(logging.DEBUG)

In [2]:
import json
import time
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import Any, Optional, Callable
from tqdm import tqdm

from langchain_core.documents import Document
from langchain_core.runnables import Runnable
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model

from trulens.core import TruSession, Select, Feedback, Provider
from trulens.core.instruments import instrument
from trulens.apps.app import TruApp
from trulens.dashboard.run import run_dashboard

from ragapp.rag.med_agent_graph import (
    MedTechAgent,
    DeviceEnum,
    AgentQueryState,
    RelevantDocumentSet,
)
from constants import (
    GROQ_MISTRAL_24B,
    TOGETHER_META_LLAMA_70B_FREE,
    GROQ_LLAMA_SCOUT_17B,
    GROQ_DEEPSEEK,
    GROQ_QWEN_32B,
    GROQ_LLAMA_70B,
    GROUND_TRUTH_ACTUAL_JSON,
    EMBEDDINGS_MODEL_NAME,
    LP_PLUMBER_CACHE_DIR,
    LP_DOCLING_CACHE_DIR,
    LP_PLUMBER_COLLECTION_NAME,
    LP_DOCLING_COLLECTION_NAME,
)

In [3]:
load_dotenv()

True

### Load Data && Storages && Agents

In [4]:
with open(GROUND_TRUTH_ACTUAL_JSON, "r", encoding="utf-8") as file:
    ground_truth_actual_data = json.load(file)

In [5]:
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)

docling_storage = Chroma(
    collection_name=LP_DOCLING_COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=LP_DOCLING_CACHE_DIR,
)
plumber_storage = Chroma(
    collection_name=LP_PLUMBER_COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=LP_PLUMBER_CACHE_DIR,
)

In [6]:
model_names = dict(
    query_check_model=GROQ_QWEN_32B,
    device_classifier_model=GROQ_QWEN_32B,
    paraphraser_model=GROQ_LLAMA_SCOUT_17B,
    relevance_selector_model=TOGETHER_META_LLAMA_70B_FREE,
    answer_generator_model=GROQ_LLAMA_70B,
    error_handler_model=GROQ_LLAMA_70B,
)
k_value = 5

plumber_med_agent = MedTechAgent(
    vector_storage=plumber_storage, k=k_value, **model_names
)
docling_med_agent = MedTechAgent(
    vector_storage=docling_storage, k=k_value, **model_names
)

### Specify Eval Samples

In [7]:
def explore_sample(storage, sample, device=None, search_pattern: str = None):
    print("Question:", sample["question"], "\n\n")
    print("Ground Truth:\n", sample["ground_truth"], "\n\n\n")

    get_res = storage.get(
        where_document={"$contains": search_pattern}, where={"device": device}
    )

    n_docs = len(get_res["ids"])
    print("#" * 20, f"DOCUMENTS FOUND ({n_docs})", "#" * 20)
    print(f"\t search pattern: '{search_pattern}'")
    print("\t device filter:", device)
    for i, doc in enumerate(get_res["documents"], start=1):
        print("#" * 20, f"Doc {i}", "#" * 20)
        print(doc)

    print("#" * 20, "END DOCUMENTS:", "#" * 20)

    start_point = sample["ground_truth"].rfind(search_pattern)

    print("\n\n\nSEARCH PATTERN EDGES IN GROUND TRUTH:")
    print(f"\t Pattern edges: {start_point}:{start_point + len(search_pattern)}")
    print(f"\t Pattern: '{search_pattern}'")
    print("\nDOCS: [ pdf title | page ]")
    for i, doc_metadata in enumerate(get_res["metadatas"], start=1):
        print("\t", doc_metadata["pdf_title"], "|", doc_metadata["page"])

In [8]:
ground_truth_actual_data[10]["ground_truth"][89:111]

'in speaker grill holes'

In [9]:
## Docling missed one item from the table:

# doc = docling_storage.get(
#     where={
#         "$and": [
#             {"device": DeviceEnum.lifepak_20},
#             {"pdf_title": "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3313180_008_201508.pdf"},
#             {"page": "123"}
#         ]
#     }
# )
#
# print(doc["documents"][-2])

In [10]:
# eval_ids = [0, 1, 10, 11, 8]

## sample 10
explore_sample(
    storage=docling_storage,
    sample=ground_truth_actual_data[10],
    device=DeviceEnum.lifepak_20,
    search_pattern=ground_truth_actual_data[10]["ground_truth"][89:111],
)

## sample 0
# explore_sample(
#     storage=docling_storage,
#     sample=ground_truth_actual_data[0],
#     device=DeviceEnum.lifepak_15,
#     search_pattern=ground_truth_actual_data[0]["ground_truth"][26:118],
# )

Question: How do I troubleshoot low volume on Lifepak 20? 


Ground Truth:
 Possible Cause and Corrective Action 
A Possible cause of low speaker volume is moisture in speaker grill holes. To resolve, wipe moisture from speaker grill and allow device to dry. 



#################### DOCUMENTS FOUND (1) ####################
	 search pattern: 'in speaker grill holes'
	 device filter: DeviceEnum.lifepak_20
#################### Doc 1 ####################
GENERAL TROUBLESHOOTING TIPS
Table 7-2 General Troubleshooting Tips
GLYPH<129> Refer to Section 4, page 4-20.. 9 Problems with pacing., Possible Cause = . 9 Problems with pacing., Corrective Action = GLYPH<129> Refer to Section 4, page 4-22.. 10 Displayed time is incorrect., Possible Cause = Time is incorrectly set.. 10 Displayed time is incorrect., Corrective Action = GLYPH<129> Change the time setting. Refer to Section 2, page 2-7.. 11 Date printed on report is incorrect., Possible Cause = Date is incorrectly set.. 11 Date printed on rep

In [11]:
test = [
    {
        "question": ground_truth_actual_data[0]["question"],
        "ground_truth": ground_truth_actual_data[0]["ground_truth"],
        "search_pattern": ground_truth_actual_data[0]["ground_truth"][26:118],
        "device": DeviceEnum.lifepak_15,
        "docs": [
            {
                "pdf_title": "Stryker_Physio_Control_LIFEPAK_15_Monitor_Defibrillator_3314911_036_202105.pdf",
                "pages": [37],
            }
        ],
    },
    {
        "question": ground_truth_actual_data[1]["question"],
        "ground_truth": ground_truth_actual_data[1]["ground_truth"],
        "search_pattern": ground_truth_actual_data[1]["ground_truth"][34:57],
        "device": DeviceEnum.lifepak_15,
        "docs": [
            {
                "pdf_title": "Stryker_Physio_Control_LIFEPAK_15_Monitor_Defibrillator_3314911_036_202105.pdf",
                "pages": [90],
            }
        ],
    },
    {
        "question": ground_truth_actual_data[10]["question"],
        "ground_truth": ground_truth_actual_data[10]["ground_truth"],
        "search_pattern": ground_truth_actual_data[10]["ground_truth"][89:111],
        "device": DeviceEnum.lifepak_20,
        "docs": [
            {
                "pdf_title": "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3200750_039_201404_eq00.pdf",
                "pages": [110],
            },
            {
                "pdf_title": "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3313180_008_201508.pdf",
                "pages": [123],
            },
        ],
    },
    {
        "question": ground_truth_actual_data[11]["question"],
        "ground_truth": ground_truth_actual_data[11]["ground_truth"],
        "search_pattern": ground_truth_actual_data[11]["ground_truth"][239:267],
        "device": DeviceEnum.lifepak_20,
        "docs": [
            {
                "pdf_title": "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3200750_039_201404_eq00.pdf",
                "pages": [65],
            },
            {
                "pdf_title": "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3313180_008_201508.pdf",
                "pages": [74],
            },
        ],
    },
]

In [12]:
## Asserts that the theoretical document metadata matches the metadata retrieved from vector storage.
storages = {
    "docling": docling_storage,
    "plumber": plumber_storage,
}

for t in test:
    test_docs = t["docs"]
    for i_test, d in enumerate(test_docs, start=1):
        for page in d["pages"]:
            where_filter = {
                "$and": [
                    {"device": t["device"].value},
                    {"pdf_title": d["pdf_title"]},
                    {"page": page},
                ]
            }
            for storage_name, storage in storages.items():
                res = storage.get(
                    where_document={"$contains": t["search_pattern"]},
                    where=where_filter,
                )
                try:
                    assert len(res["ids"]) == 1
                    print(f"assert ({i_test}/{len(test_docs)})")
                except AssertionError:
                    print(f"\nAssertionError ({i_test}/{len(test_docs)}):")
                    print("Storage name:", storage_name)
                    print("pdf_title:", d["pdf_title"])
                    print("device:", t["device"].value)
                    print("page:", page)
                    print("content:", t["search_pattern"], "\n")

assert (1/1)
assert (1/1)
assert (1/1)
assert (1/1)
assert (1/2)
assert (1/2)

AssertionError (2/2):
Storage name: docling
pdf_title: Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3313180_008_201508.pdf
device: LIFEPAK 20
page: 123
content: in speaker grill holes 

assert (2/2)
assert (1/2)
assert (1/2)
assert (2/2)
assert (2/2)


### Run Evaluation

In [88]:
session = TruSession()
session.reset_database()

Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


In [14]:
instrument.method(MedTechAgent, "run")

In [89]:
system_prompt_answer_relevance = """
You are an expert evaluator for medical device technical documentation. Your task is to assess the accuracy and completeness of an AI-generated response compared to the provided ground truth answer. You will assign a score from 0 to 3 based on the given criteria.

### **Input Variables:**
- **Question**: The user's query asking about a medical device's technical details or instructions.
- **Ground Truth Answer**: The expected correct response, usually a structured set of instructions with bullet points or numbered steps.
- **Answer**: The AI-generated response that needs to be evaluated.

### **Scoring Criteria:**
- **0 (Incorrect Answer)**: The **Answer** is mostly or entirely incorrect. It may be unrelated to the **Question**, reference the wrong instructions, or address a different process.
- **1 (Partially Correct)**: The **Answer** contains some correct points but is incomplete or includes incorrect elements. It may also be **completely unstructured**, making it difficult to follow.
- **2 (Mostly Correct but Unstructured)**: The **Answer** is factually correct and includes all required points from the **Ground Truth Answer**, but it is not formatted properly (e.g., lacks bullet points, numbering, or section headers).
- **3 (Fully Correct and Well-Structured)**: The **Answer** is entirely correct, contains all required points, and follows the same structured format as the **Ground Truth Answer** (e.g., numbered steps, bullet points, section headers).

### **Evaluation Steps:**
1. Compare the **content accuracy** of the **Answer** to the **Ground Truth Answer**.
2. Check whether all key points from the **Ground Truth Answer** are included in the **Answer**.
3. Assess the **structure and formatting** (e.g., bullet points, numbered lists, sections).
4. Assign a score from 0 to 3 based on the above criteria.

### **Output Format:**
Provide your evaluation as follows:
- **Score:** [0-3]
"""
# todo: add support for returning reasoning
# - **Reasoning:** Explain why you assigned this score, highlighting missing, incorrect, or unstructured elements.

system_prompt_groundedness = """
You are an expert evaluator of technical documentation for medical devices. Your task is to assess whether an AI-generated response groundedness information—that is, whether it introduces content not supported by the provided context. Assign a score from 0 to 2 based on the criteria below.

### Input Variables:
- **Relevant Context**: Retrieved content from the documentation database that the AI had access to when generating its response.
- **Answer**: The AI-generated response to be evaluated for hallucination.

### Scoring Criteria:
- **0 — Hallucinated Answer**: The **Answer** is not based on the **Relevant Context** and includes fabricated or unsupported content.
- **1 — Partially Grounded**: The **Answer** is partially based on the **Relevant Context**, but includes additional details, reworded instructions, or differs in structure.
- **2 — Fully Grounded**: The **Answer** is entirely consistent with the **Relevant Context**, both in content and structure.

### Special Cases:
- If no **Relevant Context** is provided and the AI clearly states that it cannot answer due to a lack of information, score **2**.
- If no **Relevant Context** is provided and the AI still attempts to answer, score **0**.

### **Output Format:**
Provide your evaluation as follows:
- **Score:** [0-2]
"""

system_prompt_relevant_context_selection = """
You are an expert evaluator of technical documentation for medical devices. Your task is to assess whether an AI system selected the correct relevant context from a set of retrieved documentation chunks to answer a given question.
You must assign a score from 0 to 2 based on the criteria below.

### Input Variables:
- **Retrieved Context Chunks**: All chunks retrieved from the documentation database for the current question.
- **Correct Context Chunks**: The chunks known to contain the information required to correctly answer the question.
- **Relevant Context Selected**: The subset of **Retrieved Context Chunks** that the AI selected as relevant for answering the question.

### Scoring Criteria:
- **Score 0**:
  - The **Retrieved Context Chunks** include the **Correct Context Chunks**, but the AI selected **only irrelevant chunks**.
  - Or: The **Correct Context Chunks** are **not** in the retrieved set, but the AI still selected unrelated chunks as relevant.

- **Score 1**:
  - The AI selected a mix of correct and incorrect context chunks. That is, **some** of the **Correct Context Chunks** were selected, but not all.

- **Score 2**:
  - The AI selected **all** and **only** the **Correct Context Chunks** from the retrieved set.
  - Or: The **Correct Context Chunks** were **not** present in the **Retrieved Context Chunks**, and the AI correctly selected **nothing** as relevant.

### **Output Format:**
Provide your evaluation as follows:
- **Score:** [0-2]
"""

user_prompt_relevant_context_selection = """
**Retrieved Context Chunks**:
{retrieved_context}


**Correct Context Chunks**:
{correct_context}


**Relevant Context Selected**:
{relevant_context}
"""

user_prompt_groundedness = """
Relevant Context: {context}

Answer: {answer}
"""

user_prompt_answer_relevance = """
Question: {question}

Ground Truth Answer: {ground_truth_answer}


Answer: {answer}
"""

In [90]:
class StandAlone(Provider):
    class DocMeta(BaseModel):
        """Doc metadata used for matching AI responses with test docs."""

        pdf_title: str = Field(description="PDF title.")
        page: int = Field(description="Page number.")
        device: DeviceEnum = Field(description="Device type.")
        content: Optional[str] = Field(default=None)

        def str_preview(self, index: int) -> str:
            return f"### Doc ({index}) ### \npdf title: {self.pdf_title} \npage: {self.page}"

        def __hash__(self):
            return hash((self.pdf_title, self.page, self.device))

        def __eq__(self, other):
            return (
                self.pdf_title == other.pdf_title
                and self.page == other.page
                and self.device == other.device
            )

    class EvaluatorLLMSchema(BaseModel):
        score: float = Field(default=None, description="The metric score of answer.")
        # reasoning: str = Field(description="The reasoning of score.")  # todo: add support for returning reasoning

    answer_relevance_llm: Any
    retrieve_func: Any

    def __init__(
        self,
        answer_relevance_llm: Runnable,
        retrieve_func: Callable[[str, int, str], list[Document]],
    ):
        super().__init__()
        self.answer_relevance_llm = answer_relevance_llm
        self.retrieve_func = retrieve_func

    @staticmethod
    def device_classification(response: dict) -> float:
        state = AgentQueryState(**response)
        actual_device = state.state_kwargs["device"]
        device = state.device_classification.device
        return actual_device == device

    def relevant_context_selection(self, response: dict) -> float:
        """Process retrieved, selected and actual docs"""
        state = AgentQueryState(**response)
        actual_doc_meta = self._get_actual_doc_meta(state.state_kwargs)
        response_doc_meta = self._get_docs_meta(state)
        relevant_selected = state.filtered_relevant_documents.relevant_sources

        actual_doc_str = "\n\n".join(
            [doc.str_preview(i) for i, doc in enumerate(actual_doc_meta, start=1)]
        )
        response_doc_str = "\n\n".join(
            [doc.str_preview(i) for i, doc in enumerate(response_doc_meta, start=1)]
        )

        relevant_selected_list = []
        for relevant_selected_doc in relevant_selected:
            title = relevant_selected_doc.title
            pages = ", ".join(map(str, relevant_selected_doc.pages))
            relevant_selected_list.append(f"pdf title: {title} \npages: {pages}")

        relevant_selected_str = "\n\n".join(relevant_selected_list)

        user_prompt_formatted = user_prompt_relevant_context_selection.format(
            retrieved_context=response_doc_str,
            correct_context=actual_doc_str,
            relevant_context=relevant_selected_str,
        )
        llm_inputs = [
            {"role": "system", "content": system_prompt_relevant_context_selection},
            {"role": "user", "content": user_prompt_formatted},
        ]
        score = self.answer_relevance_llm.invoke(llm_inputs).score
        return self._normalize_score(score, 2)

    def context_recall(self, response: dict) -> float:
        state = AgentQueryState(**response)
        response_doc_meta = self._get_docs_meta(state.retrieved_documents)
        relevant_doc_meta = self._get_relevant_docs_meta(state)
        actual_doc_meta = self._get_actual_doc_meta(state.state_kwargs)

        positives = response_doc_meta & actual_doc_meta
        if not positives:
            return 1

        true_positives = len(positives & relevant_doc_meta)
        return true_positives / len(positives)

    def context_precision(self, response: dict) -> float:
        state = AgentQueryState(**response)
        response_doc_meta = self._get_docs_meta(state.retrieved_documents)
        relevant_doc_meta = self._get_relevant_docs_meta(state)
        actual_doc_meta = self._get_actual_doc_meta(state.state_kwargs)

        positives = response_doc_meta & actual_doc_meta
        true_positives = len(positives & relevant_doc_meta) or 1

        negatives = response_doc_meta - actual_doc_meta
        false_positives = len(negatives & relevant_doc_meta)

        return true_positives / (true_positives + false_positives)

    def groundedness(self, response: dict) -> float:
        state = AgentQueryState(**response)
        docs_metadata = self._get_docs_meta(state.retrieved_documents)
        actual_docs_metadata = self._get_actual_doc_meta(state.state_kwargs)

        relevant_content = "\n".join(
            [
                doc_meta.content
                for doc_meta in docs_metadata
                if doc_meta in actual_docs_metadata
            ]
        )
        relevant_content = relevant_content or "None"

        user_prompt_formatted = user_prompt_groundedness.format(
            context=relevant_content, answer=state.final_response
        )

        llm_inputs = [
            {"role": "system", "content": system_prompt_groundedness},
            {"role": "user", "content": user_prompt_formatted},
        ]
        score = self.answer_relevance_llm.invoke(llm_inputs).score
        return self._normalize_score(score, 2)

    def answer_relevance(self, response: dict) -> float:
        state = AgentQueryState(**response)
        user_prompt_formatted = user_prompt_answer_relevance.format(
            question=state.paraphrased_question,
            ground_truth_answer=state.state_kwargs["ground_truth"],
            answer=state.final_response,
        )

        llm_inputs = [
            {"role": "system", "content": system_prompt_answer_relevance},
            {"role": "user", "content": user_prompt_formatted},
        ]

        score = self.answer_relevance_llm.invoke(llm_inputs).score
        return self._normalize_score(score, 3)

    def vector_store_recall(self, response, k) -> float:
        state = AgentQueryState(**response)
        actual_doc_meta = self._get_actual_doc_meta(state.state_kwargs)
        k = max(k, len(actual_doc_meta))
        query = state.paraphrased_question
        device = state.device_classification.device.value
        docs = self.retrieve_func(query, k, device)
        docs_meta = self._get_docs_meta(docs)
        true_positives = docs_meta & actual_doc_meta
        return len(true_positives) / len(actual_doc_meta)

    def _get_relevant_docs_meta(self, state: AgentQueryState) -> set[DocMeta]:
        device = state.device_classification.device
        docs_metadata = set()
        for doc in state.filtered_relevant_documents.relevant_sources:
            for doc_page in doc.pages:
                doc_meta = self.DocMeta(
                    pdf_title=doc.title,
                    page=int(doc_page),
                    device=device,
                )
                docs_metadata.add(doc_meta)
        return docs_metadata

    def _get_docs_meta(self, context_docs: list[Document]) -> set[DocMeta]:
        # context_docs = [doc for doc in state.retrieved_documents]
        docs_metadata = set()
        for doc in context_docs:
            dm = doc.metadata
            doc_meta = self.DocMeta(
                pdf_title=dm["pdf_title"],
                page=int(dm["page"]),
                device=dm["device"],
                content=doc.page_content,
            )
            docs_metadata.add(doc_meta)
        return docs_metadata

    def _get_actual_doc_meta(self, state_kwargs: dict) -> set[DocMeta]:
        actual_docs_metadata = set()
        for doc in state_kwargs["docs"]:
            for doc_page in doc["pages"]:
                device_title_page = self.DocMeta(
                    pdf_title=doc["pdf_title"],
                    page=int(doc_page),
                    device=state_kwargs["device"],
                )
                actual_docs_metadata.add(device_title_page)
        return actual_docs_metadata

    @staticmethod
    def _normalize_score(score, max_score):
        return min(1, max(0, score / max_score))

In [91]:
eval_llm = init_chat_model(model=GROQ_LLAMA_SCOUT_17B, temperature=0.0)
eval_llm_structured = eval_llm.with_structured_output(StandAlone.EvaluatorLLMSchema)

In [92]:
# ## Test Answer Relevance Metric ###
#
# user_prompt_formatted = user_prompt_answer_relevance.format(
#     question=test[0]["question"],
#     ground_truth_answer=test[0]["ground_truth"],
#     answer="To replace the battery on the Lifepak 15, first confirm that the new battery is fully charged. Then, inspect the battery pins and contacts in the battery wells for signs of damage. Next, align the new battery so the battery clip is over the pins in the battery well, insert the end of the battery opposite the clip into the well, and press the clip end into the well until it clicks into place. It is recommended to replace batteries approximately every two years or when they show signs of damage or reduced capacity."
# )
#
#
# llm_inputs = [
#     {"role": "system", "content": system_prompt_answer_relevance},
#     {"role": "user", "content": user_prompt_formatted},
# ]
#
# res = eval_llm_structured.invoke(llm_inputs)
# res

In [94]:
def form_feedbacks(provider: StandAlone):
    feedbacks = [
        Feedback(provider.device_classification, name="Device Classification").on(
            Select.RecordCalls.run.rets
        ),
        Feedback(provider.context_recall, name="Context Recall").on(
            Select.RecordCalls.run.rets
        ),
        Feedback(provider.context_precision, name="Context Precision").on(
            Select.RecordCalls.run.rets
        ),
        Feedback(provider.answer_relevance, name="Answer Relevance").on(
            Select.RecordCalls.run.rets
        ),
        Feedback(provider.groundedness, name="Groundedness").on(
            Select.RecordCalls.run.rets
        ),
        Feedback(provider.relevant_context_selection, name="Context Selected").on(
            Select.RecordCalls.run.rets
        ),
        Feedback(provider.vector_store_recall, name="VStorage Recall k=2").on(
            response=Select.RecordCalls.run.rets,
            k=Select.RecordCalls.run.args.kwargs.vector_storage_k_test.k2,
        ),
        Feedback(provider.vector_store_recall, name="VStorage Recall k=10").on(
            response=Select.RecordCalls.run.rets,
            k=Select.RecordCalls.run.args.kwargs.vector_storage_k_test.k10,
        ),
        Feedback(provider.vector_store_recall, name="VStorage Recall k=25").on(
            response=Select.RecordCalls.run.rets,
            k=Select.RecordCalls.run.args.kwargs.vector_storage_k_test.k25,
        ),
    ]
    return feedbacks

In [95]:
def evaluate_agent(agent: MedTechAgent, eval_recorder_kwargs: dict):
    tru_recorder = TruApp(agent, **eval_recorder_kwargs)

    for sample in tqdm(test):
        inputs = {
            "question": sample["question"],
            "ground_truth": sample["ground_truth"],
            "device": sample["device"],
            "docs": sample["docs"],
            "vector_storage_k_test": {
                "k2": 2,
                "k10": 10,
                "k25": 25,
            },
        }

        ## Respect API rate limits
        # start = time.time()

        with tru_recorder as _:
            _ = agent.run(**inputs)

        # time_sleep = 30 - (time.time() - start)
        # if time_sleep > 0:
        #     time.sleep(time_sleep)

In [96]:
plumber_feedbacks_provider = StandAlone(eval_llm_structured, plumber_med_agent.retrieve)
docling_feedbacks_provider = StandAlone(eval_llm_structured, docling_med_agent.retrieve)

plumber_recorder_kwargs = {
    "app_name": "RAG App Plumber",
    "feedbacks": form_feedbacks(plumber_feedbacks_provider),
}
docling_recorder_kwargs = {
    "app_name": "RAG App Docling",
    "feedbacks": form_feedbacks(docling_feedbacks_provider),
}

✅ In Device Classification, input response will be set to __record__.app.run.rets .
✅ In Context Recall, input response will be set to __record__.app.run.rets .
✅ In Context Precision, input response will be set to __record__.app.run.rets .
✅ In Answer Relevance, input response will be set to __record__.app.run.rets .
✅ In Groundedness, input response will be set to __record__.app.run.rets .
✅ In Context Selected, input response will be set to __record__.app.run.rets .
✅ In Device Classification, input response will be set to __record__.app.run.rets .
✅ In Context Recall, input response will be set to __record__.app.run.rets .
✅ In Context Precision, input response will be set to __record__.app.run.rets .
✅ In Answer Relevance, input response will be set to __record__.app.run.rets .
✅ In Groundedness, input response will be set to __record__.app.run.rets .
✅ In Context Selected, input response will be set to __record__.app.run.rets .


In [97]:
evaluate_agent(plumber_med_agent, plumber_recorder_kwargs)

instrumenting <class 'ragapp.rag.med_agent_graph.MedTechAgent'> for base <class 'ragapp.rag.med_agent_graph.MedTechAgent'>
	instrumenting run


100%|██████████| 4/4 [01:42<00:00, 25.57s/it]


In [99]:
evaluate_agent(docling_med_agent, docling_recorder_kwargs)

instrumenting <class 'ragapp.rag.med_agent_graph.MedTechAgent'> for base <class 'ragapp.rag.med_agent_graph.MedTechAgent'>
	instrumenting run


100%|██████████| 4/4 [01:33<00:00, 23.25s/it]


In [100]:
session.get_leaderboard()

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,Context Precision,Context Recall,Device Classification,Groundedness,VStorage Recall k=10,VStorage Recall k=2,VStorage Recall k=25,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
RAG App Docling,base,0.75,0.354167,1.0,1.0,0.75,0.625,0.625,0.75,22.998292,0.0
RAG App Plumber,base,0.5,0.333333,1.0,1.0,0.75,0.5,0.5,0.5,25.315782,0.0


In [25]:
run_dashboard(session)

Starting dashboard ...


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://localhost:56629 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>