In [2]:
import json
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import Any

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chat_models import init_chat_model

from ragapp.rag.med_agent_graph import MedTechAgent, DeviceEnum
from constants import (
    TOGETHER_META_LLAMA_70B_FREE,
    GROUND_TRUTH_ACTUAL_JSON,
    EMBEDDINGS_MODEL_NAME,
    INDEX_CACHE_DIR,
)

In [3]:
# Load variables from a local .env file into the process environment
# (presumably the API credentials the chat models need — confirm).
load_dotenv()

True

In [4]:
# Read the ground-truth evaluation samples; each entry is later indexed by
# position and accessed through its "question" / "ground_truth" keys.
with open(GROUND_TRUTH_ACTUAL_JSON, mode="r", encoding="utf-8") as gt_handle:
    ground_truth_actual_data = json.loads(gt_handle.read())

In [5]:
# Embedding model used by the vector store.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)

# Open the "LIFEPAK_index" collection persisted under INDEX_CACHE_DIR.
vector_store = Chroma(
    persist_directory=INDEX_CACHE_DIR,
    collection_name="LIFEPAK_index",
    embedding_function=embeddings,
)

In [11]:
def explore_sample(sample, device=None, search_pattern: str = None):
    """Print a ground-truth sample next to the indexed chunks that match it.

    Queries the module-level ``vector_store`` for chunks whose text contains
    ``search_pattern`` and whose ``device`` metadata equals ``device``, prints
    every matching chunk, then reports where the pattern sits inside the
    sample's ground truth and which PDF title / page each match comes from.

    Args:
        sample: Mapping with ``"question"`` and ``"ground_truth"`` string
            entries.
        device: Device to filter on, either an enum member or its raw value.
            ``None`` disables the metadata filter.
        search_pattern: Substring the chunk text must contain. ``None`` or an
            empty string disables the text filter (with no filter at all the
            store returns every chunk, so expect a lot of output).

    Returns:
        None. Everything is written to stdout.
    """
    print("Question:", sample["question"], "\n\n")
    print("Ground Truth:\n", sample["ground_truth"], "\n\n\n")

    # Only send filters that were actually supplied: a ``None`` operand is not
    # a valid filter value, and the defaults of this function are ``None``.
    where_document = {"$contains": search_pattern} if search_pattern else None
    # Filter on the enum's raw value, matching how the validation cell builds
    # its metadata filter (``t["device"].value``).
    where = (
        {"device": getattr(device, "value", device)} if device is not None else None
    )

    get_res = vector_store.get(where_document=where_document, where=where)

    n_docs = len(get_res["ids"])
    print("#" * 20, f"DOCUMENTS FOUND ({n_docs})", "#" * 20)
    print(f"\t search pattern: '{search_pattern}'")
    print("\t device filter:", device)
    for i, doc in enumerate(get_res["documents"], start=1):
        print("#" * 20, f"Doc {i}", "#" * 20)
        print(doc)

    print("#" * 20, "END DOCUMENTS:", "#" * 20)

    print("\n\n\nSEARCH PATTERN EDGES IN GROUND TRUTH:")
    if search_pattern:
        # rfind returns -1 on a miss; report that instead of printing a
        # meaningless "-1:<len>" slice.
        start_point = sample["ground_truth"].rfind(search_pattern)
        if start_point >= 0:
            print(
                f"\t Pattern edges: {start_point}:{start_point + len(search_pattern)}"
            )
        else:
            print("\t Pattern not found in ground truth")
    else:
        print("\t No search pattern given")

    print("\nDOCS: [ pdf title | page ]")
    for doc_metadata in get_res["metadatas"]:
        print("\t", doc_metadata["pdf_title"], "|", doc_metadata["page"])

In [13]:
# eval_ids = [0, 1, 10, 11, 8]

# Inspect sample 10: print the LIFEPAK 20 chunks that contain the phrase,
# plus the PDF title and page each of them comes from.
explore_sample(
    ground_truth_actual_data[10],
    device=DeviceEnum.lifepak_20,
    search_pattern="speaker grill holes",
)

Question: How do I troubleshoot low volume on Lifepak 20? 


Ground Truth:
 Possible Cause and Corrective Action 
A Possible cause of low speaker volume is moisture in speaker grill holes. To resolve, wipe moisture from speaker grill and allow device to dry. 



#################### DOCUMENTS FOUND (2) ####################
	 search pattern: 'speaker grill holes'
	 device filter: DeviceEnum.lifepak_20
#################### Doc 1 ####################
Maintaining the Equipment
GENERAL TROUBLESHOOTING TIPS
If a problem with the defibrillator/monitor is detected during operation or testing, refer to the
troubleshooting tips in Table7-2. If the problem cannot be corrected, remove the defibrillator/monitor
from use and contact qualified service personnel.
Table7-2 General Troubleshooting Tips
Observation Possible Cause Corrective Action
1 No power when Low battery voltage. (cid:129) Connect to AC power.
defibrillator/monitor is
turned ON.
2 Defibrillator/monitor Operating temperature is too (c

In [92]:
# Manual file names as stored in the chunks' "pdf_title" metadata.
_LP15_MANUAL = "Stryker_Physio_Control_LIFEPAK_15_Monitor_Defibrillator_3314911_036_202105.pdf"
_LP20_MANUAL_2014 = "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3200750_039_201404_eq00.pdf"
_LP20_MANUAL_2015 = "Stryker_Physio_Control_Lifepak_20_Defibrillator_Monitor_3313180_008_201508.pdf"


def _build_case(sample_idx, pattern_span, device, docs):
    """Assemble one evaluation case from a ground-truth sample.

    Args:
        sample_idx: Position of the sample in ``ground_truth_actual_data``.
        pattern_span: ``(start, stop)`` slice bounds into the sample's ground
            truth; the sliced text becomes the case's search pattern.
        device: Device the question is about.
        docs: Iterable of ``(pdf_title, pages)`` pairs naming the manual pages
            expected to contain the answer.

    Returns:
        Dict with keys ``question``, ``ground_truth``, ``search_pattern``,
        ``device`` and ``docs``.
    """
    sample = ground_truth_actual_data[sample_idx]
    span_start, span_stop = pattern_span
    return {
        "question": sample["question"],
        "ground_truth": sample["ground_truth"],
        "search_pattern": sample["ground_truth"][span_start:span_stop],
        "device": device,
        "docs": [
            {"pdf_title": pdf_title, "pages": list(pages)}
            for pdf_title, pages in docs
        ],
    }


test = [
    _build_case(0, (26, 118), DeviceEnum.lifepak_15, [(_LP15_MANUAL, [37])]),
    _build_case(1, (34, 57), DeviceEnum.lifepak_15, [(_LP15_MANUAL, [90])]),
    _build_case(
        10,
        (89, 111),
        DeviceEnum.lifepak_20,
        [(_LP20_MANUAL_2014, [110]), (_LP20_MANUAL_2015, [123])],
    ),
    _build_case(
        11,
        (239, 267),
        DeviceEnum.lifepak_20,
        [(_LP20_MANUAL_2014, [65]), (_LP20_MANUAL_2015, [74])],
    ),
]

In [106]:
# Sanity-check the evaluation cases against the index: every expected
# (device, pdf_title, page) must hold exactly one chunk containing the case's
# search pattern.
for t in test:
    for d in t["docs"]:
        for page in d["pages"]:
            where_filter = {
                "$and": [
                    {"device": t["device"].value},
                    {"pdf_title": d["pdf_title"]},
                    {"page": page},
                ]
            }
            res = vector_store.get(
                where_document={"$contains": t["search_pattern"]},
                where=where_filter,
            )
            n_hits = len(res["ids"])
            # Say which expectation failed instead of a bare AssertionError.
            assert n_hits == 1, (
                f"expected exactly 1 chunk containing {t['search_pattern']!r} "
                f"in {d['pdf_title']} page {page}, found {n_hits}"
            )

In [8]:
# The same hosted model is used for both of the agent's model arguments.
chat_mode_name = TOGETHER_META_LLAMA_70B_FREE

graph_agent = MedTechAgent(
    device_model_name=chat_mode_name,
    rag_model_name=chat_mode_name,
    vector_store=vector_store,
)

In [9]:
from trulens.core import TruSession, Select, Feedback, Provider
from trulens.core.instruments import instrument
from trulens.apps.app import TruApp

from trulens.dashboard.run import run_dashboard

In [176]:
# Open the TruLens session and reset its database before recording.
# NOTE(review): reset_database presumably discards previously stored
# apps/records, so the leaderboard reflects only this run — confirm before
# pointing this at a shared database.
session = TruSession()
session.reset_database()

Updating app_name and app_version in apps table: 0it [00:00, ?it/s]
Updating app_id in records table: 0it [00:00, ?it/s]
Updating app_json in apps table: 0it [00:00, ?it/s]


In [177]:
# Instrument MedTechAgent.run so TruLens records its calls; the feedback
# selectors read this method's return value (Select.RecordCalls.run.rets).
instrument.method(MedTechAgent, "run")

In [178]:
# System prompt for the LLM judge: a 0-3 rubric for grading a generated answer
# against the ground-truth answer. StandAlone.answer_relevance divides the
# returned score by 3.
system_prompt_answer_relevance = """
You are an expert evaluator for medical device technical documentation. Your task is to assess the accuracy and completeness of an AI-generated response compared to the provided ground truth answer. You will assign a score from 0 to 3 based on the given criteria.

### **Input Variables:**
- **Question**: The user's query asking about a medical device's technical details or instructions.
- **Ground Truth Answer**: The expected correct response, usually a structured set of instructions with bullet points or numbered steps.
- **Answer**: The AI-generated response that needs to be evaluated.

### **Scoring Criteria:**
- **0 (Incorrect Answer)**: The **Answer** is mostly or entirely incorrect. It may be unrelated to the **Question**, reference the wrong instructions, or address a different process.
- **1 (Partially Correct)**: The **Answer** contains some correct points but is incomplete or includes incorrect elements. It may also be **completely unstructured**, making it difficult to follow.
- **2 (Mostly Correct but Unstructured)**: The **Answer** is factually correct and includes all required points from the **Ground Truth Answer**, but it is not formatted properly (e.g., lacks bullet points, numbering, or section headers).
- **3 (Fully Correct and Well-Structured)**: The **Answer** is entirely correct, contains all required points, and follows the same structured format as the **Ground Truth Answer** (e.g., numbered steps, bullet points, section headers).

### **Evaluation Steps:**
1. Compare the **content accuracy** of the **Answer** to the **Ground Truth Answer**.
2. Check whether all key points from the **Ground Truth Answer** are included in the **Answer**.
3. Assess the **structure and formatting** (e.g., bullet points, numbered lists, sections).
4. Assign a score from 0 to 3 based on the above criteria.

### **Output Format:**
Provide your evaluation as follows:
- **Score:** [0-3]
"""
# Disabled output-format line: the structured schema only carries `score`.
# - **Reasoning:** Explain why you assigned this score, highlighting missing, incorrect, or unstructured elements.

# Per-sample user message; the placeholders are filled with str.format in
# StandAlone.answer_relevance.
user_prompt_answer_relevance = """
Question: {question}

Ground Truth Answer: {ground_truth_answer}


Answer: {answer}
"""


class StandAlone(Provider):
    """Feedback provider with the two custom metrics used in this evaluation.

    Both feedback functions receive the dict returned by ``MedTechAgent.run``
    and read these entries from it: ``question``, ``context_documents``,
    ``retrieval_result`` and ``state_kwargs`` (the extra keyword arguments of
    the run: ``ground_truth``, ``device`` and ``docs``).
    """

    class AnswerRelevanceSchema(BaseModel):
        """Structured output requested from the judge LLM."""

        # The default stays None so a reply without a score still parses;
        # answer_relevance rejects that case explicitly.
        score: float = Field(default=None, description="The metric score of answer.")

    # Runnable that takes chat messages and returns an AnswerRelevanceSchema.
    answer_relevance_llm: Any = Field(
        default=None, description="The answer relevance LLM."
    )

    def __init__(self, answer_relevance_llm):
        """Store the judge LLM used by ``answer_relevance``."""
        super().__init__()
        self.answer_relevance_llm = answer_relevance_llm

    @staticmethod
    def context_provided(response: dict) -> float:
        """Return 1.0 if any retrieved chunk comes from an expected page, else 0.0.

        A retrieved chunk counts as a hit when its ``(device, pdf_title, page)``
        metadata equals one of the expected triples derived from
        ``response["state_kwargs"]``.

        Args:
            response: Output of the instrumented ``run`` call.

        Returns:
            ``1.0`` when at least one retrieved chunk matches, otherwise ``0.0``.
        """
        # Devices may arrive as enum members or as raw stored values; compare
        # on the raw value so both sides are normalised the same way.
        retrieved = {
            (
                getattr(metadata["device"], "value", metadata["device"]),
                metadata["pdf_title"],
                metadata["page"],
            )
            for metadata in (
                doc["metadata"] for doc in response["context_documents"]
            )
        }

        state_kwargs = response["state_kwargs"]
        expected_device = getattr(
            state_kwargs["device"], "value", state_kwargs["device"]
        )
        expected = {
            (expected_device, doc["pdf_title"], page)
            for doc in state_kwargs["docs"]
            for page in doc["pages"]
        }

        # Return a real float, as the annotation promises.
        return float(not retrieved.isdisjoint(expected))

    def answer_relevance(self, response: dict) -> float:
        """Grade the generated answer against the ground truth with the judge LLM.

        Args:
            response: Output of the instrumented ``run`` call.

        Returns:
            The judge's 0-3 rubric score normalised to ``[0, 1]``.

        Raises:
            ValueError: If the judge reply carries no score.
        """
        user_prompt_formatted = user_prompt_answer_relevance.format(
            question=response["question"],
            ground_truth_answer=response["state_kwargs"]["ground_truth"],
            answer=response["retrieval_result"]["answer"],
        )

        llm_inputs = [
            {"role": "system", "content": system_prompt_answer_relevance},
            {"role": "user", "content": user_prompt_formatted},
        ]

        verdict = self.answer_relevance_llm.invoke(llm_inputs)
        # The schema default is None and structured-output parsing can yield
        # no object at all; fail with a clear message instead of an opaque
        # TypeError / AttributeError.
        score = getattr(verdict, "score", None)
        if score is None:
            raise ValueError(
                "Answer-relevance judge returned no score; cannot compute the metric."
            )

        # The rubric is 0-3; clamp so an out-of-range reply cannot push the
        # normalised metric outside [0, 1].
        max_score = 3.0
        return min(max(float(score), 0.0), max_score) / max_score

In [179]:
# Judge model for answer scoring, run at temperature 0.0; its replies are
# parsed into StandAlone.AnswerRelevanceSchema.
eval_llm = init_chat_model(model=TOGETHER_META_LLAMA_70B_FREE, temperature=0.0)
eval_llm_structured = eval_llm.with_structured_output(StandAlone.AnswerRelevanceSchema)

In [185]:
# user_prompt_formatted = user_prompt_answer_relevance.format(
#     question=test[0]["question"],
#     ground_truth_answer=test[0]["ground_truth"],
#     answer="To replace the battery on the Lifepak 15, first confirm that the new battery is fully charged. Then, inspect the battery pins and contacts in the battery wells for signs of damage. Next, align the new battery so the battery clip is over the pins in the battery well, insert the end of the battery opposite the clip into the well, and press the clip end into the well until it clicks into place. It is recommended to replace batteries approximately every two years or when they show signs of damage or reduced capacity."
# )
#
#
# llm_inputs = [
#     {"role": "system", "content": system_prompt_answer_relevance},
#     {"role": "user", "content": user_prompt_formatted},
# ]

# res = eval_llm_structured.invoke(llm_inputs)
# res

In [181]:
standalone = StandAlone(eval_llm_structured)

# Both feedback functions are fed the return value of the instrumented
# MedTechAgent.run call.
run_output = Select.RecordCalls.run.rets

f_context_relevance = Feedback(
    standalone.context_provided,
    name="Context Relevance",
).on(run_output)

f_answer_relevance = Feedback(
    standalone.answer_relevance,
    name="Answer Relevance",
).on(run_output)

✅ In Context Relevance, input response will be set to __record__.app.run.rets .
✅ In Answer Relevance, input response will be set to __record__.app.run.rets .


In [182]:
import time

# Minimum seconds spent per evaluated question; the loop sleeps for whatever
# part of this interval the agent call did not use.
# NOTE(review): presumably a rate-limit guard for the hosted model — confirm.
MIN_SECONDS_PER_SAMPLE = 30

recorder_kwargs = {
    "app_name": "RAG App",
    "app_version": "0.1.0",
    "feedbacks": [f_context_relevance, f_answer_relevance],
}

tru_recorder = TruApp(graph_agent, **recorder_kwargs)

# Run every evaluation case through the agent while TruLens records the call.
for t in test:
    # Monotonic clock: interval timing must not be affected by wall-clock jumps.
    start = time.monotonic()
    # ground_truth / device / docs are passed along with the question; the
    # feedback functions read them back from the run output's "state_kwargs".
    inputs = {
        "question": t["question"],
        "k": 3,
        "ground_truth": t["ground_truth"],
        "device": t["device"],
        "docs": t["docs"],
    }
    with tru_recorder as recording:
        res = graph_agent.run(**inputs)

    time_work = time.monotonic() - start
    time_sleep = MIN_SECONDS_PER_SAMPLE - time_work
    if time_sleep > 0:
        time.sleep(time_sleep)

instrumenting <class 'ragapp.rag.med_agent_graph.MedTechAgent'> for base <class 'ragapp.rag.med_agent_graph.MedTechAgent'>
	instrumenting run
	instrumenting run
	instrumenting run


In [184]:
# Show the per-app-version summary: feedback scores, latency and total cost.
session.get_leaderboard()

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,Context Relevance,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RAG App,0.1.0,0.583333,0.5,11.992667,0.0
