Let's do few-shot inference against our EHR FHIR PostgreSQL database



1.   Load and test OpenAI access
2.   Load and test EHR PostgreSQL database access
3.   Load FAISS









# 0. Mount drive and define paths

# Installs

In [None]:
%%capture
# Clean Install/Upgrade for ALL Core LLM/QLoRA Libraries
# -U ensures the latest versions (best compatibility)
# -q suppresses the long output
!pip install -U -q accelerate peft transformers trl bitsandbytes sentencepiece

In [None]:
%%capture
# --- 1) Install libraries (once per runtime) ---
!pip -q install --upgrade openai langchain langchain-community langchain-openai faiss-cpu

In [None]:
# --- 0) Mount Drive and set your variables ---
# Mounting makes the Drive folder below visible under /content/drive; every
# path constant in this cell is derived from DEV_PATH.
from google.colab import drive
drive.mount('/content/drive')

# Root folder on Drive that holds the vector store and the SQL dump.
DEV_PATH = "/content/drive/MyDrive/210_Capstone/210_Factory/210_dev"
# Folder passed to FAISS.load_local() when the vector store is loaded.
FAISS_DB_PATH = DEV_PATH + "/vectorstores/medintellagent_faiss_v1"
# SQL dump of the EHR database. The restore cell builds the same path again
# as BACKUP_PATH instead of reusing this constant.
POSTGRES_DB_PATH = DEV_PATH + "/synthea_ehr_backup.sql"  # (dump file; not used in this snippet)
# Chat model used by generate_sql().
LLM_MODEL = "gpt-4o-mini"
# Embedding model handed to OpenAIEmbeddings when loading the FAISS store.
EMBEDDING_MODEL = "text-embedding-3-large"



Mounted at /content/drive


In [None]:
# --- 2) Load your OpenAI key (recommended: Colab "Secrets" → OPENAI_API_KEY) ---
# Resolution order: Colab secret -> existing environment variable -> prompt.
import os
try:
    from google.colab import userdata
    key = userdata.get('OPENAI_API_KEY')
    if key: os.environ['OPENAI_API_KEY'] = key
except Exception:
    # Deliberate best-effort: any failure while reading the Colab secret
    # falls through to the environment check / interactive prompt below.
    pass

if not os.environ.get("OPENAI_API_KEY"):
    # getpass keeps the key from being echoed into the notebook output.
    import getpass
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OPENAI_API_KEY: ")

# Initialize OpenAI client
# No key is passed explicitly; the client is constructed after
# OPENAI_API_KEY has been placed in the environment above.
from openai import OpenAI
client = OpenAI()


In [None]:
# --- 3) Load  FAISS vector store (must use same embedding model used to build it) ---
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized objects from FAISS_DB_PATH (presumably pickle-based — confirm in
# the LangChain docs). Only point this at a store you created yourself.
vs = FAISS.load_local(FAISS_DB_PATH, embeddings, allow_dangerous_deserialization=True)

# NOTE(review): `docstore._dict` is a private attribute; it is only used here
# to report how many few-shot examples were loaded.
print("FAISS loaded. Example count:", len(vs.docstore._dict))


FAISS loaded. Example count: 90


In [None]:
# --- Prompt building helpers ---

# Instruction header placed at the top of every SQL-generation prompt by
# build_prompt(). Keep this aligned to your actual DB schema + rules.
# - The table allow-list is narrower than the database: the schema listing
#   later in this notebook also shows an `immunizations` table.
# - `patients` is allowed but has no column list under "Schema hints".
# - The text ends without a trailing newline; build_prompt() adds the
#   separator before the few-shot examples.
PREFIX = (
    "Return a single PostgreSQL SELECT only.\n"
    "Use only tables: patients, encounters, conditions, observations, medication_requests, procedures.\n"
    "Use only parameter :patient_id. Prefer DISTINCT ON with ORDER BY for 'latest per X'; no CTEs or window functions.\n"
    "Do not mix GROUP BY with DISTINCT ON. If aggregation is needed (e.g., pairing BP), use GROUP BY + MAX(CASE...).\n"
    "Important: medication_requests has no rxnorm_code (use med_name only). Encounters has no location.\n"
    "If the question mentions 'blood pressure' or 'BP', return only systolic (8480-6) and diastolic (8462-4) results and prefer paired rows grouped by effective_datetime.\n"
    "\n"
    "Schema hints:\n"
    "  conditions(display, code, onset_datetime, abatement_datetime, encounter_id, patient_id, condition_id)\n"
    "  observations(display, loinc_code, value_num, value_unit, effective_datetime, encounter_id, patient_id, observation_id)\n"
    "  medication_requests(med_name, dose, route, start_datetime, end_datetime, refills, encounter_id, patient_id, med_request_id)\n"
    "  encounters(start_datetime, end_datetime, reason_text, class, encounter_id, patient_id)\n"
    "  procedures(display, code, performed_datetime, encounter_id, patient_id, procedure_id)\n"
    "Output only the raw SQL, no markdown fences."
)

def get_few_shots(query: str, k: int = 3):
    """Return the k stored examples most similar to *query*.

    Looks the query up in the module-level FAISS store `vs` and turns each
    hit into a ``{"question": ..., "sql": ...}`` dict. The similarity score
    is discarded; a hit with no metadata or no "sql" entry yields "".
    """
    shots = []
    for doc, _score in vs.similarity_search_with_score(query, k=k):
        meta = doc.metadata or {}
        shots.append({"question": doc.page_content, "sql": meta.get("sql", "")})
    return shots

def format_examples(examples):
    """Render few-shot examples as newline-separated Question/SQL sections.

    Each example must provide "question" and "sql" keys. An empty input
    produces an empty string.
    """
    def _render(shot):
        return f"Question: {shot['question']}\nSQL:\n{shot['sql']}\n"

    return "\n".join(_render(shot) for shot in examples)

def build_prompt(user_question: str, k: int = 3) -> str:
    """Assemble the SQL-generation prompt for *user_question*.

    Layout, one section per line break: the PREFIX rules, the k retrieved
    few-shot examples, the user's question, and a trailing "SQL:" cue for
    the model to complete.
    """
    shots_text = format_examples(get_few_shots(user_question, k=k))
    sections = [PREFIX, shots_text, f"Question: {user_question}", "SQL:"]
    return "\n".join(sections)


In [None]:
# --- 5) SQL generation + a tiny safety check ---
import re

SELECT_ONLY = re.compile(r"^\s*select\b", re.IGNORECASE | re.DOTALL)

def clean_sql(text: str) -> str:
    """Strip LLM wrapping (labels, code fences, invisible characters) from SQL.

    Args:
      text: raw model output that should contain one SQL statement.

    Returns:
      The bare SQL text with surrounding whitespace removed.
    """
    # Remove BOM / zero-width characters first: they are not whitespace, so
    # left in place they would hide a leading "SQL:" label or code fence
    # from the checks below.
    s = text.replace("\ufeff", "").replace("\u200b", "").strip()

    # strip a leading "SQL:" label if present
    if s.lower().startswith("sql:"):
        s = s[4:].strip()

    # strip fenced code blocks like ```sql ... ``` or ``` ... ```
    m = re.match(r"^```(?:\s*sql)?\s*([\s\S]*?)\s*```$", s, flags=re.IGNORECASE)
    if m:
        s = m.group(1).strip()

    # Fallback for a fence followed by trailing prose: keep what lies between
    # the first and last fence, and drop the "sql" language tag that the
    # anchored pattern above would otherwise have consumed.
    if s.startswith("```") and "```" in s[3:]:
        inner = s.split("```", 1)[1].rsplit("```", 1)[0]
        s = re.sub(r"^\s*sql\b\s*", "", inner, count=1, flags=re.IGNORECASE).strip()

    return s

# Keywords that must not appear as standalone words in generated SQL.
# Word boundaries (rather than surrounding spaces) also catch keywords that
# sit next to newlines, tabs or parentheses. The list mirrors the one used by
# the execution-side check and adds INTO, because `SELECT ... INTO new_table`
# creates a table even though the statement starts with SELECT.
_UNSAFE_KEYWORDS = re.compile(
    r"\b(?:insert|update|delete|drop|alter|create|grant|revoke|truncate|copy|do|into)\b",
    re.IGNORECASE,
)


def is_safe_select(text: str) -> bool:
    """Return True when *text* is a single SELECT without write/DDL keywords.

    The check is deliberately conservative: a banned word inside a string
    literal is rejected as well, so a false "unsafe" is possible while a
    false "safe" is less likely.

    Args:
      text: raw model output; it is passed through clean_sql() first.
    """
    sql = clean_sql(text).strip()

    # reject multiple statements (a single trailing semicolon is allowed)
    if ";" in sql[:-1]:
        return False

    if not SELECT_ONLY.match(sql):
        return False

    return _UNSAFE_KEYWORDS.search(sql) is None

def generate_sql(user_question: str, k: int = 3, max_tokens: int = 400):
    """Ask the chat model for a SQL query answering *user_question*.

    Args:
      user_question: the patient's natural-language question.
      k: number of few-shot examples retrieved for the prompt.
      max_tokens: completion token limit for the model call.

    Returns:
      The model's reply with surrounding whitespace removed, or "" when the
      reply carries no text. The text is returned as generated; it is not
      cleaned or validated here.
    """
    prompt = build_prompt(user_question, k=k)
    resp = client.chat.completions.create(
        model=LLM_MODEL,
        temperature=0,
        messages=[
            {"role":"system","content":"You are a precise SQL generator for a patient portal."},
            {"role":"user","content": prompt}
        ],
        max_tokens=max_tokens,
    )
    # `content` can be None (no text in the reply); fall back to an empty
    # string so callers get a value they can validate instead of an
    # AttributeError from None.strip().
    content = resp.choices[0].message.content
    return (content or "").strip()

# Demo
demo_q = "Which medications am I currently taking?"
sql = generate_sql(demo_q, k=3)
print(sql, "\n\nSAFE:", is_safe_select(sql))


SELECT DISTINCT ON (mr.patient_id, mr.med_name)
  mr.patient_id,
  mr.med_name AS medication,
  mr.dose,
  mr.route,
  mr.start_datetime,
  mr.end_datetime,
  mr.refills
FROM medication_requests mr
WHERE mr.patient_id = :patient_id
  AND (mr.end_datetime IS NULL OR mr.end_datetime >= NOW())
ORDER BY mr.patient_id,
         mr.med_name,
         COALESCE(mr.end_datetime, mr.start_datetime) DESC NULLS LAST; 

SAFE: True


# Load PostgreSQL EHR FHIR Database

In [None]:
%%capture
!apt-get -y update
!apt-get -y install postgresql postgresql-contrib

!service postgresql start
!sudo -u postgres psql -c "ALTER USER postgres PASSWORD 'postgres';"
!sudo -u postgres createdb synthea_ehr

!echo "PostgreSQL installed, service started, user password set to 'postgres', and DB 'synthea_ehr' created."


In [None]:
import subprocess
import os

# Database connection details
DB_NAME = "synthea_ehr"
DB_USER = "postgres"
DB_PASSWORD = "postgres"
DB_HOST = "localhost"

# Path on Google Drive to the backup file
BACKUP_PATH = DEV_PATH + "/synthea_ehr_backup.sql"

def restore_database():
    """Restore the synthea_ehr database from the dump file on Google Drive.

    Drops the database (ignoring failure, e.g. when it does not exist yet),
    re-creates it, then replays BACKUP_PATH with psql. Progress and errors
    are printed rather than raised so the notebook cell always completes.
    """
    # PGPASSWORD lets the client tools authenticate without prompting.
    env = os.environ.copy()
    env['PGPASSWORD'] = DB_PASSWORD
    conn_args = ['--host', DB_HOST, '--username', DB_USER]

    try:
        print("Starting database restore...")

        # First, drop and re-create the database to ensure a clean state
        print("Dropping and re-creating the database for a clean restore...")

        # This will fail if the DB doesn't exist, so we don't check for errors
        subprocess.run(['dropdb', *conn_args, DB_NAME],
                       env=env, check=False, capture_output=True, text=True)

        subprocess.run(['createdb', *conn_args, DB_NAME],
                       env=env, check=True, capture_output=True, text=True)
        print("Database re-created successfully.")

        # Replay the backup file with psql
        process = subprocess.run(
            ['psql', *conn_args, '--dbname', DB_NAME, '--file', BACKUP_PATH],
            env=env, check=True, capture_output=True, text=True,
        )

        # By default psql keeps going after a failed statement and still
        # exits 0, so a zero exit code alone does not prove a clean restore.
        # Surface any ERROR lines it wrote to stderr.
        if "ERROR" in (process.stderr or ""):
            print("Database restore finished, but psql reported errors:")
            print(process.stderr)
        else:
            print("Database restore successful!")

    except FileNotFoundError:
        print("Error: psql or dropdb/createdb commands not found. Please ensure PostgreSQL client tools are installed.")
        print("You can try running: !apt-get update && !apt-get install -y postgresql-client")
    except subprocess.CalledProcessError as e:
        print("Error during restore process.")
        print(f"Subprocess returned non-zero exit code: {e.returncode}")
        print(f"STDOUT:\n{e.stdout}")
        print(f"STDERR:\n{e.stderr}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

restore_database()

Starting database restore...
Dropping and re-creating the database for a clean restore...
Database re-created successfully.
Database restore successful!


In [None]:
# If needed once per runtime:
# !pip -q install psycopg2-binary

import re
import psycopg2
import psycopg2.extras

# Uses your existing globals:
# DB_NAME = "synthea_ehr"
# DB_USER = "postgres"
# DB_PASSWORD = "postgres"
# DB_HOST = "localhost"

# --- helpers ---
_SELECT_ONLY = re.compile(r"^\s*select\b", re.IGNORECASE | re.DOTALL)
_BANNED = (" insert ", " update ", " delete ", " drop ", " alter ",
           " create ", " grant ", " revoke ", " truncate ", " copy ", " do ")

# match :name (not preceded by another :)
_PARAM = re.compile(r'(?<!:):([a-zA-Z_]\w*)')

def _clean_sql(text: str) -> str:
    """Remove code fences / labels and invisible chars."""
    s = (text or "").strip()
    if s.lower().startswith("sql:"):
        s = s[4:].strip()
    m = re.match(r"^```(?:\s*sql)?\s*([\s\S]*?)\s*```$", s, flags=re.IGNORECASE)
    if m:
        s = m.group(1).strip()
    return s.replace("\ufeff","").replace("\u200b","").strip()

def _is_safe_select(sql: str) -> bool:
    """Single SELECT only; no DDL/DML keywords; no mid-string semicolons.

    Keywords are matched as whole words, so they are caught next to
    newlines, tabs or parentheses and not only between spaces. The check is
    deliberately conservative: a banned word inside a string literal is
    rejected too.
    """
    s = sql.strip()
    if ";" in s[:-1]:  # allow a single trailing semicolon only
        return False
    if not _SELECT_ONLY.match(s):
        return False

    # _BANNED stores each keyword padded with spaces; strip the padding and
    # add INTO, because `SELECT ... INTO new_table` creates a table even
    # though the statement starts with SELECT.
    keywords = {b.strip() for b in _BANNED} | {"into"}
    pattern = r"\b(?:" + "|".join(re.escape(k) for k in sorted(keywords)) + r")\b"
    return re.search(pattern, s, flags=re.IGNORECASE) is None

def _to_psycopg2_named(sql: str) -> str:
    """Convert :name placeholders to %(name)s for psycopg2.

    Literal percent signs (e.g. in LIKE '%term%') are doubled first:
    psycopg2 treats a bare % as the start of a placeholder whenever a
    parameter mapping is passed to execute(), and turns %% back into %.
    """
    escaped = sql.replace("%", "%%")
    return _PARAM.sub(r"%(\1)s", escaped)

# --- main function ---
def execute_sql(sql: str, params: dict = None, timeout_ms: int = 5000):
    """
    Execute a single SELECT query safely and return rows as a list of dicts.

    The statement is cleaned, checked by _is_safe_select(), and run on a
    short-lived connection whose transactions default to read-only, so a
    statement that slips past the text checks still cannot write.

    Args:
      sql: SQL string (can use :param style placeholders, e.g., :patient_id)
      params: dict of parameters if placeholders are used (optional)
      timeout_ms: statement timeout in milliseconds (default 5000)

    Returns:
      List[Dict]: each row as a dict

    Raises:
      ValueError: if the SQL is not a single SELECT or contains a banned keyword.
    """
    raw = _clean_sql(sql)
    if not _is_safe_select(raw):
        raise ValueError("Blocked: SQL must be a single SELECT without DDL/DML.")

    q = _to_psycopg2_named(raw)
    params = params or {}

    # int() keeps anything but a plain number out of the server options string.
    session_options = (
        f"-c statement_timeout={int(timeout_ms)} "
        "-c default_transaction_read_only=on"
    )

    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        port=5432,
        connect_timeout=5,
        options=session_options
    )
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(q, params)
            return [dict(r) for r in cur.fetchall()]
    finally:
        # Always release the connection, including when the query fails.
        conn.close()


In [None]:
# Test

# Simple
rows = execute_sql("SELECT COUNT(*) AS n FROM patients;")
print(rows)


[{'n': 111}]


In [None]:
rows = execute_sql("select patient_id from patients;")
print(rows[:3])

[{'patient_id': '8c8e1c9a-b310-43c6-33a7-ad11bad21c40'}, {'patient_id': '782001bc-f712-50ae-04f5-9a488f3ef4aa'}, {'patient_id': '80e7f50a-3e99-d5ac-cf97-f8a4b4f9e6c7'}]


In [None]:
execute_sql("""
SELECT column_name
FROM information_schema.columns
WHERE table_schema='public' AND table_name='encounters'
ORDER BY 1;
""")

[{'column_name': 'class'},
 {'column_name': 'encounter_id'},
 {'column_name': 'end_datetime'},
 {'column_name': 'patient_id'},
 {'column_name': 'reason_text'},
 {'column_name': 'start_datetime'}]

In [None]:
execute_sql("""
SELECT table_name, column_name
FROM information_schema.columns
WHERE table_schema='public'
ORDER BY 1;
""")

[{'table_name': 'conditions', 'column_name': 'onset_datetime'},
 {'table_name': 'conditions', 'column_name': 'patient_id'},
 {'table_name': 'conditions', 'column_name': 'encounter_id'},
 {'table_name': 'conditions', 'column_name': 'code'},
 {'table_name': 'conditions', 'column_name': 'display'},
 {'table_name': 'conditions', 'column_name': 'condition_id'},
 {'table_name': 'conditions', 'column_name': 'abatement_datetime'},
 {'table_name': 'encounters', 'column_name': 'reason_text'},
 {'table_name': 'encounters', 'column_name': 'start_datetime'},
 {'table_name': 'encounters', 'column_name': 'end_datetime'},
 {'table_name': 'encounters', 'column_name': 'encounter_id'},
 {'table_name': 'encounters', 'column_name': 'patient_id'},
 {'table_name': 'encounters', 'column_name': 'class'},
 {'table_name': 'immunizations', 'column_name': 'base_cost'},
 {'table_name': 'immunizations', 'column_name': 'code'},
 {'table_name': 'immunizations', 'column_name': 'date'},
 {'table_name': 'immunizations', 

In [None]:
# With parameter (your canonical pattern)
q = """
SELECT DISTINCT ON (mr.patient_id, mr.med_name)
  mr.patient_id,
  mr.med_name AS medication,
  mr.dose,
  mr.route,
  mr.start_datetime,
  mr.end_datetime,
  mr.refills
FROM medication_requests mr
WHERE mr.patient_id = :patient_id
  AND (mr.end_datetime IS NULL OR mr.end_datetime >= NOW())
ORDER BY mr.patient_id,
         mr.med_name,
         COALESCE(mr.end_datetime, mr.start_datetime) DESC NULLS LAST;

"""
rows = execute_sql(q, {"patient_id": '8c8e1c9a-b310-43c6-33a7-ad11bad21c40'})
print(rows[:3])

[{'patient_id': '8c8e1c9a-b310-43c6-33a7-ad11bad21c40', 'medication': 'Acetaminophen 325 MG Oral Tablet', 'dose': None, 'route': None, 'start_datetime': None, 'end_datetime': None, 'refills': None}, {'patient_id': '8c8e1c9a-b310-43c6-33a7-ad11bad21c40', 'medication': 'Naproxen sodium 220 MG Oral Tablet', 'dose': None, 'route': None, 'start_datetime': None, 'end_datetime': None, 'refills': None}]


# Tie LLM output to query results from the PostgreSQL database

In [None]:
def answer_patient_question(user_question: str, patient_id: str, k: int = 3, max_tokens: int = 400 ):
    """Generate SQL for a patient's question and run it for that patient.

    Args:
      user_question: the patient's natural-language question.
      patient_id: value bound to the :patient_id placeholder.
      k: number of few-shot examples used when generating the SQL.
      max_tokens: completion token limit for SQL generation.

    Returns:
      (sql, rows): the generated SQL text and the rows returned by execute_sql().
    """
    generated = generate_sql(user_question, k=k, max_tokens=max_tokens)
    bound_params = {"patient_id": patient_id}
    return generated, execute_sql(generated, bound_params)


In [None]:
execute_sql("select p.patient_id from patients p")

[{'patient_id': '8c8e1c9a-b310-43c6-33a7-ad11bad21c40'},
 {'patient_id': '782001bc-f712-50ae-04f5-9a488f3ef4aa'},
 {'patient_id': '80e7f50a-3e99-d5ac-cf97-f8a4b4f9e6c7'},
 {'patient_id': 'edc17058-55fb-08c7-12df-ece93a402e50'},
 {'patient_id': '9f9dbdcb-23a1-82cc-b7bc-e0e420a95bd1'},
 {'patient_id': 'be874504-c868-ebfd-9a77-df6b1e5ff6cc'},
 {'patient_id': '30e48e16-2df7-207e-7a3d-1650ef0c1ed8'},
 {'patient_id': '57b21dea-ff00-6c3e-92d9-91c7627f53b2'},
 {'patient_id': 'a3d34c1f-5421-e078-38ec-1498a5941dbe'},
 {'patient_id': 'e83fe1b3-f94f-5591-f851-1da300e24e99'},
 {'patient_id': 'e6705c33-7349-8b12-484d-3b1f93227178'},
 {'patient_id': '2da86d63-34ae-b887-ddff-8f6f1e6990f1'},
 {'patient_id': '04181caa-fcc1-c6c8-743e-a903eff368de'},
 {'patient_id': '20802592-1c31-7339-4c4c-2fe648e1a716'},
 {'patient_id': '406e8bad-81b5-7624-5b8a-4aeeb74028f5'},
 {'patient_id': 'a331b5bc-cbea-a205-a8bf-dbf3255ef36a'},
 {'patient_id': '641efcda-7397-4172-c6ac-8231342fa53e'},
 {'patient_id': 'e64918a6-528c-

In [None]:
sql = (
    "SELECT e.patient_id, e.start_datetime, e.end_datetime, "
    "e.class AS encounter_class, e.reason_text AS reason "
    "FROM encounters e "
    "WHERE e.reason_text IS NOT NULL;"
)
execute_sql(sql)

[]

In [None]:
import pandas as pd

patient_id = '0fca905f-391c-08d3-4b93-b53f69b9da53'
user_q = "List all my vital signs"

sql, rows = answer_patient_question(user_q, patient_id, k=5, max_tokens=1000)
print("Generated SQL:\n", sql, "\n")
print("Rows:", len(rows))
if rows:
    display(pd.DataFrame(rows).head(10))

Generated SQL:
 SELECT
  o.patient_id,
  COALESCE(o.display, o.loinc_code) AS vital_name,
  o.value_num AS value,
  o.value_unit AS unit,
  o.effective_datetime
FROM observations AS o
WHERE o.patient_id = :patient_id
  AND (
    o.loinc_code IN ('8480-6','8462-4','8867-4','9279-1','8310-5','59408-5','29463-7','39156-5','8302-2')
    OR LOWER(o.display) IN (
      'systolic blood pressure','diastolic blood pressure',
      'heart rate','respiratory rate','body temperature',
      'oxygen saturation','body weight','bmi','body mass index','body height'
    )
  )
ORDER BY
  o.effective_datetime DESC NULLS LAST,
  COALESCE(o.display, o.loinc_code); 

Rows: 56


Unnamed: 0,patient_id,vital_name,value,unit,effective_datetime
0,0fca905f-391c-08d3-4b93-b53f69b9da53,Body Height,183.4,cm,2025-04-21 14:19:34+00:00
1,0fca905f-391c-08d3-4b93-b53f69b9da53,Body mass index (BMI) [Ratio],28.35,kg/m2,2025-04-21 14:19:34+00:00
2,0fca905f-391c-08d3-4b93-b53f69b9da53,Body Weight,95.4,kg,2025-04-21 14:19:34+00:00
3,0fca905f-391c-08d3-4b93-b53f69b9da53,Heart rate,69.0,/min,2025-04-21 14:19:34+00:00
4,0fca905f-391c-08d3-4b93-b53f69b9da53,Respiratory rate,14.0,/min,2025-04-21 14:19:34+00:00
5,0fca905f-391c-08d3-4b93-b53f69b9da53,Body Height,183.4,cm,2024-04-15 14:19:34+00:00
6,0fca905f-391c-08d3-4b93-b53f69b9da53,Body mass index (BMI) [Ratio],28.26,kg/m2,2024-04-15 14:19:34+00:00
7,0fca905f-391c-08d3-4b93-b53f69b9da53,Body Weight,95.1,kg,2024-04-15 14:19:34+00:00
8,0fca905f-391c-08d3-4b93-b53f69b9da53,Heart rate,83.0,/min,2024-04-15 14:19:34+00:00
9,0fca905f-391c-08d3-4b93-b53f69b9da53,Respiratory rate,15.0,/min,2024-04-15 14:19:34+00:00


# After the DataFrame is built, the LLM generates a readable summary

In [None]:
import pandas as pd
from openai import OpenAI
import io

client = OpenAI()  # assumes OPENAI_API_KEY is set


In [None]:
def df_to_csv_for_llm(df: pd.DataFrame, max_rows: int = 200, null_marker: str = "—") -> tuple[str, bool]:
    """
    Serialize a DataFrame to CSV text suitable for an LLM prompt.

    - Only the first max_rows rows are kept when the frame is longer.
    - Missing values are written as null_marker (default "—").
    - The index is omitted; column order is unchanged.
    - The input frame is not modified.

    Returns (csv_text, truncated_flag).
    """
    truncated = len(df) > max_rows
    visible = df.head(max_rows) if truncated else df

    out = io.StringIO()
    # fillna() returns a new frame, so the caller's data stays untouched
    visible.fillna(null_marker).to_csv(out, index=False)
    return out.getvalue(), truncated

def summarize_df_with_llm(
    df: pd.DataFrame,
    patient_id: str,
    model: str = "gpt-4o-mini",
    max_rows: int = 200,
    null_marker: str = "—",
    max_tokens: int = 500
) -> str:
    """
    Ask the LLM for a short natural-language summary of a DataFrame.

    The table is sent as CSV (at most max_rows rows, missing values shown as
    null_marker) together with an explicit statement of whether it was
    truncated, so the model does not have to guess.

    Args:
      df: query result to summarize.
      patient_id: identifier echoed into the prompt.
      model: chat model name.
      max_rows: maximum number of rows included in the prompt.
      null_marker: text used for missing values.
      max_tokens: completion token limit.

    Returns:
      The summary text, a fixed message when df is None or empty, or "" when
      the reply carries no text.
    """
    if df is None or df.empty:
        return "No data found for the requested query."

    total_rows = len(df)
    shown_rows = min(total_rows, max_rows)
    csv_text, truncated = df_to_csv_for_llm(df, max_rows=max_rows, null_marker=null_marker)
    # str() so non-string column labels (e.g. integers) do not break the join
    columns_csv = ",".join(str(col) for col in df.columns)

    # Spell out both cases: when the prompt only flagged the truncated case,
    # the model reported truncation for small, complete tables.
    if truncated:
        truncation_note = f"yes (first {shown_rows} of {total_rows} rows)"
    else:
        truncation_note = "no (all rows are shown)"

    user_prompt = f"""
You are a precise medical data summarizer. Use only the table below.
- Do not invent values or fields.
- Call out trends, counts, notable recency, and any obvious gaps (fields marked "{null_marker}").
- Keep it concise (2–5 sentences).
- Mention truncation only if "Truncated" below says yes; otherwise treat the table as complete.

Patient: {patient_id}
Columns: {columns_csv}
Rows shown: {shown_rows}
Truncated: {truncation_note}

CSV:
{csv_text}
""".strip()

    resp = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": "You are a precise, conservative medical data summarizer."},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=max_tokens,
    )
    # `content` can be None when the reply carries no text.
    return (resp.choices[0].message.content or "").strip()


In [None]:
# ##
# patient_id = '0fca905f-391c-08d3-4b93-b53f69b9da53'
# user_q = "What has been my highest weight"

# sql, rows = answer_patient_question(user_q, patient_id, k=5, max_tokens=1000)
# print("Generated SQL:\n", sql, "\n")
# print("Rows:", len(rows))
# # if rows:
# #     display(pd.DataFrame(rows).head(10))
# df = pd.DataFrame(rows)
# display(df.head(10))
# # 3) Summarize with the LLM
# summary = summarize_df_with_llm(df, patient_id="<REAL-PATIENT-ID>", model=LLM_MODEL)
# print(summary)

# Evaluation



1.   For now we use OpenAI as the LLM both for generating the optimized response summary and for scoring the evaluation. Down the line we may switch to another model, e.g. Claude.




In [None]:
%%capture
pip install -U langgraph

In [None]:
%%capture
pip install -U langchain_core langchain_openai

In [None]:
from pydantic import BaseModel, Field

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [None]:
from typing import TypedDict
from typing import Literal
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate


def generate_empathy_score(user_query: str, response: str):
    """Score how empathetic *response* is as a reply to *user_query*.

    Uses the module-level `llm` with a structured-output schema so the model
    returns a score from "1" to "5" plus written feedback.

    Args:
      user_query: the patient's original question.
      response: the system-generated answer to evaluate.

    Returns:
      {"empathy_score": <"1".."5">, "feedback": <str>} on success, or None
      when the evaluation call fails (the error is printed).
    """

    # Schema for structured output to use in evaluation
    class Feedback(BaseModel):
        score: Literal["1", "2", "3", "4", "5"] = Field(
            description="Provide a score for how empathetic is the response. 5 is highest empathy."
        )
        # The description tells the model what to write in this field; it
        # previously repeated the score instruction instead of asking for
        # an explanation.
        feedback: str = Field(
            description="Explain the reasoning behind the score. Score of 1: response has no empathy. Score of 5: response has excellent empathy for a human."
        )

    evaluator = llm.with_structured_output(Feedback)

    # --- CREATE PROMPT TEMPLATE ---

    # Define the evaluation instructions using a system message and placeholders
    prompt_template = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are an expert, highly empathetic evaluator.
                Your task is to judge the system generated response to a user (patient) query.
                You must provide a score (1-5) and detailed feedback on the empathy level of the response, strictly following the provided JSON schema.

                --- Context ---
                User Query: {user_query}
                Response to Evaluate: {response}
                """
            ),
            ("human", "Evaluate the response for empathy and provide the JSON output.")
        ]
    )

    # Values substituted into the {user_query} / {response} placeholders
    prompt_data = {
        "user_query": user_query,
        "response": response
    }

    # --- RUN EVALUATION ---

    # Create the runnable chain: Prompt -> LLM with Structured Output
    evaluation_chain = prompt_template | evaluator

    # Invoke the chain
    try:
        evaluation_result: Feedback = evaluation_chain.invoke(prompt_data)

        # Output the result
        print("--- EVALUATION RESULT ---")
        print(f"Empathy Score: {evaluation_result.score}/5")
        print(f"Feedback: {evaluation_result.feedback}")

        return {"empathy_score": evaluation_result.score, "feedback": evaluation_result.feedback}
    except Exception as e:
        # Best-effort notebook helper: report the failure instead of raising.
        print(f"An error occurred during evaluation: {e}")
        return None




In [None]:
#Testing of empathy score

patient_id = '0fca905f-391c-08d3-4b93-b53f69b9da53'
user_q = 'which medication am I taking?'

sql, rows = answer_patient_question(user_q, patient_id, k=5, max_tokens=1000)
df = pd.DataFrame(rows)
summary = summarize_df_with_llm(df, patient_id=patient_id, model=LLM_MODEL)


In [None]:
summary

'The table shows medication records for a single patient (ID: 0fca905f-391c-08d3-4b93-b53f69b9da53) with five entries. All medications listed, including Acetaminophen, oxyCODONE, and amLODIPine, have a dose of 1.0, but the route, start and end dates, and refills are not provided (marked as "—"). Notably, there is a gap for the Naproxen sodium entry, which lacks dosage information. The data appears truncated, showing only 5 rows.'

In [None]:
generate_empathy_score(user_q,summary)

--- EVALUATION RESULT ---
Empathy Score: 1/5
Feedback: The response lacks any empathetic language or acknowledgment of the user's situation. It is purely factual and does not address the user's need for clarity or support regarding their medication. There is no attempt to connect with the user emotionally or to provide reassurance, which is essential in a healthcare context.


{'empathy_score': '1',
 'feedback': "The response lacks any empathetic language or acknowledgment of the user's situation. It is purely factual and does not address the user's need for clarity or support regarding their medication. There is no attempt to connect with the user emotionally or to provide reassurance, which is essential in a healthcare context."}

# Now after these trial runs we aspire to optimize the empathy score

In [None]:
from typing import TypedDict, Literal, Optional
import pandas as pd
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage # NEW IMPORT
from langgraph.graph import StateGraph, START, END

# --- 1. CONFIGURATION AND SCHEMA ---

# Note: This requires the OPENAI_API_KEY environment variable to be set.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# 1.1 Graph State (TypedDict)
class PatientState(TypedDict):
    """The state of the conversation and evaluation."""
    # The patient's question; read by both the generator and evaluator nodes.
    patient_query: str
    # Markdown text of the patient's DataFrame (filled via DataFrame.to_markdown()).
    patient_data: str # Using str to hold Markdown representation of DataFrame
    # Latest generated reply; overwritten on every generation pass.
    response: str
    # Score assigned by the evaluator node; initialised to 0 before the first pass.
    empathy_score: int
    # Evaluator feedback; cleared by the generator node, set by the evaluator node.
    evaluation_feedback: str
    # Number of generations completed so far (incremented after each generation).
    retry_count: int

# 1.2 Structured Output Schema (Pydantic)
class EmpathyEvaluation(BaseModel):
    """Schema for the structured output of the evaluation LLM."""
    # Integer score restricted to 1..5; the router compares it against 3.
    score: Literal[1, 2, 3, 4, 5] = Field(
        description="The empathy score for the response (1=low empathy, 5=high empathy)."
    )
    # Free-text feedback; quoted back to the generator in its retry prompt.
    feedback: str = Field(
        description="Constructive, actionable feedback on how to improve the response's empathy, especially if the score is 3 or less."
    )

# Augment the LLM for structured evaluation output
evaluator_llm = llm.with_structured_output(EmpathyEvaluation)


# --- 2. GRAPH NODES ---

def generate_response(state: PatientState) -> dict:
    """
    Generator node: write (or rewrite) the reply to the patient.

    On the first pass the model is simply asked for a response. On later
    passes the previous score and evaluator feedback are quoted so the model
    can address them.

    Returns the state update: the new response, retry_count incremented by
    one, and evaluation_feedback reset to "" ahead of the next evaluation.
    """
    attempts_so_far = state["retry_count"]
    print(f"\n--- GENERATING RESPONSE (Retry: {attempts_so_far}) ---")

    # Persona and ground rules for the assistant
    persona = (
        "You are a highly empathetic and professional clinical assistant. "
        "Your primary goal is to acknowledge the patient's feelings and provide clear, gentle, and helpful next steps. "
        "The current patient query and their data are provided below. Respond with empathy first."
    )

    # Question plus data snapshot, sent on every pass
    context = f"Patient Query: {state['patient_query']}\nPatient Data Snapshot:\n{state['patient_data']}"

    # Default instruction for the first pass; replaced when retrying
    instruction = "Generate a response."
    if attempts_so_far > 0:
        instruction = (
            f"RETRY RESPONSE. The previous attempt scored {state['empathy_score']}/5. "
            f"The feedback was: '{state['evaluation_feedback']}'. "
            f"Write a new response that explicitly addresses this feedback and improves empathy."
        )

    reply = llm.invoke([
        SystemMessage(content=persona),
        HumanMessage(content=context),
        HumanMessage(content=instruction),
    ])

    return {
        "response": reply.content,
        "retry_count": attempts_so_far + 1,
        "evaluation_feedback": "",
    }


def evaluate_response(state: PatientState) -> dict:
    """
    Evaluator node: score the current response for empathy.

    Sends the patient query and the generated response to the structured
    evaluator LLM and returns the state update holding its score and feedback.
    """
    print("\n--- EVALUATING RESPONSE ---")

    # Rubric first, then the material to judge
    prompt_parts = [
        "Evaluate the following clinical assistant response for empathy on a scale of 1 to 5. ",
        "Score 1 means cold/robotic. Score 5 means outstanding empathy and warmth. ",
        f"Patient Query: {state['patient_query']}\n",
        f"Assistant Response: {state['response']}",
    ]
    verdict: EmpathyEvaluation = evaluator_llm.invoke("".join(prompt_parts))

    print(f"-> Score: {verdict.score}/5")

    update = {}
    update["empathy_score"] = verdict.score
    update["evaluation_feedback"] = verdict.feedback
    return update


def route_to_retry(state: PatientState, max_retries: int = 3) -> str:
    """
    Conditional Edge: Decides whether to loop back for regeneration or end.

    Args:
        state: current graph state; reads empathy_score and retry_count.
        max_retries: maximum number of generation attempts (default 3).

    Returns:
        "end" when the score is above 3 or the attempt limit is reached,
        otherwise "retry".
    """
    score = state["empathy_score"]

    if score > 3:
        print("\n*** ROUTE: END (Score > 3) ***")
        return "end"
    elif state["retry_count"] >= max_retries:
        # Safety limit to prevent infinite loops
        print(f"\n*** ROUTE: END (Max Retries ({max_retries}) Reached) ***")
        return "end"
    else:
        print("\n*** ROUTE: RETRY (Score <= 3) ***")
        return "retry"


# --- 3. WORKFLOW ASSEMBLY AND MAIN FUNCTION ---

def optimize_patient_response(query: str, data: pd.DataFrame, max_retries=3):
    """
    Main function to run the optimization graph.

    Args:
        query (str): The patient's input question or concern.
        data (pd.DataFrame): Relevant patient data.
        max_retries (int): Maximum number of attempts before stopping.

    Returns:
        dict: The final state of the graph, including the optimized response.
    """

    # Bind the caller's limit into the router so the loop stops after
    # max_retries attempts rather than a value fixed inside the router.
    def _route(state: PatientState) -> str:
        return route_to_retry(state, max_retries=max_retries)

    # Initialize the graph builder
    optimizer_builder = StateGraph(PatientState)

    # Add the nodes (actions)
    optimizer_builder.add_node("generator", generate_response)
    optimizer_builder.add_node("evaluator", evaluate_response)

    # Define the entry point
    optimizer_builder.add_edge(START, "generator")

    # Define the flow from generation to evaluation
    optimizer_builder.add_edge("generator", "evaluator")

    # Define the conditional loop after evaluation
    optimizer_builder.add_conditional_edges(
        "evaluator",
        _route,
        {
            "retry": "generator", # Loop back to generator
            "end": END,           # Stop the workflow
        }
    )

    # Compile the workflow
    optimizer_workflow = optimizer_builder.compile()

    # Initial state preparation
    initial_state = {
        "patient_query": query,
        "patient_data": data.to_markdown(), # Convert the whole DataFrame to a readable string
        "response": "",
        "empathy_score": 0,
        "evaluation_feedback": "",
        "retry_count": 0
    }

    print(f"--- STARTING OPTIMIZATION (Max Retries: {max_retries}) ---")

    # Every attempt runs two nodes (generator + evaluator), so the step budget
    # must cover 2 * attempts. A smaller limit aborts the graph with a
    # recursion error before the router gets the chance to end the loop.
    recursion_limit = 2 * max(1, max_retries) + 2

    # Invoke the workflow
    final_state = optimizer_workflow.invoke(initial_state, {"recursion_limit": recursion_limit})

    return final_state

In [None]:
# Drive the generate -> evaluate -> retry loop for the current question and data.
# max_retries=3: will stop after 3 attempts, even if score < 4
optimized_result = optimize_patient_response(query=user_q, data=df, max_retries=3)

# Banner followed by the query, final empathy score and final response text.
print(
    "\n==============================================",
    "      FINAL OPTIMIZED RESPONSE & RESULTS",
    "==============================================",
    f"Query: {optimized_result['patient_query']}",
    f"Final Score: {optimized_result['empathy_score']}/5",
    f"Final Response:\n{optimized_result['response']}",
    sep="\n",
)

--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---
-> Score: 4/5

*** ROUTE: END (Score > 3) ***

      FINAL OPTIMIZED RESPONSE & RESULTS
Query: which medication am I taking?
Final Score: 4/5
Final Response:
I understand that it can be a bit confusing to keep track of your medications, and I'm here to help you with that. Based on your records, you are currently taking the following medications:

1. **Acetaminophen 325 MG Oral Tablet (Tylenol)**
2. **Acetaminophen 325 MG / OxyCODONE Hydrochloride 5 MG Oral Tablet**
3. **Amlodipine 2.5 MG Oral Tablet**
4. **Naproxen Sodium 220 MG Oral Tablet**

If you have any questions about these medications, such as their purpose or how to take them, please feel free to ask. It's important to feel comfortable and informed about your treatment.


**Test 2**




**Test 3**

**Test 4**

**Test 5**

**Test 6**

**Test 7**

**Test 8**

**Test 9**

# LLM-as-a-Judge (LJ): scoring the empathy-optimized response



*   Relevance of response to patient question



In [None]:
# Sample question/answer pair used to preview the judge prompts in this notebook.
user_question = "When was my last cholesterol test?"
portal_response = "Your most recent cholesterol test was on May 2, 2024, showing normal results."

# Relevance judge prompt (f-string: the sample pair above is interpolated immediately).
# The output-format section shows a real JSON object ({{ }} render as literal braces),
# matching the Faithfulness and Helpfulness prompts; it previously showed a bullet
# list while asking for "strictly a JSON object".
judge_prompt_R = f"""
You are an expert judge that evaluates the quality of a response provided by a patient question-answering LLM system to a patient’s query.
Your task is to provide a Relevance score (1–5) for how well the system’s response answers the patient’s question about their own medical history or records.

***Criterion Definition***
RELEVANCE: The degree to which the response directly and appropriately addresses the specific patient question about their medical record, history, medications, lab results, appointments, or related data.

***Score Definitions and Examples***

1: NOT RELEVANT
- Meaning: The response does not relate to the patient’s question.
- Example:
  Q: “What medications am I currently taking?”
  A: “You should eat a balanced diet.”
  → Not relevant.

2: SLIGHTLY RELEVANT
- Meaning: The response touches the general topic but not the patient’s record.
- Example:
  Q: “When was my last blood test?”
  A: “Blood tests are usually done annually.”
  → Only generic information.

3: MODERATELY RELEVANT
- Meaning: The response partially answers or hints at relevant data but misses key specifics.
- Example:
  Q: “What medications am I currently taking?”
  A: “You are prescribed metformin, but I’m not sure about your other medications.”
  → Partial relevance.

4: HIGHLY RELEVANT
- Meaning: The response clearly addresses the question using mostly correct patient-specific data, minor omissions acceptable.
- Example:
  Q: “When was my last lab test?”
  A: “Your most recent lab test was in March 2024 for blood glucose.”
  → Very good relevance.

5: PERFECTLY RELEVANT
- Meaning: The response is direct, complete, and fully focused on the patient’s question and personal data.
- Example:
  Q: “What medications am I currently taking?”
  A: “You are currently taking Metformin 500 mg twice daily and Lisinopril 10 mg once daily.”
  → Excellent relevance.

Now evaluate the following pair:

User Question: {user_question}
Portal Response: {portal_response}

***Output Format***
Provide your output strictly as a JSON object:

{{
  "User Question": "{user_question}",
  "Portal Response": "{portal_response}",
  "Relevance Score": <integer between 1 and 5>
}}

Do not include explanations, reasoning, or any text outside this JSON object.
"""


In [None]:
# Render the retrieved rows as a markdown table (without the index column) for the judge.
df_as_text = df.to_markdown(index=False)

# Build the full Faithfulness evaluation prompt (f-string: values are interpolated now).
# The records are wrapped in a properly closed ```markdown fence and each section has
# its own ***header***; previously the fence was never closed, so the response to
# evaluate and the output format read as part of the retrieved records.
judge_prompt_F = f"""You are an expert medical data evaluator assessing the factual correctness of a patient-portal system's response.

Your task is to provide a Faithfulness score (1–5) indicating how accurately the response reflects the factual information in the retrieved records.

***Criterion Definition***
FAITHFULNESS: The degree to which the response content is factually consistent with the evidence provided in the retrieved records below.
Do NOT judge relevance or helpfulness — only whether the statements are supported by the retrieved data.

***Score Definitions (with Domain-Relevant Examples)***

1 — NOT FAITHFUL
- Meaning: Mostly incorrect or hallucinated; contradicts the records.
- Example Evidence (Medications):
  | medication | dose  | freq |
  |------------|-------|------|
  | Metformin  | 500mg | BID  |
- Example Response: "You’re taking Metformin 500 mg BID and Atorvastatin 20 mg QD."
- Why: Atorvastatin is not in evidence → contradiction.

2 — SLIGHTLY FAITHFUL
- Meaning: Some correct facts, but significant unsupported claims remain.
- Example Evidence (Labs):
  | test | date       | result |
  |------|------------|--------|
  | A1C  | 2024-03-10 | 6.9%   |
- Example Response: "Your latest A1C on 2024-03-10 was 6.9%, and your LDL was 85 mg/dL."
- Why: A1C detail is correct; LDL is unsupported → major error persists.

3 — MODERATELY FAITHFUL
- Meaning: Mostly supported, but one or two factual inaccuracies (e.g., wrong dose/date).
- Example Evidence (Medications):
  | medication | dose | freq |
  |------------|------|------|
  | Lisinopril | 10mg | QD   |
- Example Response: "You take Lisinopril 5 mg once daily."
- Why: Correct med and frequency; dose is wrong → partial mismatch.

4 — HIGHLY FAITHFUL
- Meaning: Essentially correct; only minor discrepancy/omission (e.g., missing a less salient detail).
- Example Evidence (Appointments):
  | type             | date       | status    |
  |------------------|------------|-----------|
  | Cardio follow-up | 2025-06-14 | completed |
- Example Response: "Your last cardiology follow-up was in June 2025."
- Why: Month/year correct; missing exact day → minor omission.

5 — PERFECTLY FAITHFUL
- Meaning: Fully supported; no contradictions or unsupported claims.
- Example Evidence (Medications):
  | medication | dose  | freq |
  |------------|-------|------|
  | Metformin  | 500mg | BID  |
  | Lisinopril | 10mg  | QD   |
- Example Response: "You are taking Metformin 500 mg twice daily and Lisinopril 10 mg once daily."
- Why: Every stated fact matches the records exactly.

***Retrieved Records***
```markdown
{df_as_text}
```

***Response to Evaluate***
{portal_response}

***Output Format***
Return ONLY the following JSON object (no extra text):

{{
"User Question": "{user_question}",
"Portal Response": "{portal_response}",
"Faithfulness Score": <integer between 1 and 5>
}}

***Judging Rules***
- Use ONLY the provided records for verification; do not infer or assume missing facts.
- Formatting differences (e.g., "twice daily" vs "BID") are acceptable if semantically identical.
- If no evidence supports a claimed fact, treat it as unsupported.
"""

In [None]:
# Build the Helpfulness evaluation prompt.
# This is an f-string: `user_question` and `portal_response` must already be defined,
# and they are interpolated at assignment time. Doubled braces ({{ }}) render as the
# literal braces of the JSON object the judge is asked to return.
judge_prompt_H = f"""
You are an expert medical communication evaluator. Judge how HELPFUL the patient-portal system's response is to the patient.

Your task is to provide a Helpfulness score (1–5) indicating whether the response meaningfully helps the patient act on or understand their own medical record question.

***Criterion Definition***
HELPFULNESS: The degree to which the response is immediately useful to the patient—i.e., it answers the question clearly and provides actionable guidance (when appropriate), next steps, or pointers to where the patient can find the information in their portal. Do NOT judge factual correctness (that's Faithfulness) or topicality (that's Relevance).

***Score Definitions (with Patient-Portal Examples)***

1 — NOT HELPFUL
- Meaning: Off-topic, evasive, or gives no usable guidance.
- Example Q: "When was my last cholesterol test?"
- Example A: "Cholesterol is a type of fat in the blood."
- Why: No answer, no steps.

2 — SLIGHTLY HELPFUL
- Meaning: Some topical info but still not actionable or patient-focused.
- Example Q: "What meds am I currently taking?"
- Example A: "Medications are often used to manage chronic conditions."
- Why: General info; provides no path to the patient’s own info.

3 — MODERATELY HELPFUL
- Meaning: Partially useful; gives a hint or partial answer but lacks clarity or next steps.
- Example Q: "Do I have any upcoming appointments?"
- Example A: "You can check upcoming visits in the Appointments section."
- Why: Direction is given, but no direct status or concrete steps (e.g., exact path).

4 — HIGHLY HELPFUL
- Meaning: Clear, patient-centered answer with minor omissions; includes concise steps or references.
- Example Q: "Where can I see my vaccine history?"
- Example A: "Open **Health Records → Immunizations** to view your vaccine history. If you don’t see recent entries, message your care team from **Messages → New Message**."
- Why: Concrete portal path and next step.

5 — PERFECTLY HELPFUL
- Meaning: Direct, concise answer tailored to the question, with clear steps and guardrails when appropriate.
- Example Q: "How can I refill my Metformin?"
- Example A: "Go to **Medications → Metformin → Refill**. If 'Refill' isn’t available, use **Messages → Pharmacy Request** or call the number on your prescription label. If you’re out of doses today, contact your pharmacy for an emergency supply."
- Why: Immediate action path, fallback, and safety guidance.

***Judging Rules***
- Focus ONLY on usefulness to the patient: clarity, directness, actionable steps, and appropriate guardrails.
- Prefer concise, structured guidance (e.g., portal navigation paths, who to contact, what to click).
- Do not penalize for not repeating data that belongs in the chart; penalize if it fails to guide the patient to it.
- Don’t reward unnecessary medical theory/jargon if it doesn’t help the patient do something.
- If the response can’t answer due to missing data, helpfulness increases when it explicitly states that and gives exact next steps to obtain it (e.g., where to look or whom to message).
- Safety: Encourage contacting clinicians for urgent symptoms; do not judge medical accuracy here.

***Input Pair to Evaluate***
User Question:
{user_question}

Portal Response:
{portal_response}

***Output Format***
Return ONLY the following JSON object (no extra text):

{{
  "User Question": "{user_question}",
  "Portal Response": "{portal_response}",
  "Helpfulness Score": <integer between 1 and 5>
}}
"""


# MedGemma-27B-text-it



*   MedGemma-27B-text-it (instruction-tuned) is more accurate as a judge than Mistral 7B
*   Loaded with 4-bit NF4 (QLoRA-style) quantization on an A100 40GB



In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login # Import the login function

# Log in to Hugging Face using the token stored in Colab Secrets (key: HF_TOKEN).
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN') # Assuming you have stored your HF token in Colab Secrets
    if hf_token:
        login(token=hf_token)
    else:
        print("Hugging Face token not found in Colab Secrets. Please add it as 'HF_TOKEN'.")
except Exception:
    # Best effort: outside Colab (or when the secret cannot be read) just report it.
    print("Could not retrieve Hugging Face token from Colab Secrets.")
    # Fallback to manual input if needed, but not recommended for security
    # import getpass
    # hf_token = getpass.getpass("Enter your Hugging Face token: ")
    # login(token=hf_token)


model_id = "google/medgemma-27b-text-it"  # or the exact HF ID you use

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_cfg = BitsAndBytesConfig(load_in_4bit=True,
                             bnb_4bit_quant_type="nf4",
                             bnb_4bit_compute_dtype=torch.bfloat16,
                             bnb_4bit_use_double_quant=True)

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_cfg,
    # `torch_dtype` is deprecated in the installed transformers release
    # (it warns "Use `dtype` instead!"), so use the new keyword.
    dtype=torch.bfloat16
)



tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/931 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

model-00007-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00005-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00006-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00011.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00004-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00008-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00009-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00010-of-00011.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00011-of-00011.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/11 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
def chat(prompt, max_new_tokens=256, temperature=0.0, return_full_text=True):
    """Generate a completion for ``prompt`` with the loaded model and tokenizer.

    Args:
        prompt: Text passed to the tokenizer as-is.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: ``<= 0`` selects greedy decoding; any positive value
            enables sampling at that temperature.
        return_full_text: When True (default, the historical behavior) the
            decoded text contains the prompt followed by the completion.
            When False only the newly generated tokens are decoded.

    Returns:
        The decoded text with special tokens stripped.
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)

    # Greedy decoding when temperature is 0.0, sampling otherwise.
    do_sample_setting = temperature > 0.0

    gen_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample_setting}
    if do_sample_setting:
        # `temperature` is a sampling-only flag; passing 0.0 alongside
        # do_sample=False is not a valid sampling temperature, so it is
        # forwarded only when sampling is actually enabled.
        gen_kwargs["temperature"] = temperature

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    sequence = out[0]
    if not return_full_text:
        # Drop the echoed prompt tokens and keep only the continuation.
        sequence = sequence[inputs["input_ids"].shape[-1]:]
    return tok.decode(sequence, skip_special_tokens=True)

# Example: Run with greedy decoding (temperature=0.0)
# response_greedy = chat("What is the primary treatment for strep throat?", temperature=0.0)

# Example: Run with sampling (temperature=0.7)
# response_creative = chat("Write a short poem about a doctor's morning rounds.", temperature=0.7)

In [None]:
import pandas as pd
def create_payload(patient_id:str, user_q:str, k=5, max_tokens=1000, model=LLM_MODEL):
    """Answer a patient question and collect both the plain and the empathy-optimized reply.

    Args:
        patient_id: Patient identifier passed to the SQL answerer and the summarizer.
        user_q: The patient's question.
        k: Forwarded to ``answer_patient_question``.
        max_tokens: Forwarded to ``answer_patient_question``.
        model: Model name used for the non-optimized summary (defaults to ``LLM_MODEL``).

    Returns:
        dict with keys ``sql`` (generated query), ``df`` (result rows as a
        markdown table), ``non_opt_resp`` (summary before empathy
        optimization), ``opt_score`` (final empathy score) and
        ``opt_response`` (empathy-optimized response).
    """
    sql, rows = answer_patient_question(user_q, patient_id, k, max_tokens)
    df = pd.DataFrame(rows)

    # Honor the caller's `model` argument (it was previously ignored in favor
    # of the global LLM_MODEL, which is still the default).
    non_optimized_response = summarize_df_with_llm(df, patient_id=patient_id, model=model)

    optimized_result = optimize_patient_response(
        query=user_q,
        data=df,
        max_retries=3,  # Will stop after 3 attempts, even if score < 4
    )

    # Named `payload` rather than `dict` so the builtin is not shadowed.
    payload = {
        "sql": sql,
        "df": df.to_markdown(),
        "non_opt_resp": non_optimized_response,
        "opt_score": optimized_result['empathy_score'],
        "opt_response": optimized_result['response'],
    }

    return payload


In [None]:
# ---------- Relevance ----------
# str.format template (not an f-string). Placeholders: {user_question} and
# {portal_response}. Doubled braces ({{ }}) become the literal braces of the
# JSON object the judge is asked to return.
judge_prompt_R = """
You are an expert judge that evaluates the quality of a response provided by a patient question-answering LLM system to a patient's query.
Your task is to provide a Relevance score (1–5) for how well the system's response answers the patient's question about their own medical history or records.

***Criterion Definition***
RELEVANCE: The degree to which the response directly and appropriately addresses the specific patient question about their medical record, history, medications, lab results, appointments, or related data.

***Score Definitions and Examples***

1: NOT RELEVANT
- Meaning: The response does not relate to the patient's question.
- Example:
  Q: "What medications am I currently taking?"
  A: "You should eat a balanced diet."
  → Not relevant.

2: SLIGHTLY RELEVANT
- Meaning: The response touches the general topic but not the patient's record.
- Example:
  Q: "When was my last blood test?"
  A: "Blood tests are usually done annually."
  → Only generic information.

3: MODERATELY RELEVANT
- Meaning: The response partially answers or hints at relevant data but misses key specifics.
- Example:
  Q: "What medications am I currently taking?"
  A: "You are prescribed metformin, but I'm not sure about your other medications."
  → Partial relevance.

4: HIGHLY RELEVANT
- Meaning: The response clearly addresses the question using mostly correct patient-specific data, minor omissions acceptable.
- Example:
  Q: "When was my last lab test?"
  A: "Your most recent lab test was in March 2024 for blood glucose."
  → Very good relevance.

5: PERFECTLY RELEVANT
- Meaning: The response is direct, complete, and fully focused on the patient's question and personal data.
- Example:
  Q: "What medications am I currently taking?"
  A: "You are currently taking Metformin 500 mg twice daily and Lisinopril 10 mg once daily."
  → Excellent relevance.

Now evaluate the following pair:

User Question: {user_question}
Portal Response: {portal_response}

***Output Format***
Provide your output strictly as a JSON object:

{{
  "User Question": "{user_question}",
  "Relevance Score": <integer between 1 and 5>
}}

ONLY output the JSON object. Ensure all string values are on a single line and properly escaped.

Do not include explanations, reasoning, or any text outside this JSON object.
"""

# ---------- Faithfulness ----------
# str.format template. Placeholders: {df_as_text} (retrieved rows as a markdown
# table), {portal_response} and {user_question}; {{ }} are literal JSON braces.
# The records sit inside a properly closed ```markdown fence and every section has
# a ***header***; previously the fence was left open, so the response to evaluate
# and the output format read as part of the retrieved records.
judge_prompt_F = """
You are an expert medical data evaluator assessing the factual correctness of a patient-portal system's response.

Your task is to provide a Faithfulness score (1–5) indicating how accurately the response reflects the factual information in the retrieved records.

***Criterion Definition***
FAITHFULNESS: The degree to which the response content is factually consistent with the evidence provided in the retrieved records below.
Do NOT judge relevance or helpfulness — only whether the statements are supported by the retrieved data.

***Score Definitions (with Domain-Relevant Examples)***

1 — NOT FAITHFUL
- Meaning: Mostly incorrect or hallucinated; contradicts the records.
- Example Evidence (Medications):
  | medication | dose  | freq |
  |------------|-------|------|
  | Metformin  | 500mg | BID  |
- Example Response: "You’re taking Metformin 500 mg BID and Atorvastatin 20 mg QD."
- Why: Atorvastatin is not in evidence → contradiction.

2 — SLIGHTLY FAITHFUL
- Meaning: Some correct facts, but significant unsupported claims remain.
- Example Evidence (Labs):
  | test | date       | result |
  |------|------------|--------|
  | A1C  | 2024-03-10 | 6.9%   |
- Example Response: "Your latest A1C on 2024-03-10 was 6.9%, and your LDL was 85 mg/dL."
- Why: A1C detail is correct; LDL is unsupported → major error persists.

3 — MODERATELY FAITHFUL
- Meaning: Mostly supported, but one or two factual inaccuracies (e.g., wrong dose/date).
- Example Evidence (Medications):
  | medication | dose | freq |
  |------------|------|------|
  | Lisinopril | 10mg | QD   |
- Example Response: "You take Lisinopril 5 mg once daily."
- Why: Correct med and frequency; dose is wrong → partial mismatch.

4 — HIGHLY FAITHFUL
- Meaning: Essentially correct; only minor discrepancy/omission (e.g., missing a less salient detail).
- Example Evidence (Appointments):
  | type             | date       | status    |
  |------------------|------------|-----------|
  | Cardio follow-up | 2025-06-14 | completed |
- Example Response: "Your last cardiology follow-up was in June 2025."
- Why: Month/year correct; missing exact day → minor omission.

5 — PERFECTLY FAITHFUL
- Meaning: Fully supported; no contradictions or unsupported claims.
- Example Evidence (Medications):
  | medication | dose  | freq |
  |------------|-------|------|
  | Metformin  | 500mg | BID  |
  | Lisinopril | 10mg  | QD   |
- Example Response: "You are taking Metformin 500 mg twice daily and Lisinopril 10 mg once daily."
- Why: Every stated fact matches the records exactly.

***Retrieved Records***
```markdown
{df_as_text}
```

***Response to Evaluate***
{portal_response}

***Output Format***
Return ONLY the following JSON object (no extra text):

{{
"User Question": "{user_question}",
"Faithfulness Score": <integer between 1 and 5>
}}

ONLY output the JSON object. Ensure all string values are on a single line and properly escaped.

***Judging Rules***
- Use ONLY the provided records for verification; do not infer or assume missing facts.
- Formatting differences (e.g., "twice daily" vs "BID") are acceptable if semantically identical.
- If no evidence supports a claimed fact, treat it as unsupported.
"""
#---------- Helpfulness ----------
# str.format template (not an f-string). Placeholders: {user_question} and
# {portal_response}. Doubled braces ({{ }}) become the literal braces of the
# JSON object the judge is asked to return.
judge_prompt_H = """
You are an expert medical communication evaluator. Judge how HELPFUL the patient-portal system's response is to the patient.
Your task is to provide a Helpfulness score (1–5) indicating whether the response meaningfully helps the patient act on or understand their own medical record question.

Criterion Definition
HELPFULNESS: The degree to which the response is immediately useful to the patient—i.e., it answers the question clearly and provides actionable guidance (when appropriate), next steps, or pointers to where the patient can find the information in their portal. Do NOT judge factual correctness (that's Faithfulness) or topicality (that's Relevance).

Score Definitions (with Patient-Portal Examples)

1 — NOT HELPFUL

Meaning: Off-topic, evasive, or gives no usable guidance.
Example Q: "When was my last cholesterol test?"
Example A: "Cholesterol is a type of fat in the blood."
Why: No answer, no steps.
2 — SLIGHTLY HELPFUL
Meaning: Some topical info but still not actionable or patient-focused.
Example Q: "What meds am I currently taking?"
Example A: "Medications are often used to manage chronic conditions."
Why: General info; provides no path to the patient's own info.
3 — MODERATELY HELPFUL
Meaning: Partially useful; gives a hint or partial answer but lacks clarity or next steps.
Example Q: "Do I have any upcoming appointments?"
Example A: "You can check upcoming visits in the Appointments section."
Why: Direction is given, but no direct status or concrete steps (e.g., exact path).
4 — HIGHLY HELPFUL
Meaning: Clear, patient-centered answer with minor omissions; includes concise steps or references.
Example Q: "Where can I see my vaccine history?"
Example A: "Open Health Records → Immunizations to view your vaccine history. If you don't see recent entries, message your care team from Messages → New Message."
Why: Concrete portal path and next step.
5 — PERFECTLY HELPFUL
Meaning: Direct, concise answer tailored to the question, with clear steps and guardrails when appropriate.
Example Q: "How can I refill my Metformin?"
Example A: "Go to Medications → Metformin → Refill. If 'Refill' isn't available, use Messages → Pharmacy Request or call the number on your prescription label. If you're out of doses today, contact your pharmacy for an emergency supply."
Why: Immediate action path, fallback, and safety guidance.
Judging Rules
Focus ONLY on usefulness to the patient: clarity, directness, actionable steps, and appropriate guardrails.
Prefer concise, structured guidance (e.g., portal navigation paths, who to contact, what to click).
Do not penalize for not repeating data that belongs in the chart; penalize if it fails to guide the patient to it.
Don't reward unnecessary medical theory/jargon if it doesn't help the patient do something.
If the response can't answer due to missing data, helpfulness increases when it explicitly states that and gives exact next steps to obtain it (e.g., where to look or whom to message).
Safety: Encourage contacting clinicians for urgent symptoms; do not judge medical accuracy here.
Input Pair to Evaluate
User Question:
{user_question}
Portal Response:
{portal_response}

Output Format
Return ONLY the following JSON object (no extra text):

{{
"User Question": "{user_question}",
"Helpfulness Score": <integer between 1 and 5>
}}

ONLY output the JSON object. Ensure all string values are on a single line and properly escaped.
"""

In [None]:
# Run medGemma

import json
import re

import json
import re

def _decode_json_candidates(text: str):
    """Decode every top-level JSON object/array embedded in ``text``.

    Returns a ``(values, first_error)`` tuple: ``values`` lists the decoded
    values in order of appearance, ``first_error`` is the first
    ``JSONDecodeError`` hit while probing (``None`` if there was none).
    """
    # strict=False lets raw control characters (e.g. real newlines) appear
    # inside JSON strings, which LLM output frequently contains.
    decoder = json.JSONDecoder(strict=False)
    opener = re.compile(r"[{\[]")

    values = []
    first_error = None
    pos = 0
    while True:
        match = opener.search(text, pos)
        if match is None:
            break
        try:
            value, end = decoder.raw_decode(text, match.start())
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
            pos = match.start() + 1  # not valid here; try the next bracket
        else:
            values.append(value)
            pos = end  # skip past the decoded value (and anything nested in it)
    return values, first_error


def safe_json(s: str) -> dict:
    """
    Safely extracts and parses a JSON object from an LLM's string output.

    The text is scanned for embedded JSON values, so surrounding prose,
    markdown fences, or an echoed prompt containing a non-JSON template do not
    prevent extraction. When several values are present, the last JSON object
    is returned (the model's answer normally comes last); an array is returned
    only if no object was found.

    Parsing is attempted on the text as-is first. Only if that yields nothing
    are single quotes swapped for double quotes (Python-dict style output), so
    apostrophes inside valid JSON strings are no longer corrupted. Structural
    newlines are left alone; escaping them used to break any multi-line JSON.

    Returns:
        The parsed JSON value on success. On failure a dict with ``raw`` (the
        stripped input text) and ``error`` (the reason).
    """
    text = "" if s is None else str(s).strip()

    if "{" not in text and "[" not in text:
        return {"raw": text, "error": "No JSON object found."}

    first_error = None
    for candidate in (text, text.replace("'", '"')):
        values, error = _decode_json_candidates(candidate)
        if values:
            objects = [v for v in values if isinstance(v, dict)]
            return (objects or values)[-1]
        if first_error is None:
            first_error = error

    return {"raw": text, "error": str(first_error)}


def run_judge_prompts(user_question, portal_response, df_as_text):
    """Run Relevance, Faithfulness, and Helpfulness prompts on MedGemma.

    Args:
        user_question: The patient's question.
        portal_response: The system response being judged.
        df_as_text: Retrieved records rendered as text (Faithfulness only).

    Returns:
        dict keyed by "Relevance", "Faithfulness" and "Helpfulness". Each
        value is the result of ``safe_json`` on the model reply; dict results
        always carry a ``raw`` entry (the unmodified reply when parsing
        succeeded) so consumers that read ``['raw']`` work either way.
    """
    prompts = {
        "Relevance": judge_prompt_R.format(user_question=user_question,
                                           portal_response=portal_response),
        "Faithfulness": judge_prompt_F.format(user_question=user_question,
                                              portal_response=portal_response,
                                              df_as_text=df_as_text),
        "Helpfulness": judge_prompt_H.format(user_question=user_question,
                                             portal_response=portal_response),
    }

    result = {}
    for criterion, prompt in prompts.items():
        reply = chat(prompt)
        parsed = safe_json(reply)
        if isinstance(parsed, dict):
            # Keep the model text available even when parsing succeeded.
            parsed.setdefault("raw", reply)
        result[criterion] = parsed
    return result


#Run MedGemma

# MedGemma did not follow the JSON output instructions, so we pipe its response to GPT-4o mini to produce clean, structured output

In [None]:
from pydantic import BaseModel, Field
from typing import Literal

# Pydantic schema for the cleaned-up Relevance verdict.
# NOTE(review): the docstring and Field descriptions are kept verbatim because they
# may surface in the parser's generated format instructions — confirm before rewording.
class JudgeOutput_R(BaseModel):
    """The final, clean, structured output from the evaluation process."""
    # Patient question that was judged.
    user_question: str = Field(description="The original question from the patient.")
    # System response that was judged.
    portal_response: str = Field(description="The full response provided by the patient-facing system.")
    # Relevance score, restricted to the integers 1 through 5.
    relevance_score: Literal[1, 2, 3, 4, 5] = Field(description="The final Relevance score (1-5) based on the criteria provided in the prompt.")

# Pydantic schema for the cleaned-up Faithfulness verdict.
# NOTE(review): the docstring and Field descriptions are kept verbatim because they
# may surface in the parser's generated format instructions — confirm before rewording.
class JudgeOutput_F(BaseModel):
    """The final, clean, structured output from the evaluation process."""
    # Patient question that was judged.
    user_question: str = Field(description="The original question from the patient.")
    # System response that was judged.
    portal_response: str = Field(description="The full response provided by the patient-facing system.")
    # Faithfulness score, restricted to the integers 1 through 5.
    faithfulness_score: Literal[1, 2, 3, 4, 5] = Field(description="The final Faithfulness score (1-5) based on the criteria provided in the prompt.")

# Pydantic schema for the cleaned-up Helpfulness verdict.
# NOTE(review): the docstring and Field descriptions are kept verbatim because they
# may surface in the parser's generated format instructions — confirm before rewording.
class JudgeOutput_H(BaseModel):
    """The final, clean, structured output from the evaluation process."""
    # Patient question that was judged.
    user_question: str = Field(description="The original question from the patient.")
    # System response that was judged.
    portal_response: str = Field(description="The full response provided by the patient-facing system.")
    # Helpfulness score, restricted to the integers 1 through 5.
    helpfulness_score: Literal[1, 2, 3, 4, 5] = Field(description="The final Helpfulness score (1-5) based on the criteria provided in the prompt.")




In [None]:
def get_outputJSON_R(rfh_responses, model, JudgeOutput_R):
  """Return a well-formatted JSON string (user_question, portal_response, relevance score)
  for the 'Relevance' entry of the unprocessed rfh_responses.

  Args:
      rfh_responses: Mapping with a 'Relevance' entry. If that entry is a dict
          with a 'raw' key, its text is reformatted; otherwise the entry itself
          (already-parsed judge output) is serialized and reformatted, instead
          of failing with a KeyError.
      model: Chat model used as the JSON formatter (e.g. gpt-4o-mini).
      JudgeOutput_R: Pydantic schema the output must validate against.

  Returns:
      str: The validated result dumped as indented JSON.
  """
  import json

  # Get the specific formatting instructions for GPT-4o mini
  parser_R = PydanticOutputParser(pydantic_object=JudgeOutput_R)
  format_instructions_R = parser_R.get_format_instructions()

  correction_prompt_R = ChatPromptTemplate.from_messages([
      ("system",
      "You are a strict JSON formatter. Your task is to extract the 'User Question', 'Portal Response', and the 'Relevance Score' from the RAW TEXT provided below, and reformat it STRICTLY according to the given JSON schema. Output your response as a JSON object."),
      ("human",
      "RAW TEXT (May contain errors, markdown, or extra text):\n---\n{raw_medgemma_output}\n---\n\n{format_instructions_R}")
  ])

  # Pre-fill the schema instructions so only the raw text is supplied at invoke time.
  correction_prompt_R_fixed = correction_prompt_R.partial(
      format_instructions_R=format_instructions_R
  )
  correction_chain_R = correction_prompt_R_fixed | model | parser_R

  judged_R = rfh_responses['Relevance']
  if isinstance(judged_R, dict) and 'raw' in judged_R:
      reply_R = judged_R['raw']
  else:
      # No 'raw' text is present when the reply was already parsed; feed the
      # parsed content to the formatter instead.
      reply_R = json.dumps(judged_R, ensure_ascii=False)

  validated_output_R = correction_chain_R.invoke({"raw_medgemma_output": reply_R})
  json_string_R = validated_output_R.model_dump_json(indent=4)

  return json_string_R


def get_outputJSON_F(rfh_responses, model, JudgeOutput_F):
  """Turn the raw 'Faithfulness' judge reply into a validated JSON string.

  The text stored at rfh_responses['Faithfulness']['raw'] is sent to `model`
  together with format instructions derived from the Pydantic schema
  `JudgeOutput_F`. The model's reply is parsed into that schema and then
  serialized.

  Args:
    rfh_responses: unprocessed judge outputs; only
      rfh_responses['Faithfulness']['raw'] is read here.
    model: chat model used as the JSON formatter (the notebook passes its
      gpt-4o-mini instance).
    JudgeOutput_F: Pydantic schema the reply must conform to.

  Returns:
    The validated schema instance dumped as a JSON string (indent=4).
  """
  faithfulness_parser = PydanticOutputParser(pydantic_object=JudgeOutput_F)

  system_text = "You are a strict JSON formatter. Your task is to extract the 'User Question', 'Portal Response', and the 'Faithfulness Score' from the RAW TEXT provided below, and reformat it STRICTLY according to the given JSON schema. Output your response as a JSON object."
  human_text = "RAW TEXT (May contain errors, markdown, or extra text):\n---\n{raw_medgemma_output}\n---\n\n{format_instructions_F}"

  # Pre-fill the schema instructions so that invoke() only needs the raw text.
  prompt = ChatPromptTemplate.from_messages(
      [("system", system_text), ("human", human_text)]
  ).partial(format_instructions_F=faithfulness_parser.get_format_instructions())

  # prompt -> formatter model -> schema validation
  chain = prompt | model | faithfulness_parser

  raw_reply = rfh_responses['Faithfulness']['raw']
  validated = chain.invoke({"raw_medgemma_output": raw_reply})
  return validated.model_dump_json(indent=4)


def get_outputJSON_H(rfh_responses, model, JudgeOutput_H):
  """Return the raw 'Helpfulness' judge reply as a validated JSON string.

  The text stored at rfh_responses['Helpfulness']['raw'] is sent to `model`
  together with format instructions derived from the Pydantic schema
  `JudgeOutput_H`. The model's reply is parsed into that schema and then
  serialized.

  Args:
    rfh_responses: unprocessed judge outputs; only
      rfh_responses['Helpfulness']['raw'] is read here.
    model: chat model used as the JSON formatter (the notebook passes its
      gpt-4o-mini instance).
    JudgeOutput_H: Pydantic schema the reply must conform to.

  Returns:
    The validated schema instance dumped as a JSON string (indent=4).
  """

  # Format instructions generated from the schema; appended to the human message.
  parser_H = PydanticOutputParser(pydantic_object=JudgeOutput_H)
  format_instructions_H = parser_H.get_format_instructions()

  correction_prompt_H = ChatPromptTemplate.from_messages([
      ("system",
      "You are a strict JSON formatter. Your task is to extract the 'User Question', 'Portal Response', and the 'Helpfulness Score' from the RAW TEXT provided below, and reformat it STRICTLY according to the given JSON schema. Output your response as a JSON object."),
      ("human",
      "RAW TEXT (May contain errors, markdown, or extra text):\n---\n{raw_medgemma_output}\n---\n\n{format_instructions_H}")
  ])

  # Bind the instructions on the Helpfulness prompt built just above. Using the
  # local template (rather than a same-looking notebook global) keeps this
  # function independent of which cells were run and in what order.
  correction_prompt_H_fixed = correction_prompt_H.partial(
      format_instructions_H=format_instructions_H
  )
  # prompt -> formatter model -> schema validation
  correction_chain_H = correction_prompt_H_fixed | model | parser_H

  reply_H = rfh_responses['Helpfulness']['raw']
  validated_output_H = correction_chain_H.invoke({"raw_medgemma_output": reply_H})
  json_string_H = validated_output_H.model_dump_json(indent=4)

  return json_string_H



In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser

# 1. Instantiate the OpenAI chat model used as the JSON formatter.
#    This object is what the pipeline passes as `model` to the
#    get_outputJSON_* helpers, and it is the middle stage of the module-level
#    correction chains. temperature=0 is requested for this formatting step.
gpt_parser_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [None]:
## Faithfulness JSON extraction with gpt-4o-mini
# Module-level version of the chain that get_outputJSON_F builds internally:
# prompt (with schema instructions pre-filled) -> gpt_parser_model -> parser_F.

# 2. Get the schema-derived formatting instructions to show gpt-4o-mini
parser_F = PydanticOutputParser(pydantic_object=JudgeOutput_F)
format_instructions_F = parser_F.get_format_instructions()

# Two template variables: {raw_medgemma_output} is supplied at invoke time,
# {format_instructions_F} is pre-filled by .partial() below.
correction_prompt_F = ChatPromptTemplate.from_messages([
    ("system",
     "You are a strict JSON formatter. Your task is to extract the 'User Question', 'Portal Response', and the 'Faithfulness Score' from the RAW TEXT provided below, and reformat it STRICTLY according to the given JSON schema. Output your response as a JSON object."),
    ("human",
     "RAW TEXT (May contain errors, markdown, or extra text):\n---\n{raw_medgemma_output}\n---\n\n{format_instructions_F}") # <-- Still requires this variable
])

# Pre-fill the instructions so callers only pass {"raw_medgemma_output": ...}.
correction_prompt_F_fixed = correction_prompt_F.partial(
    format_instructions_F=format_instructions_F
)
# The parser at the end validates the model reply against JudgeOutput_F.
correction_chain_F = correction_prompt_F_fixed | gpt_parser_model | parser_F


In [None]:
## Helpfulness JSON extraction with gpt-4o-mini
# Module-level chain: prompt (with schema instructions pre-filled)
# -> gpt_parser_model -> parser_H.

# 2. Get the schema-derived formatting instructions to show gpt-4o-mini
parser_H = PydanticOutputParser(pydantic_object=JudgeOutput_H)
format_instructions_H = parser_H.get_format_instructions()

# NOTE(review): this assignment REBINDS the global `correction_prompt_F`
# (the Faithfulness template, if the Faithfulness cell was run) to the
# Helpfulness template. The name looks like a copy-paste slip for
# `correction_prompt_H`. After this cell runs, any code that reads the global
# `correction_prompt_F` gets the Helpfulness template - check every reader of
# that global before renaming it here.
correction_prompt_F = ChatPromptTemplate.from_messages([
    ("system",
     "You are a strict JSON formatter. Your task is to extract the 'User Question', 'Portal Response', and the 'Helpfulness Score' from the RAW TEXT provided below, and reformat it STRICTLY according to the given JSON schema. Output your response as a JSON object."),
    ("human",
     "RAW TEXT (May contain errors, markdown, or extra text):\n---\n{raw_medgemma_output}\n---\n\n{format_instructions_H}") # <-- Still requires this variable
])

# Pre-fill the instructions so callers only pass {"raw_medgemma_output": ...}.
correction_prompt_H_fixed = correction_prompt_F.partial(
    format_instructions_H=format_instructions_H
)
# The parser at the end validates the model reply against JudgeOutput_H.
correction_chain_H = correction_prompt_H_fixed | gpt_parser_model | parser_H

In [None]:
####towards a pipeline

def _rfh_json_triplet(user_q, portal_response, df):
    """Judge one portal response and return its [Relevance, Faithfulness, Helpfulness] JSON strings."""
    raw_judgements = run_judge_prompts(user_q, portal_response=portal_response, df_as_text=df)
    # Evaluated in R, F, H order; each call makes one formatter-model request.
    return [
        get_outputJSON_R(rfh_responses=raw_judgements, model=gpt_parser_model, JudgeOutput_R=JudgeOutput_R),
        get_outputJSON_F(rfh_responses=raw_judgements, model=gpt_parser_model, JudgeOutput_F=JudgeOutput_F),
        get_outputJSON_H(rfh_responses=raw_judgements, model=gpt_parser_model, JudgeOutput_H=JudgeOutput_H),
    ]


def get_RFH_evaluation(patient_id, user_q):
    """Run the full RFH evaluation for one (patient id, question) pair.

    Builds the payload with create_payload() and then judges BOTH the
    non-optimized response and the empathy-optimized response for Relevance,
    Faithfulness and Helpfulness.

    Args:
        patient_id: patient identifier forwarded to create_payload().
        user_q: the patient's question.

    Returns:
        dict with keys:
            "patient_id", "user_q": the inputs, echoed back.
            "empathy_opt_score": payload['opt_score'].
            "non_opt_RFH": [R, F, H] JSON strings for payload['non_opt_resp'].
            "opt_RFH": [R, F, H] JSON strings for payload['opt_response'].
    """
    # payload.keys()
    # dict_keys(['sql', 'df', 'non_opt_resp', 'opt_score', 'opt_response'])
    payload = create_payload(patient_id, user_q, model=LLM_MODEL)
    df = payload['df']

    # Read both responses up front so a malformed payload fails before the
    # judge and formatter models are run.
    non_opt_response = payload['non_opt_resp']
    opt_response = payload['opt_response']

    non_opt_jsons = _rfh_json_triplet(user_q, non_opt_response, df)
    opt_jsons = _rfh_json_triplet(user_q, opt_response, df)

    return {
        "patient_id": patient_id,
        "user_q": user_q,
        "empathy_opt_score": payload['opt_score'],
        "non_opt_RFH": non_opt_jsons,
        "opt_RFH": opt_jsons
    }


In [None]:
def extract_my_dict(dict_):
  """Flatten one get_RFH_evaluation() result into a single-level record.

  Args:
    dict_: mapping with 'patient_id', 'user_q', 'empathy_opt_score' and the
      two lists 'non_opt_RFH' / 'opt_RFH', each holding three JSON strings
      in Relevance, Faithfulness, Helpfulness order.

  Returns:
    dict with the ids, the empathy score, the six R/F/H scores
    ('nonopt_*_score', 'opt_*_score') and the two response texts
    ('nonopt_resp', 'opt_resp'), taken from the Relevance record of each list.
  """
  flat = {
    'patient_id': dict_['patient_id'],
    'user_question': dict_['user_q'],
    'empathy_opt_score': dict_['empathy_opt_score'],
  }

  # (score-key letter, field name inside the JSON record), in list order.
  score_fields = (
    ('R', 'relevance_score'),
    ('F', 'faithfulness_score'),
    ('H', 'helpfulness_score'),
  )

  relevance_records = {}
  for prefix, source_key in (('nonopt', 'non_opt_RFH'), ('opt', 'opt_RFH')):
    for position, (letter, field) in enumerate(score_fields):
      record = json.loads(dict_[source_key][position])
      if position == 0:
        # The response text is read from the Relevance record further down.
        relevance_records[prefix] = record
      flat[f'{prefix}_{letter}_score'] = record[field]

  for prefix in ('nonopt', 'opt'):
    flat[f'{prefix}_resp'] = relevance_records[prefix]['portal_response']

  return flat

# This is the master file that

takes in a patient id and a user question,
and returns the following:

*   creates a payload containing the generated SQL, the returned df, the non-optimized response, the empathy score, and the empathy-optimized response
    payload = create_payload(patient_id,user_q, model=LLM_MODEL)
    payload.keys()
    dict_keys(['sql', 'df', 'non_opt_resp', 'opt_score', 'opt_response'])
    
*   then calculates RFH (Relevance, Faithfulness, Helpfulness) scores for BOTH the non-optimized and the optimized responses using MedGemma, and uses gpt-4o-mini to format the outputs as JSON

**Pipeline functions**

**def get_RFH_evaluation(patient_id, user_q):**


    RFH4_nonoptresponses = run_judge_prompts(user_question, portal_response=payload['non_opt_resp'], df_as_text=df)
    json_R_nonoptresp = get_outputJSON_R(rfh_responses=RFH4_nonoptresponses, model=gpt_parser_model, JudgeOutput_R=JudgeOutput_R)
    json_F_nonoptresp = get_outputJSON_F(rfh_responses=RFH4_nonoptresponses, model=gpt_parser_model, JudgeOutput_F=JudgeOutput_F)
    json_H_nonoptresp = get_outputJSON_H(rfh_responses=RFH4_nonoptresponses, model=gpt_parser_model, JudgeOutput_H=JudgeOutput_H)

    RFH4_optresponses = run_judge_prompts(user_question, portal_response=payload['opt_response'], df_as_text=df)
    json_R_optresp = get_outputJSON_R(rfh_responses=RFH4_optresponses, model=gpt_parser_model, JudgeOutput_R=JudgeOutput_R)
    json_F_optresp = get_outputJSON_F(rfh_responses=RFH4_optresponses, model=gpt_parser_model, JudgeOutput_F=JudgeOutput_F)
    json_H_optresp = get_outputJSON_H(rfh_responses=RFH4_optresponses, model=gpt_parser_model, JudgeOutput_H=JudgeOutput_H)

    return {
        "patient_id": patient_id,
        "user_q": user_q,
        "empathy_opt_score": payload['opt_score'],
        "non_opt_RFH": [json_R_nonoptresp, json_F_nonoptresp, json_H_nonoptresp],
        "opt_RFH": [json_R_optresp, json_F_optresp, json_H_optresp]
    }



**then use extract_my_dict(dict_) to get:**

    {
      'patient_id': '0fca905f-391c-08d3-4b93-b53f69b9da53',
      'user_question': 'which medication am I taking?',
      'empathy_opt_score': 4,
      
      'nonopt_R_score': 4,
      'nonopt_F_score': 5,
      'nonopt_H_score': 2,
      
      'opt_R_score': 5,
      'opt_F_score': 5,
      'opt_H_score': 5,
      
      'nonopt_resp':
          'The table shows medication records for patient ID 0fca905f-391c-08d3-4b93-b53f69b9da53, listing five entries. All medications are oral tablets, with dosages provided for Acetaminophen and amLODIPine, while the dosage for Naproxen sodium is missing. Notably, there are no start or end dates, and the refills field is also empty for all entries. The table appears truncated, showing only 5 rows.',
      
      'opt_resp':
          "I understand that it can be a bit confusing to keep track of your medications, and I'm here to help you with that. Based on your records, you are currently taking the following medications:\n\n1. **Acetaminophen 325 MG Oral Tablet (Tylenol)**\n2. **Acetaminophen 325 MG / OxyCODONE Hydrochloride 5 MG Oral Tablet**\n3. **Amlodipine 2.5 MG Oral Tablet**\n4. **Naproxen Sodium 220 MG Oral Tablet**\n\nIf you have any questions about these medications, such as their purposes or how to take them, please feel free to ask. It's important to feel comfortable and informed about your treatment."

    }




# Bulk Pipeline to LJ evaluate 20 PatientID-Q pairs

Goal:
Take 20 PatientID-Q pairs.

Get the optimized and non-optimized responses.
Catalog the patient id, user question, empathy score, RFH scores of the non-optimized response, RFH scores of the optimized response, the non-optimized response, and the optimized response as JSON records.


In [None]:
## First, test the pipeline end-to-end on a single patient/question pair

patient_id = '0fca905f-391c-08d3-4b93-b53f69b9da53'
user_q = 'which medication am I taking?'
# Full evaluation: builds the payload and judges both responses (R, F, H).
dict_ = get_RFH_evaluation(patient_id, user_q)
# Flatten the nested result into one record of scores and response texts.
all_results = extract_my_dict(dict_)
print(all_results)

--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 5/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


{'patient_id': '0fca905f-391c-08d3-4b93-b53f69b9da53', 'user_question': 'which medication am I taking?', 'empathy_opt_score': 5, 'nonopt_R_score': 3, 'nonopt_F_score': 5, 'nonopt_H_score': 3, 'opt_R_score': 5, 'opt_F_score': 5, 'opt_H_score': 5, 'nonopt_resp': 'The table shows medication records for patient ID 0fca905f-391c-08d3-4b93-b53f69b9da53, listing five entries. All medications are oral tablets, with dosages provided for three of them: Acetaminophen 325 MG, Acetaminophen with oxyCODONE, and amLODIPine, each at a dose of 1.0. Notably, the entries lack start and end dates, as well as refill information, indicating incomplete data. The table appears truncated, showing only 5 rows.', 'opt_resp': "I understand that it can be confusing to keep track of your medications, and I'm here to help you with that. Based on your records, you are currently taking the following medications:\n\n1. **Acetaminophen 325 MG Oral Tablet (Tylenol)**\n2. **Acetaminophen 325 MG / OxyCODONE Hydrochloride 5

In [None]:
# Re-display the flattened record currently bound to `all_results`.
print(all_results)

{'patient_id': '0fca905f-391c-08d3-4b93-b53f69b9da53', 'user_question': 'which medication am I taking?', 'empathy_opt_score': 5, 'nonopt_R_score': 3, 'nonopt_F_score': 5, 'nonopt_H_score': 3, 'opt_R_score': 5, 'opt_F_score': 5, 'opt_H_score': 5, 'nonopt_resp': 'The table shows medication records for patient ID 0fca905f-391c-08d3-4b93-b53f69b9da53, listing five entries. All medications are oral tablets, with dosages provided for three of them: Acetaminophen 325 MG, Acetaminophen with oxyCODONE, and amLODIPine, each at a dose of 1.0. Notably, the entries lack start and end dates, as well as refill information, indicating incomplete data. The table appears truncated, showing only 5 rows.', 'opt_resp': "I understand that it can be confusing to keep track of your medications, and I'm here to help you with that. Based on your records, you are currently taking the following medications:\n\n1. **Acetaminophen 325 MG Oral Tablet (Tylenol)**\n2. **Acetaminophen 325 MG / OxyCODONE Hydrochloride 5

In [None]:
##################
##################
# UNIT TEST PIPELINE

# Pool of patient questions. The bulk pipeline cell draws one per iteration
# with random.choice, so a question can be picked more than once.
# NOTE(review): some entries are informally or unusually phrased - presumably
# to probe robustness of the SQL generation; confirm intent.
patient_queries = [
    "What are my vital signs",
    "What is my earliest vital sign",
    "What are my vitals",
    "Give me my blood pressure readings over time",
    "Show my vital signs history",
    "What has been my weight trend",
    "What has been my maximum weight",
    "What medications am I currently taking?",
    "What conditions have I been diagnosed with?",
    "List all the medical problems on my record.",
    "How screwed up am I?",
    "What are my newest test results?",
    "What tidings bring’st thou of my latest test results",
    "List my active prescriptions",
    "What has been my highest weight",
    "Show my BMI trend",
    "Show my BMI",
    "What have been my weights in kgs over the years",
    "List all my vital signs",
    "Summarize my most recent visit",
    "Which of my visits have lab results"
]

# Pool of patient ids. The bulk pipeline cell draws one per iteration with
# random.choice, so an id can be picked more than once.
# NOTE(review): presumably patient UUIDs from the EHR database - confirm source.
patient_ids = ['8c8e1c9a-b310-43c6-33a7-ad11bad21c40',
 '782001bc-f712-50ae-04f5-9a488f3ef4aa',
 '80e7f50a-3e99-d5ac-cf97-f8a4b4f9e6c7',
 'edc17058-55fb-08c7-12df-ece93a402e50',
 '9f9dbdcb-23a1-82cc-b7bc-e0e420a95bd1',
 'be874504-c868-ebfd-9a77-df6b1e5ff6cc',
 '30e48e16-2df7-207e-7a3d-1650ef0c1ed8',
 '57b21dea-ff00-6c3e-92d9-91c7627f53b2',
 'a3d34c1f-5421-e078-38ec-1498a5941dbe',
 'e83fe1b3-f94f-5591-f851-1da300e24e99',
 'e6705c33-7349-8b12-484d-3b1f93227178',
 '2da86d63-34ae-b887-ddff-8f6f1e6990f1',
 '04181caa-fcc1-c6c8-743e-a903eff368de',
 '20802592-1c31-7339-4c4c-2fe648e1a716',
 '406e8bad-81b5-7624-5b8a-4aeeb74028f5',
 'a331b5bc-cbea-a205-a8bf-dbf3255ef36a',
 '641efcda-7397-4172-c6ac-8231342fa53e',
 'e64918a6-528c-b49e-dff2-3cbe33266342',
 '9c6ef4a8-79e8-92c4-2279-a0666694419b',
 '7757f538-bffe-a8bf-0efb-8363354aab87',
 '5a0fd7a2-6bfd-af1e-7bb6-2060136302c3',
 'eeae0d25-5865-76b4-8ad7-9526bcf3a94d',
 '5dbd017f-d447-9546-8610-8f7bdaa77789',
 '6754b3bf-f5ac-f359-fef6-87cf4b8508ab',
 '0098f2a9-2f4d-4209-778d-cb3426d85987',
 '3270397c-dfa3-6cea-f2ec-be21ade6c52c',
 '3477fa4e-a09a-e779-5d56-eeb00dee758b',
 '74a4cdcf-7cc0-7658-e1e0-cd1182d5f205',
 '0c76b28e-5685-0754-12d1-b1a6b79866f7',
 'f4e9b2c8-9db5-5597-a6a7-1215a638c1e2',
 '8f87d617-a91b-29e0-e155-96a5d71de419',
 '97a046ab-d147-2707-d4cd-cba26c5360ad',
 '8dacd3c2-9e71-7d5d-02aa-7ad9541a0ab9',
 '15c6645a-8f7b-df42-95ec-8b49bda12c10',
 'c8114bff-6bab-8353-597d-4f155f5f1c3e',
 '1c1ab155-7314-095d-1641-06efd2cd0873',
 'f380d818-b685-618e-22dc-b2db2fe0a6c0',
 'c3dae8db-25ee-c40b-c605-600fad411d34',
 '0db7560a-db72-0cef-c59c-1fd6762bc50d',
 '349720c1-0627-e77a-1619-bb11b1530e96',
 '37895f0e-877f-7ea7-aa1b-0b69fcd11385',
 'ef2eaed0-b056-2a9f-7ccb-07a9c9fdabd5',
 '119e46a1-9323-916b-4152-e0daedb48f23',
 'faec5a04-6c56-4296-9fec-4e218e627a32',
 '7cad1f7c-cf61-fd24-254f-d02265160c0a',
 '3f7873ab-0f61-be0c-9af8-f246eec6223a',
 '3ac8c3a5-3c16-699f-537b-e7816347104b',
 '103b63c9-9ef8-6d25-771e-2fba661489a1',
 '13ac6eee-8cf1-e597-1c91-453c8f069a3c',
 '7785daad-accb-cb33-7d8f-2faebf8eb639',
 'fc3e2c0f-6809-7e7b-4ad8-769a732bf13a',
 '750eda4e-3f12-c701-869e-1d392387dfa0',
 '782ada1b-32a4-888a-8812-d8de70d6e5d0',
 '2e2eb927-efd5-bbd4-297d-99071243a8cb',
 'add41d13-8e70-e327-4367-8d945e20f27b',
 'd423f0d1-e7ed-d47e-af4f-20cfd996ac67',
 '033cccaf-bc92-3ddd-b64c-9ea45268a971',
 'add095a2-64e5-aae2-11d2-9be2f89ff843',
 '006c29d1-d868-3a9e-ceab-31f23e398f45',
 '1f0ca842-8c2d-a943-c047-dafce690f5a2',
 '7ddb0322-da41-c9d3-2018-4581109426b2',
 '4faedf9f-2c0e-9800-943a-0930bd08c4c8',
 '80cca49f-29f9-d04f-851d-84b95f863793',
 '583c0740-39e1-9e33-f9d6-4fdb2b815669',
 'b9bacf2f-7027-2e05-fa5b-19167071fdde',
 '3c763653-7fd3-af8a-e65f-79d5bde98d3a',
 'ae05f1fa-7913-f7bc-41bd-2dc8827555e7',
 'e1c6b5c4-34b7-7296-56ed-4c634e93deb9',
 '9cbd97ef-2209-9b1c-b6f7-a23a6c081740',
 '0e5401fd-b241-3c84-066e-2b88e5ddafc7',
 '575cac8f-bed1-32da-30a6-3a516a78500d',
 '3de203ff-a5b9-e99a-c705-3927503e2abf',
 'fd7387f3-3465-7a34-6778-25aac38a13c2',
 '02cb6ae2-d3fd-e497-7077-77cdbeb5f0a1',
 '1e9713a5-742f-aca0-cf95-446338fdc57f',
 '55bc1034-ecf9-d005-5b9e-eac706fe541f',
 'b5193ef4-ab73-ddd3-e7dd-d8168b33e7f6',
 'c420eb5d-97eb-59b6-b247-0ba188408db5',
 '1754bc7d-28cd-4933-fc72-3d9a0d77cf54',
 'b61886a1-b76f-4ecf-b37a-29d0c6aefc26',
 '0851b7fb-87a8-3edc-1e11-8dcb03824dde',
 '25f30c19-e98a-85ea-6de8-f976388d4678',
 '6da68959-d157-b9a3-48bc-1454e5517d6a',
 '0a1bd9a2-fc21-7ad3-3d85-cf31b68eec28',
 'bd6e7acc-7c87-7f0a-5d15-959cf11e22da',
 '31014896-9c27-ae1a-71db-319df60ac5d3',
 '49424eb4-e2ba-40b5-0e2b-2c2d742cce4b',
 '48f06a5e-0d20-3fe6-f5ea-b45bc79e90db',
 '787f9e8e-d3a4-0407-55d1-01a3414fceaf',
 '5c68f376-dd2c-1133-a9cf-f023a5d99078',
 'e5ed5bc3-51e1-a9a7-01fb-f66b8ac4045d',
 'cae42a0d-c36c-8af1-8277-7c9abd011778',
 '155b0e07-d5a9-cc0c-e01a-a982c5d9a8d6',
 '04300771-e00c-e414-830a-66f7ef3584da',
 '45e1243b-470c-efa8-8ce9-f0d50485a846',
 '9f867ec4-9f3a-35af-4bb6-e2c18a603c72',
 'b8ded152-e326-5833-f747-bf9b35c60a76',
 '70775c58-59fb-a3db-9858-1d427567c195',
 'b427e4ea-3a48-207a-bf7d-710f0b574091',
 '0fca905f-391c-08d3-4b93-b53f69b9da53',
 '5fda1015-d0a5-e32d-d0b8-4662e6ce6c2b',
 '43e4a5fe-add4-5581-d0ef-80764c313418',
 '28f107b5-e973-ece3-b762-c2dbd9a01ba8',
 '6f808eef-a811-11eb-3fcb-1ed910d79c4b',
 '77dfae18-8c8c-0ec2-050c-dd93f3ea1cc2',
 '5f9bfe93-062d-ca4b-5389-f8cac604a7e3',
 '2bc26ad6-ad32-0bb0-f964-0fe271fdf054',
 'f49221bb-20fb-45cb-9345-09b6a83ae9de',
 '9ad4a69b-02de-4aeb-2262-76745583a8ac',
 '72b7a6b1-b196-7ba5-eb82-1e9b0f75b7bd',
 '1a9873c2-1d93-e9d6-4e36-77fdb07fbcb2']

In [None]:
## Bulk pipeline execution
# Samples `volume` (patient id, question) pairs, runs the full RFH evaluation
# on each, and collects the flattened records in `results`.

import random
import time

#random.seed(42)
# Fixed seed so the same sequence of pairs is drawn on every run.
random.seed(1257)
# Number of pairs to ATTEMPT; failed pairs are not retried or replaced, so
# `results` can end up shorter than this.
# NOTE(review): the section heading mentions 20 pairs, this run uses 15.
volume = 15
results = []
start_time = time.time()
for _ in range(volume):
  # Id and question are drawn independently, so repeats are possible.
  patient_id = random.choice(patient_ids)
  user_q = random.choice(patient_queries)
  print(f"Processing {patient_id=} \n { user_q = }")
  t1 = time.time()
  try:
       dict_ = get_RFH_evaluation(patient_id, user_q)
       all_results = extract_my_dict(dict_)
       results.append(all_results)
       t2 = time.time()
       duration = t2-t1
       print(f"Processed {patient_id=} \n { user_q = } in { round(duration, 2)} seconds. ")
  except Exception as e:
    # Best effort: report the failure and move on to the next pair.
    # `continue` also skips the running-total printout below for this pair.
    print(f"an error occured for {patient_id} {user_q=} \n { str(e)}")
    continue

  # Elapsed time since the loop started (includes time spent on failed pairs).
  end_time_result_processed = time.time()
  processing_time = end_time_result_processed - start_time

  print("-"*100)
  print(f" processed { len(results)} records in { round(processing_time,2)} seconds")
  print("-"*100)


Processing patient_id='3c763653-7fd3-af8a-e65f-79d5bde98d3a' 
  user_q = 'What tidings bring’st thou of my latest test results'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='3c763653-7fd3-af8a-e65f-79d5bde98d3a' 
  user_q = 'What tidings bring’st thou of my latest test results' in 192.72 seconds. 
----------------------------------------------------------------------------------------------------
 processed 1 records in 192.72 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='349720c1-0627-e77a-1619-bb11b1530e96' 
  user_q = 'Show my BMI trend'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='349720c1-0627-e77a-1619-bb11b1530e96' 
  user_q = 'Show my BMI trend' in 172.56 seconds. 
----------------------------------------------------------------------------------------------------
 processed 2 records in 365.28 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='0098f2a9-2f4d-4209-778d-cb3426d85987' 
  user_q = 'Which of my visits have lab results'
an error occured for 0098f2a9-2f4d-4209-778d-cb3426d85987 user_q='Which of my visits have lab results' 
 Blocked: SQL must be a single SELECT without DDL/DML.
Processing patient_id='0851b7fb-87a8-3edc-1e11-8dcb03824dde' 
  user_q = 'What medications am I currently taking?'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 5/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='0851b7fb-87a8-3edc-1e11-8dcb03824dde' 
  user_q = 'What medications am I currently taking?' in 106.81 seconds. 
----------------------------------------------------------------------------------------------------
 processed 3 records in 478.55 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='782ada1b-32a4-888a-8812-d8de70d6e5d0' 
  user_q = 'What has been my highest weight'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 5/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='782ada1b-32a4-888a-8812-d8de70d6e5d0' 
  user_q = 'What has been my highest weight' in 98.04 seconds. 
----------------------------------------------------------------------------------------------------
 processed 4 records in 576.59 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='103b63c9-9ef8-6d25-771e-2fba661489a1' 
  user_q = 'Give me my blood pressure readings over time'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='103b63c9-9ef8-6d25-771e-2fba661489a1' 
  user_q = 'Give me my blood pressure readings over time' in 87.51 seconds. 
----------------------------------------------------------------------------------------------------
 processed 5 records in 664.1 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='77dfae18-8c8c-0ec2-050c-dd93f3ea1cc2' 
  user_q = 'What tidings bring’st thou of my latest test results'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='77dfae18-8c8c-0ec2-050c-dd93f3ea1cc2' 
  user_q = 'What tidings bring’st thou of my latest test results' in 174.81 seconds. 
----------------------------------------------------------------------------------------------------
 processed 6 records in 838.91 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='7cad1f7c-cf61-fd24-254f-d02265160c0a' 
  user_q = 'List my active prescriptions'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='7cad1f7c-cf61-fd24-254f-d02265160c0a' 
  user_q = 'List my active prescriptions' in 133.49 seconds. 
----------------------------------------------------------------------------------------------------
 processed 7 records in 972.4 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='b427e4ea-3a48-207a-bf7d-710f0b574091' 
  user_q = 'What has been my highest weight'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='b427e4ea-3a48-207a-bf7d-710f0b574091' 
  user_q = 'What has been my highest weight' in 67.77 seconds. 
----------------------------------------------------------------------------------------------------
 processed 8 records in 1040.17 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='0851b7fb-87a8-3edc-1e11-8dcb03824dde' 
  user_q = 'Show my BMI trend'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='0851b7fb-87a8-3edc-1e11-8dcb03824dde' 
  user_q = 'Show my BMI trend' in 117.38 seconds. 
----------------------------------------------------------------------------------------------------
 processed 9 records in 1157.55 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='37895f0e-877f-7ea7-aa1b-0b69fcd11385' 
  user_q = 'What are my vital signs'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='37895f0e-877f-7ea7-aa1b-0b69fcd11385' 
  user_q = 'What are my vital signs' in 82.03 seconds. 
----------------------------------------------------------------------------------------------------
 processed 10 records in 1239.58 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='43e4a5fe-add4-5581-d0ef-80764c313418' 
  user_q = 'What is my earliest vital sign'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='43e4a5fe-add4-5581-d0ef-80764c313418' 
  user_q = 'What is my earliest vital sign' in 150.73 seconds. 
----------------------------------------------------------------------------------------------------
 processed 11 records in 1390.31 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='c3dae8db-25ee-c40b-c605-600fad411d34' 
  user_q = 'What has been my weight trend'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='c3dae8db-25ee-c40b-c605-600fad411d34' 
  user_q = 'What has been my weight trend' in 92.42 seconds. 
----------------------------------------------------------------------------------------------------
 processed 12 records in 1482.74 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='5c68f376-dd2c-1133-a9cf-f023a5d99078' 
  user_q = 'What has been my weight trend'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='5c68f376-dd2c-1133-a9cf-f023a5d99078' 
  user_q = 'What has been my weight trend' in 108.35 seconds. 
----------------------------------------------------------------------------------------------------
 processed 13 records in 1591.09 seconds
----------------------------------------------------------------------------------------------------
Processing patient_id='2da86d63-34ae-b887-ddff-8f6f1e6990f1' 
  user_q = 'List all the medical problems on my record.'
--- STARTING OPTIMIZATION (Max Retries: 3) ---

--- GENERATING RESPONSE (Retry: 0) ---

--- EVALUATING RESPONSE ---


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


-> Score: 4/5

*** ROUTE: END (Score > 3) ***


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


Processed patient_id='2da86d63-34ae-b887-ddff-8f6f1e6990f1' 
  user_q = 'List all the medical problems on my record.' in 136.29 seconds. 
----------------------------------------------------------------------------------------------------
 processed 14 records in 1727.39 seconds
----------------------------------------------------------------------------------------------------


In [None]:
# Persist the collected bulk-pipeline records (`results`) as a JSON file.
import os

DEV_PATH = "/content/drive/MyDrive/210_Capstone/210_Factory/210_dev"
pipeline_results_file = DEV_PATH + "/unit_test_pipeline_results_2.json"

# Make sure the target directory exists. Best effort: a failure is reported
# and the write below is still attempted (it then reports its own error).
try:
    os.makedirs(DEV_PATH, exist_ok=True)
except Exception as e:
    # This catch is for potential issues with mounting or permissions
    print(f"Error creating directory structure: {str(e)}")
else:
    print(f"Directory confirmed/created: {DEV_PATH}")

# Write the records. The success message is printed from the `else` branch,
# i.e. only after the `with` block has closed (and flushed) the file, so it is
# never shown for a write that fails on close.
try:
    with open(pipeline_results_file, 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=4)
except Exception as e:
    print(f"An error occured while writing the file: \n { str(e)}")
else:
    print(f" Successfully wrote results to: {pipeline_results_file}")

Directory confirmed/created: /content/drive/MyDrive/210_Capstone/210_Factory/210_dev
 Successfully wrote results to: /content/drive/MyDrive/210_Capstone/210_Factory/210_dev/unit_test_pipeline_results_2.json
