# GPT Evaluation

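This notebook evaluates a discharge-letter prompt using GPT: it generates a discharge letter for one patient, then asks GPT to compare that letter with the original discharge letter from the EPD. The comparison is repeated several times and the resulting scores (semantic similarity, completeness and overlap) are averaged.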

In [None]:
import json
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import AzureOpenAI

from discharge_docs.processing.processing import (
    get_patient_discharge_docs,
    get_patient_file,
)
from discharge_docs.prompts.prompt import (
    load_evaluatie_prompt,
    load_prompts,
    load_template_prompt,
)
from discharge_docs.prompts.prompt_builder import PromptBuilder

# Working directory of the kernel. The data-loading cell builds its paths from the
# parent of this directory (<parent>/data/processed/...), so the notebook must be
# started from a folder that sits next to `data/`.
current_dir = os.getcwd()


# Enables automatic reloading of (locally installed) packages
%load_ext autoreload
%autoreload 2

In [None]:
# initialise Azure

# Deployment that is passed as the `model`/deployment for every completion request
deployment_name = "aiva-gpt"
# Sampling temperature handed to the PromptBuilder when generating the letter
TEMPERATURE = 0.2

# Make the variables from a local .env file (if any) available through os.getenv
load_dotenv()

client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-02-01",
)

In [None]:
# load data
# All input files live in <parent of working dir>/data/processed
processed_dir = Path(current_dir).parent / "data" / "processed"

# Metavision export: one frame per selected encounter (NICU and IC patients)
df_metavision = pd.read_parquet(processed_dir / "metavision_new_data.parquet")

patient_1_NICU, patient_2_NICU, patient_3_NICU = (
    df_metavision[df_metavision["enc_id"] == enc_id] for enc_id in (107, 20, 150)
)
patient_1_IC, patient_2_IC, patient_3_IC = (
    df_metavision[df_metavision["enc_id"] == enc_id] for enc_id in (48, 55, 63)
)

# HiX export: cardiology (CAR) and psychiatry (PSY) encounters
df_HIX = pd.read_parquet(processed_dir / "HiX_data.parquet")

patient_1_CAR, patient_2_CAR, patient_3_CAR = (
    df_HIX[df_HIX["enc_id"] == enc_id] for enc_id in (1012, 1010, 1062)
)
patient_1_PSY = df_HIX[df_HIX["enc_id"] == 1142]

# Demo patient comes from a semicolon-separated CSV; its date column is parsed here
patient_1_demo = pd.read_csv(processed_dir / "DEMO_patient_1.csv", sep=";")
patient_1_demo["date"] = pd.to_datetime(patient_1_demo["date"])

# Lookup of patient label -> patient frame, used by the cells below
data_dict = {
    # Metavision
    "patient_1_nicu": patient_1_NICU,
    "patient_2_nicu": patient_2_NICU,
    "patient_3_nicu": patient_3_NICU,
    "patient_1_ic": patient_1_IC,
    "patient_2_ic": patient_2_IC,
    "patient_3_ic": patient_3_IC,
    # HiX
    "patient_1_car": patient_1_CAR,
    "patient_2_car": patient_2_CAR,
    "patient_3_car": patient_3_CAR,
    "patient_1_psy": patient_1_PSY,
    # CSV demo patient
    "patient_1_demo": patient_1_demo,
}

# load prompts
user_prompt, system_prompt = load_prompts()
(
    template_prompt_NICU,
    template_prompt_IC,
    template_prompt_CAR,
    template_prompt_PSY,
) = (load_template_prompt(department) for department in ("NICU", "IC", "CAR", "PSY"))

# Lookup of department label -> template prompt
template_prompt_dict = {
    "nicu": template_prompt_NICU,
    "ic": template_prompt_IC,
    "car": template_prompt_CAR,
    "psy": template_prompt_PSY,
    # the demo patient reuses the NICU template
    "demo": template_prompt_NICU,
}

In [None]:
# Pick the patient and the template of the matching department
data = data_dict["patient_1_ic"]
template_prompt = template_prompt_dict["ic"]

# Only the string form of the patient file is needed; the second return value is dropped
patient_file_string, _ = get_patient_file(df=data)

prompt_builder = PromptBuilder(
    client=client, deployment_name=deployment_name, temperature=TEMPERATURE
)
generated_doc = prompt_builder.generate_discharge_doc(
    patient_file=patient_file_string,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    template_prompt=template_prompt,
)

# Flatten the generated sections into one text: "<Categorie>: <Beloop tijdens opname>",
# with a blank line between sections
GPT_letter = "\n\n".join(
    f"{section['Categorie']}: {section['Beloop tijdens opname']}"
    for section in generated_doc
)
print(GPT_letter)

In [None]:
# OG letter: presumably the original discharge letter(s) stored for the same patient
# as `data`. Only the first entry returned by get_patient_discharge_docs is kept; it is
# later passed to the evaluation as the EPD text ("Samenvatting A").
OG_letter = get_patient_discharge_docs(df=data).values[0]
print(OG_letter)

In [None]:
def compare_GPT_output_with_EPD_output(
    GPT_output, EPD_output, evaluatie_prompt, engine, temperature
):
    """Let the model compare the EPD letter with the GPT letter and parse its verdict.

    Three user messages are sent through the module-level ``client``: the evaluation
    instructions, the EPD text labelled "Samenvatting A" and the GPT text labelled
    "Samenvatting B". No system message is sent.

    Args:
        GPT_output: Text of the generated letter (becomes "Samenvatting B").
        EPD_output: Text of the original letter (becomes "Samenvatting A").
        evaluatie_prompt: Instructions for the comparison, sent as the first message.
        engine: Value passed as ``model`` to the chat completion call.
        temperature: Sampling temperature passed to the chat completion call.

    Returns:
        The model's reply parsed as JSON, after removing any ``` / ```json fences.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON once the fences are removed.
    """
    message_texts = (
        evaluatie_prompt,
        "Samenvatting A: " + EPD_output,  # samenvatting A
        "Samenvatting B: " + GPT_output,  # samenvatting B
    )
    messages = [{"role": "user", "content": text} for text in message_texts]

    response = client.chat.completions.create(
        model=engine,
        messages=messages,
        temperature=temperature,
    )

    raw_content = response.model_dump()["choices"][0]["message"]["content"]
    # The model may wrap its JSON in a markdown code fence; strip the fence markers
    unfenced_content = re.sub("```(json)?", "", raw_content)
    return json.loads(unfenced_content)

In [None]:
# evaluate the performance:

# Load the evaluation instructions (sent as the first user message by
# compare_GPT_output_with_EPD_output) and print them for inspection.
evaluatie_prompt = load_evaluatie_prompt()
print(evaluatie_prompt)

In [None]:
# Label used in our report -> key under which the model returns that score.
# The model only knows the letters as "A" (EPD letter) and "B" (GPT letter), so the
# two completeness scores are renamed here.
metric_reply_keys = {
    "Semantische Similariteit Score": "Semantische Similariteit Score",
    "Volledigheid Percentage van EPD brief": "Volledigheid Percentage van A",
    "Volledigheid Percentage van GPT brief": "Volledigheid Percentage van B",
    "Overlap Percentage": "Overlap Percentage",
}

# One list of scores per report label, filled with one value per run
eval_output = {label: [] for label in metric_reply_keys}

# Repeat the same comparison n_runs times so the scores can be averaged
n_runs = 20
for _i in range(n_runs):
    # Named `eval_result` rather than `eval` so the built-in eval() is not shadowed
    # for the rest of the kernel session.
    eval_result = compare_GPT_output_with_EPD_output(
        GPT_letter, OG_letter, evaluatie_prompt, engine=deployment_name, temperature=0
    )
    for label, reply_key in metric_reply_keys.items():
        eval_output[label].append(eval_result[reply_key])

print(eval_output)
# take averages over eval_output
print("Average scores:")

average_eval_output = {
    label: np.mean(scores) for label, scores in eval_output.items()
}
print(average_eval_output)

In [None]:
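# # get data
# NOTE: the commented-out code below is exploration that is not executed. It counted
# how many patient files stay under 16000 tokens (cl100k_base encoding). Re-enabling
# it requires `tiktoken` and `load_and_process_data_metavision`, neither of which is
# imported in this notebook.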

# data = load_and_process_data_metavision()
# enc_ids_outside_limit = []
# size_outside_limit = []
# enc_ids_within_limit = []
# size_within_limit = []
# for enc_id in data.enc_id.unique():
#     # enc_id = 6
#     patient_data_string, patient_data_df = get_patient_file(enc_id, data)
#     # print(patient_data)
#     # check the length of the patient data
#     encoding = tiktoken.get_encoding("cl100k_base")
#     # print(f"The number of tokens in the data: {len(encoding.encode(patient_data_string))}")
#     # print(f"The length of stay is: {np.mean(patient_data_df['length_of_stay'])}")
#     if len(encoding.encode(patient_data_string))<16000:
#         enc_ids_within_limit.append(enc_id)
#         size_within_limit.append(len(encoding.encode(patient_data_string)))
#     else:
#         enc_ids_outside_limit.append(enc_id)
#         size_outside_limit.append(len(encoding.encode(patient_data_string)))
# print(f"The number of patients within the limit is: {len(enc_ids_within_limit)}") # 176
# print(enc_ids_within_limit)
"""[20, 38, 39, 42, 48, 46, 55, 63, 67, 69, 68, 71, 75, 83, 91, 92, 93, 95, 99, 110, 
107, 111, 115, 116, 119, 120, 121, 126, 128, 129, 133, 138, 150, 149, 152, 165, 167, 
171, 168, 169, 175, 185, 187, 193, 200, 202, 211, 216, 217, 219, 223, 227, 228, 231, 
234, 235, 241, 243, 264, 273, 278, 279, 284, 286, 288, 292, 306, 304, 305, 310, 309,
 317, 319, 324, 327, 326, 336, 338, 341, 343, 345, 354, 355, 357, 364, 363, 370, 374, 
 377, 381, 384, 396, 404, 405, 408, 413, 417, 420, 422, 423, 424, 425, 428, 430, 431, 
 437, 443, 439, 442, 458, 454, 463, 465, 475, 477, 488, 482, 491, 487, 490, 494, 499, 
 498, 497, 501, 510, 512, 518, 519, 522, 524, 530, 539, 540, 547, 544, 554, 557, 571, 
 569, 577, 583, 584, 591, 601, 609, 610, 613, 614, 615, 616, 624, 621, 631, 639, 648, 
 651, 662, 671, 677, 680, 682, 685, 690, 695, 697, 700, 703, 716, 714, 715, 723, 724,
   725, 729, 730]
"""
# print(f"The number of patients outside the limit is: {len(enc_ids_outside_limit)}") # 143
# Pseudonymised & finished (subset) — translated from Dutch ("gepseudonimiseerd & af");
# TODO confirm what "finished" refers to. The values are enc_id values: the NICU and IC
# encounters selected in the data-loading cell all appear in these lists.
NICU = [20, 107, 116, 129, 150]
IC = [48, 55, 63, 67, 69, 68, 71]