In [None]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken

from discharge_docs.prompts.prompt import (
    load_prompts,
    load_template_prompt,
)

# NOTE(review): an empty TIKTOKEN_CACHE_DIR presumably disables tiktoken's
# on-disk cache so the encoding is not written to a cache dir — confirm.
os.environ["TIKTOKEN_CACHE_DIR"] = ""

# Very large column width so long text values are shown without truncation
pd.options.display.max_colwidth = 1000000

In [None]:
# Resolve the data file relative to the parent of the current working directory
# (not the location of this notebook file), so the notebook must be started
# from a direct subfolder of the directory that contains "data/".
current_path = Path.cwd().parent
relative_path = current_path / "data" / "processed" / "metavision_data_april_dp.parquet"
df_discharge = pd.read_parquet(relative_path)

In [None]:
# Load the generic user/system prompts and one template prompt per department
user_prompt, system_prompt = load_prompts()
template_prompt_NICU = load_template_prompt("NICU")
template_prompt_IC = load_template_prompt("IC")
template_prompt_CAR = load_template_prompt("CAR")
template_prompt_PSY = load_template_prompt("PSY")
# Lookup from lower-case department key to its template prompt;
# the "demo" key reuses the NICU template.
template_prompt_dict = {
    "nicu": template_prompt_NICU,
    "ic": template_prompt_IC,
    "car": template_prompt_CAR,
    "psy": template_prompt_PSY,
    "demo": template_prompt_NICU,
}

In [None]:
# Number of rows per department
df_discharge["department"].value_counts()

In [None]:
# Preview the first rows of the loaded data
df_discharge.head()

In [None]:
# Add the token length: split the encounters into those whose full prompt fits
# within the token budget (encs) and those that are too long (longer_encs).
MAX_PROMPT_TOKENS = 110000  # upper bound on prompt tokens (fixed prompts + patient data)

encoding = tiktoken.get_encoding("cl100k_base")

# The fixed part of the prompt is the same for every encounter, so it is encoded
# once instead of once per encounter.
# NOTE(review): the NICU template is used as the length estimate for every
# department — confirm the other templates are not substantially longer.
template_prompt_length = len(
    encoding.encode(user_prompt + system_prompt + template_prompt_NICU)
)

encs = []
longer_encs = []
for enc_id in np.sort(df_discharge["enc_id"].unique()):
    # All text values of one encounter, concatenated into a single string
    patient_data_string = " ".join(
        df_discharge[df_discharge["enc_id"] == enc_id]["value"]
    )
    patient_data_length = len(encoding.encode(patient_data_string))
    if patient_data_length + template_prompt_length < MAX_PROMPT_TOKENS:
        encs.append(enc_id)
    else:
        longer_encs.append(enc_id)

print(longer_encs)  # remove these from batching

Remove the enc_ids that are too long for GPT

Hard-coded so that the assignment remains reproducible.


In [None]:
# Encounter ids that are too long for GPT. Hard-coded instead of reusing
# `longer_encs` so the exclusion does not change when the token count is re-run.
longer_ids_hardcoded = [0, 1, 2, 3, 5, 6, 21]

In [None]:
# Number of distinct encounters before removing the too-long ones
df_discharge["enc_id"].nunique()

In [None]:
# Drop the too-long encounters. This overwrites `df_discharge`, so every later
# cell (and any earlier cell that is re-run) sees the reduced frame.
df_discharge = df_discharge[~df_discharge["enc_id"].isin(longer_ids_hardcoded)]

In [None]:
# Number of distinct encounters left after removing the too-long ones
df_discharge["enc_id"].nunique()

In [None]:
# Keep only the rows that hold a discharge letter ("Ontslagbrief")
df_discharge_filtered = df_discharge.loc[df_discharge["description"].eq("Ontslagbrief")]

In [None]:
# Inspect the discharge letter text of encounter 11
df_discharge_filtered[df_discharge_filtered["enc_id"] == 11][["enc_id", "value"]]

In [None]:
# Discharge letters of the department "Intensive Care Centrum"
df_IC = df_discharge_filtered.loc[
    df_discharge_filtered["department"].eq("Intensive Care Centrum")
]

In [None]:
# Discharge letters of the department "Neonatologie"
df_NICU = df_discharge_filtered.loc[
    df_discharge_filtered["department"].eq("Neonatologie")
]

### NICU

The NICU discharge letters contain some incomplete letters, so we will filter them out

In [None]:
# NICU encounter ids that were already presented in the first round; they are
# excluded from the later batches. Despite the name, the list holds three ids.
fist_4_ids_nicu = [201, 41, 244]

In [None]:
# Substrings that mark a NICU letter as incomplete. Note that "G-" already
# matches every letter that contains "G-schijf".
exclusion_strings = ["..", "G-schijf", "G-", "G schijf"]

# enc_id of every NICU letter row that contains none of the markers
# (one entry per row, so an enc_id can occur more than once).
potential_nicu_letters = [
    letter["enc_id"]
    for _, letter in df_NICU.iterrows()
    if not any(marker in letter["value"] for marker in exclusion_strings)
]

In [None]:
# Number of NICU letter rows that passed the completeness filter
len(potential_nicu_letters)

In [None]:
# Inspect the NICU discharge letter text of encounter 38
df_NICU[df_NICU["enc_id"] == 38][["enc_id", "value"]]

### IC

IC looks good. Can use random samples.

In [None]:
# IC encounter ids that were already presented in the first round; they are
# excluded from the later batches.
fist_4_ids_ic = [373, 304, 437, 24]

In [None]:
# Inspect the IC discharge letter text of encounter 24 (one of the first-round ids)
df_IC[df_IC["enc_id"] == 24][["enc_id", "value"]]

In [None]:
# Every distinct IC encounter id is a candidate; no completeness filter is applied
potential_ic_letters = list(df_IC["enc_id"].unique())

### Batching

**Eerste idee voor het samplen van de ontslagbrieven:**

We nemen een sample van 100 EPD/ontslagbrief-combinaties: 1 willekeurige en 33 van elke afdeling, uniform verdeeld qua lengte tussen het kortste dossier en het langst acceptabele dossier voor GPT. De ene willekeurige wordt van tevoren door een arts geannoteerd, te gebruiken bij de kickoff. 6 (2 van elke afdeling) worden gebruikt voor de burn-in en dus gescoord door alle 3 de studenten en geëvalueerd tijdens de tweede sessie. De overige 93 (31 per afdeling) willen we verdelen over batches, zodat we, mochten we in totaal niet tot de 100 komen, toch voldoende overlap tussen de studenten hebben en de studenten zowel de GPT-brief als de artsbrief die bij een opname hoort hebben gezien.

Per batch van 30: 6 overlappende (2 per afdeling) en 3x8 unieke per student. Deze worden in een willekeurige volgorde gezet over de GPT- en artsbrieven, dus per batch 2*(6+8) = 28 te annoteren brieven per student in willekeurige volgorde. De laatste brief kan er bij een willekeurige student bij komen. Deze batches kunnen allemaal van tevoren worden gegenereerd, zodat de studenten niet hoeven te wachten na een batch, maar er wel voldoende overlap blijft bestaan en zowel de GPT- als de artsbrief door dezelfde student binnen een batch zijn beoordeeld.

**Aangepaste versie**

Doordat er wat dingen anders zijn gelopen dan in het initiële plan, is de opzet van het batchen ook wat veranderd. Het idee is dat de gewenste overlap gelijk blijft: 20%. We hebben voor de kickoff een neppatiënt gebruikt, dus deze hoefde niet geëxcludeerd te worden. We hebben in eerste instantie 3 ID's van de NICU voorgelegd en 4 van de IC. Deze hebben we voor het maken van de volgende batches eruit gefilterd. Voor het samplen gebruiken we ook niet de token length van het dossier, maar de LoS (length of stay) als proxy. Dit zorgt ervoor dat we niet met bins hoeven te werken.

In de nieuwe methode worden eerst dossiers uniform over de LoS gesampled en deze daarna over de studenten verdeeld, dusdanig dat ~elke vijfde brief van elke afdeling overlapt.

In [None]:
# Candidate ids minus the ids that were already presented in the first round
remaining_nicu_letters = set(potential_nicu_letters).difference(fist_4_ids_nicu)
remaining_ic_letters = set(potential_ic_letters).difference(fist_4_ids_ic)
print(f"Length remaining nicu letters: {len(remaining_nicu_letters)}")
print(f"Length remaining ic letters: {len(remaining_ic_letters)}")

In [None]:
# Reduce both departments to the remaining encounters and keep only the two
# columns that the sampling function reads.
df_NICU_processed = df_NICU.loc[
    df_NICU["enc_id"].isin(remaining_nicu_letters), ["enc_id", "length_of_stay"]
]
df_IC_processed = df_IC.loc[
    df_IC["enc_id"].isin(remaining_ic_letters), ["enc_id", "length_of_stay"]
]

#### Sampling function

In [None]:
def create_ordered_id_list(df_los):
    """Return all encounter ids in a random order sampled over length of stay.

    The ids are grouped by their exact ``length_of_stay`` value and every group
    is shuffled. The result is then built by repeatedly picking one of the
    remaining groups uniformly at random and taking its next id, until all ids
    have been used.

    The function draws from the global ``random`` module state; call
    ``random.seed`` beforehand to get a reproducible order.

    Parameters
    ----------
    df_los : pd.DataFrame
        dataframe containing the columns ``enc_id`` and ``length_of_stay``

    Returns
    -------
    list
        the ``enc_id`` of every row of ``df_los`` (one entry per row), in the
        sampled order
    """

    # Group the enc_ids by length of stay: key = length, value = list of enc_ids.
    # The columns are read directly instead of via ``iterrows()``, because
    # ``iterrows()`` upcasts each row to a common dtype and would turn integer
    # enc_ids into floats whenever ``length_of_stay`` is a float column.
    length_enc_id_dict = {}
    for enc_id, length in zip(
        df_los["enc_id"].tolist(), df_los["length_of_stay"].tolist()
    ):
        length_enc_id_dict.setdefault(length, []).append(enc_id)

    # Randomise the order within every length-of-stay group
    for enc_ids in length_enc_id_dict.values():
        random.shuffle(enc_ids)

    # Pick a random remaining group, move its first id to the result and drop
    # the group once it is exhausted. Repeat until no group is left.
    ordered_ids = []
    while length_enc_id_dict:
        length = random.choice(list(length_enc_id_dict))
        enc_ids = length_enc_id_dict[length]
        ordered_ids.append(enc_ids.pop(0))
        if not enc_ids:
            del length_enc_id_dict[length]

    return ordered_ids

In [None]:
# Take a list with students, and a dict with departments and ids
# divide the ids among the students such that every overlap_rounds all students get the
#  same
# id for the department


def divide_ids_among_students(students, department_id_dict, overlap_rounds=5):
    """Divide the ids of every department among the students with regular overlap.

    The departments are visited round-robin. Per department a counter decides
    what happens on each visit:

    * when the counter has reached ``overlap_rounds``, the next id is given to
      *all* students (an overlapping id) and the counter is lowered by
      ``overlap_rounds``;
    * otherwise every student receives their own next id and the counter is
      raised by one per assigned id.

    The counter starts at ``overlap_rounds``, so the first id of every
    department is an overlapping id. Assignment stops at the end of the round in
    which the first department runs out of ids.

    The id lists in ``department_id_dict`` are consumed in place: assigned ids
    are removed from them and unassigned ids are left behind.

    Parameters
    ----------
    students : list
        list of students
    department_id_dict : dict
        dict with department as key and list of ids as value
    overlap_rounds : int, optional
        number of individually assigned ids per department after which one id
        is shared by all students, by default 5

    Returns
    -------
    tuple of (dict, dict)
        the first dict has student as key and the list of assigned ids as
        value; the second has student as key and, in the same order, the
        department each assigned id came from

    Raises
    ------
    ValueError
        if ``overlap_rounds`` is not positive
    """
    if overlap_rounds <= 0:
        raise ValueError("overlap_rounds must be positive")

    student_id_dict = {student: [] for student in students}
    student_dept_dict = {student: [] for student in students}

    # Nothing can be distributed without students, without departments, or when
    # a department has no ids from the start; return the empty assignment
    # instead of looping forever or popping from an empty list.
    if (
        not students
        or not department_id_dict
        or any(len(ids) == 0 for ids in department_id_dict.values())
    ):
        return student_id_dict, student_dept_dict

    # For determining when an overlapping id needs to be assigned
    department_overlap_dict = dict.fromkeys(department_id_dict, overlap_rounds)

    empty_dep = False
    while not empty_dep:
        for department, remaining_ids in department_id_dict.items():
            if department_overlap_dict[department] // overlap_rounds > 0:
                # Overlapping id: every student gets the same id
                id_to_add = remaining_ids.pop(0)
                department_overlap_dict[department] -= overlap_rounds

                for student in students:
                    student_id_dict[student].append(id_to_add)
                    student_dept_dict[student].append(department)

                # stop when one of the departments has no more ids to assign
                if not remaining_ids:
                    empty_dep = True
            else:
                # Individual ids: every student gets their own id
                for student in students:
                    id_to_add = remaining_ids.pop(0)
                    department_overlap_dict[department] += 1

                    # Assign before checking for exhaustion, so the last id of
                    # a department is not silently dropped.
                    student_id_dict[student].append(id_to_add)
                    student_dept_dict[student].append(department)

                    # stop when one of the departments has no more ids to assign
                    if not remaining_ids:
                        empty_dep = True
                        break

    return student_id_dict, student_dept_dict

In [None]:
# Fixed seed so the sampled order, and with it the assignment, is reproducible
random.seed(1000)

students = ["student_1", "student_2"]

# NICU is sampled before IC; both calls draw from the same seeded random state
nicu_ordered = create_ordered_id_list(df_NICU_processed)
ic_ordered = create_ordered_id_list(df_IC_processed)
department_dict = {"NICU": nicu_ordered, "IC": ic_ordered}

# The id lists in department_dict are consumed by this call
id_assignment, dpt_assignment = divide_ids_among_students(
    students, department_dict, overlap_rounds=5
)

In [None]:
# Per student: the assigned ids and, in the same order, the department of each id
ids_student_1, dpt_student_1 = id_assignment["student_1"], dpt_assignment["student_1"]
ids_student_2, dpt_student_2 = id_assignment["student_2"], dpt_assignment["student_2"]

print(ids_student_1)
print(dpt_student_1)
print(ids_student_2)
print(dpt_student_2)