In [None]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken
import tomli
import tomli_w

from discharge_docs.dashboard.batching_helper import (
    create_ordered_id_list,
    divide_ids_among_students,
)
from discharge_docs.prompts.prompt import (
    load_prompts,
    load_template_prompt,
)

# Notebook-wide configuration.
# An empty cache directory is understood to switch off tiktoken's on-disk
# cache -- TODO confirm against the tiktoken documentation.
os.environ.update({"TIKTOKEN_CACHE_DIR": ""})

# Effectively disable column-width truncation when DataFrames are displayed.
pd.set_option("display.max_colwidth", 1_000_000)


In [None]:
# Directory with the processed pre-pilot extracts, resolved two levels above
# the current working directory.
path = Path.cwd().parent.parent.joinpath("data", "processed", "pre-pilot")

# Read the pre-pilot parquet file and show it ordered by discharge date.
data = pd.read_parquet(path.joinpath("HiX_CAR_data_pre_pilot_may.parquet"))
display(data.sort_values(by="dischargeDate"))




In [None]:

# Load the shared user/system prompts and the per-department template prompts.
user_prompt, system_prompt = load_prompts()
template_prompt_NICU = load_template_prompt("NICU")
template_prompt_IC = load_template_prompt("IC")
template_prompt_CAR = load_template_prompt("CAR")
template_prompt_PSY = load_template_prompt("PSY")
template_prompt_dict = {
    "nicu": template_prompt_NICU,
    "ic": template_prompt_IC,
    "car": template_prompt_CAR,
    "psy": template_prompt_PSY,
    "demo": template_prompt_NICU,
}

# Upper bound (exclusive) on prompt tokens + patient-data tokens per encounter.
MAX_TOTAL_TOKENS = 110000

# Add token length: split the encounters into those that fit the budget and
# those that do not.
encoding = tiktoken.get_encoding("cl100k_base")

# The prompt overhead is the same for every encounter, so it is tokenised once
# instead of on every loop iteration.
template_prompt_length = len(
    encoding.encode(user_prompt + system_prompt + template_prompt_CAR)
)

encs = []
longer_encs = []
for enc_id in np.sort(data["enc_id"].unique()):
    # All "value" entries of one encounter, joined into a single string.
    patient_data_string = " ".join(data.loc[data["enc_id"] == enc_id, "value"])
    total_tokens = len(encoding.encode(patient_data_string)) + template_prompt_length
    if total_tokens < MAX_TOTAL_TOKENS:
        encs.append(enc_id)
    else:
        longer_encs.append(enc_id)

print('The following encounters are too long and should be removed:')
print(longer_encs)
print('The following encounters are short enough:')
print(encs)

# Keep only the encounters that fit within the token budget.
data = data[data["enc_id"].isin(encs)]
print('removal done')


In [None]:
# Manually excluded encounters: each contains a line saying it is not complete.
to_remove_enc = [1018, 1050, 1065]

keep_mask = ~data["enc_id"].isin(to_remove_enc)
data = data[keep_mask]

n_removed = len(to_remove_enc)
n_remaining = len(data.enc_id.unique())
print(f'removal done of the {n_removed} encounters')
print(f'remainig encounters: {n_remaining}')

In [None]:
# Work on an independent copy so later steps do not touch the filtered frame.
data_processed = data.copy(deep=True)
# Column selection; raises KeyError when either column is absent.
data_processed.loc[:, ["enc_id", "length_of_stay"]]
# Preview of the first rows (this is the cell's displayed output).
data_processed.head(5)

In [None]:
# Fix the seed of Python's `random` module so repeated runs give the same result.
# NOTE(review): this only helps if the helpers below draw from `random` -- confirm.
random.seed(1000)

students = ["student_1", "student_2"]
data_ordered = create_ordered_id_list(data_processed)
department_dict = dict(CAR=data_ordered)

# Third argument is passed through unchanged; its meaning is defined by
# divide_ids_among_students -- TODO document what 5 stands for.
id_assignment, dpt_assignment = divide_ids_among_students(
    students,
    department_dict,
    5,
)

In [None]:
# Pull out each student's ids and departments and print them, student by student.
ids_student_1, dpt_student_1 = id_assignment["student_1"], dpt_assignment["student_1"]
print(ids_student_1, dpt_student_1, sep="\n")

ids_student_2, dpt_student_2 = id_assignment["student_2"], dpt_assignment["student_2"]
print(ids_student_2, dpt_student_2, sep="\n")

In [None]:
# Keep only the first `n` entries per student (ids and departments alike).
n = 40
ids_student_1, ids_student_2 = ids_student_1[:n], ids_student_2[:n]
dpt_student_1, dpt_student_2 = dpt_student_1[:n], dpt_student_2[:n]


In [None]:
# Save the phase 1 assignment to its TOML file.
#
# The loaded document is kept under its own name so that the `data` DataFrame
# from the earlier cells is not overwritten and those cells stay re-runnable.
phase1_toml_path = (
    Path.cwd().parents[1]
    / "src"
    / "discharge_docs"
    / "dashboard"
    / "enc_ids_pre_release_phase1_1.toml"
)

with phase1_toml_path.open("rb") as f:
    phase1_config = tomli.load(f)

# Replace only the ids and departments of each student; every other key
# already present in the file is written back unchanged.
phase1_config["student_1"]["ids"] = ids_student_1
phase1_config["student_1"]["department"] = dpt_student_1

phase1_config["student_2"]["ids"] = ids_student_2
phase1_config["student_2"]["department"] = dpt_student_2

# Serialise before touching the file: if the document cannot be serialised,
# the existing file is left intact instead of being truncated by open("wb").
phase1_toml_path.write_bytes(tomli_w.dumps(phase1_config).encode("utf-8"))

In [None]:
# Save the phase 2 id list to its TOML file.
#
# The loaded document is kept under its own name so that the `data` DataFrame
# from the earlier cells is not overwritten and those cells stay re-runnable.
phase2_toml_path = (
    Path.cwd().parents[1]
    / "src"
    / "discharge_docs"
    / "dashboard"
    / "enc_ids_pre_release_phase2.toml"
)

with phase2_toml_path.open("rb") as f:
    phase2_config = tomli.load(f)

# Union of both students' ids, de-duplicated in first-seen order.
# Built without mutating ids_student_1, so re-running this cell or the
# phase 1 cell afterwards cannot leak student 2's ids into student 1's list.
ids_phase_2 = list(dict.fromkeys([*ids_student_1, *ids_student_2]))

phase2_config["CAR"]["ids"] = ids_phase_2
# Remove the NICU and IC sections from the document, if present.
phase2_config.pop("NICU", None)
phase2_config.pop("IC", None)

# Serialise before touching the file: if the document cannot be serialised,
# the existing file is left intact instead of being truncated by open("wb").
phase2_toml_path.write_bytes(tomli_w.dumps(phase2_config).encode("utf-8"))
