In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken

In [None]:
# Read the raw April MetaVision export from <parent of cwd>/data/raw
# and show it.
data = pd.read_parquet(
    Path.cwd().parent.joinpath(
        "data", "raw", "pseudonomised_metavision_data_april.parquet"
    )
)
display(data)

In [None]:
# Show the column labels of the frame loaded above.
data.columns

In [None]:
# Keep only the rows that have a value in "code_display_original"
# (missing values in other columns are left untouched).
data = data[data["code_display_original"].notna()]

# All distinct codes, sorted.
print(data["code_display_original"].sort_values().unique())



# Distinct codes observed at each location.
grouped = data.groupby("location_Location_value_original")[
    "code_display_original"
].unique()
display(grouped)

# Print the sorted codes of every location, followed by blank lines.
for loc_name, loc_codes in grouped.items():
    print(
        f"Location: {loc_name}",
        f"Codes: {np.sort(loc_codes)}",
        "\n",
        sep="\n",
    )

In [None]:
# Replace `data` with the processed April file from
# <parent of cwd>/data/processed and show it.
data = pd.read_parquet(
    Path.cwd().parent.joinpath(
        "data", "processed", "metavision_data_april_dp.parquet"
    )
)
display(data)

In [None]:
# Number of distinct encounters per department. Wrapped in display() because
# a bare expression that is not the last line of a cell produces no output.
display(data.groupby("department")["enc_id"].nunique())

# Randomly pick up to five distinct encounters per department.
# The (department, enc_id) pairs are de-duplicated first so that every
# encounter has the same chance of being drawn regardless of how many rows it
# has, and so the same enc_id cannot be drawn twice. Shuffling and then taking
# the first five rows of each group also copes with departments that have
# fewer than five encounters (groupby(...).sample(5) would raise there).
random_sample = (
    data[["department", "enc_id"]]
    .drop_duplicates()
    .sample(frac=1, random_state=46)
    .groupby("department")
    .head(5)
    .sort_values("department", kind="stable")
)
display(random_sample)


In [None]:
# Restrict the data to rows whose enc_id occurs in random_sample.
df = data.loc[data["enc_id"].isin(random_sample["enc_id"])]
display(df)
# Of those rows, show the ones whose description is "Ontslagbrief".
display(df.loc[df["description"] == "Ontslagbrief"])

In [None]:
encoding = tiktoken.get_encoding("cl100k_base")

# Encounters whose concatenated text reaches this many tokens are flagged for
# removal. NOTE(review): presumably the context window of the target model --
# confirm.
MAX_TOKENS = 128000

# enc_ids of every encounter, across all departments, that hit the limit.
encs_to_remove = []

# sort=False keeps the order of first appearance; observed=True avoids
# iterating over unused categories if a key column is categorical.
for dep, df_temp in data.groupby("department", sort=False, observed=True):
    print("department:", dep)
    encs = []  # encounters of this department below the limit
    longer_encs = []  # encounters of this department at or above the limit
    for enc_id, enc_rows in df_temp.groupby("enc_id", sort=False, observed=True):
        # All "value" entries of the encounter joined into one text.
        # NOTE(review): assumes every entry in "value" is a string -- a
        # missing value would make join() raise TypeError.
        patient_data_string = " ".join(enc_rows["value"])
        # Tokenise once and reuse the count for the log line and the check.
        n_tokens = len(encoding.encode(patient_data_string))
        print(f"The number of tokens in encounter {enc_id}: {n_tokens}")
        print(f'length of stay was {enc_rows["length_of_stay"].unique()}')
        if n_tokens < MAX_TOKENS:
            encs.append(enc_id)
        else:
            longer_encs.append(enc_id)
            encs_to_remove.append(enc_id)
            print(f"Encounter {enc_id} has {MAX_TOKENS} tokens or more")

    print(encs)
    print(longer_encs)
print(encs_to_remove)

In [None]:
# Drop every row belonging to an encounter listed in encs_to_remove.
data = data[~data["enc_id"].isin(encs_to_remove)]
# Remaining distinct encounters per department. Wrapped in display() because
# a bare expression that is not the last line of a cell produces no output.
display(data.groupby("department")["enc_id"].nunique())
# for april it was only 3 for the IC and 7 for the NICU that were removed

# Save back to parquet.
# NOTE: this overwrites the same processed file that `data` was loaded from,
# so the removed encounters are no longer available from that file afterwards.
data.to_parquet(
    Path.cwd().parent / "data" / "processed" / "metavision_data_april_dp.parquet"
)