# Exploratory analysis of the new Metavision export

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to the project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import tiktoken

from discharge_docs.processing.processing import get_patient_file

# An empty TIKTOKEN_CACHE_DIR makes tiktoken skip its on-disk cache
# (per tiktoken's loader; confirm if the tiktoken version changes).
os.environ["TIKTOKEN_CACHE_DIR"] = ""

In [None]:
# Load the processed Metavision export; the relative path is resolved against
# the current working directory.
df = pd.read_parquet(path="../data/processed/metavision_new_data.parquet")

In [None]:
# Show the first five rows as a quick sanity check of the loaded frame.
display(df.head(n=5))

In [None]:
# Number of distinct encounters (enc_id) per department.
df.groupby("department")["enc_id"].nunique()

In [None]:
# Distribution of data points and patients over the departments.
print("Totaal aantal datapunten:")
print(df["department"].value_counts())

print("Aantal patienten:")
patients_per_department = df.groupby("department")["enc_id"].nunique()
total_patients = patients_per_department.sum()
# autopct is called with each wedge's percentage. Convert it back to the
# patient count so the labels agree with the title; the former "%d" format
# showed truncated percentages that read like counts.
patients_per_department.plot(
    kind="pie",
    autopct=lambda pct: f"{pct * total_patients / 100:.0f}",
)
plt.ylabel("")
plt.title("Aantal patienten per afdeling in de data")
plt.show()

In [None]:
# Length of stay: reduce to one value per encounter first, so encounters with
# many rows are not counted more often than encounters with few rows.
los_per_encounter = df.groupby(["enc_id"])["length_of_stay"].mean()
los_per_encounter.plot(kind="hist", bins=50)
plt.xlabel("length of stay (days)")
plt.show()

# Average length of stay per department, averaged over encounters rather than
# over raw rows (the header previously announced the average age by mistake).
print("Gemiddelde opnameduur per afdeling:")
print(
    df.groupby(["department", "enc_id"])["length_of_stay"]
    .mean()
    .groupby(level="department")
    .mean()
)

In [None]:
def get_token_length(enc_id):
    """Count the cl100k_base tokens in the patient file of one encounter.

    The patient file is built by ``get_patient_file`` from the notebook-level
    ``df``.

    Args:
        enc_id: Encounter identifier passed on to ``get_patient_file``.

    Returns:
        A ``(enc_id, department, token_length)`` tuple, where ``department`` is
        the first ``department`` value of the returned patient file and
        ``token_length`` is the number of tokens in its text representation.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    file_text, file_frame = get_patient_file(df, enc_id)
    first_department = file_frame.department.values[0]
    n_tokens = len(tokenizer.encode(file_text))
    return enc_id, first_department, n_tokens

# Token length of every encounter's patient file, gathered into one frame.
results = [get_token_length(enc_id) for enc_id in df.enc_id.unique()]

df_results = pd.DataFrame(results, columns=["enc_id", "department", "token_length"])
display(df_results)

# Mean and maximum token length per department.
tokens_by_department = df_results.groupby("department")["token_length"]
print(tokens_by_department.mean())
print(tokens_by_department.max())

In [None]:

# Token count of every encounter's patient file, paired with that encounter's
# length of stay. Both lists follow the order in which encounters first appear.
encoding = tiktoken.get_encoding("cl100k_base")

file_sizes = []
for enc_id in df["enc_id"].unique():
    # Only the text is needed here; the accompanying patient frame is unused.
    patient_data_string, _patient_file = get_patient_file(df, enc_id)
    file_sizes.append(len(encoding.encode(patient_data_string)))

# Take the length of stay from the first row of each encounter in one pass,
# instead of re-filtering the whole frame once per encounter.
length_of_stay = df.drop_duplicates(subset="enc_id")["length_of_stay"].tolist()

In [None]:
# Histogram of the patient-file sizes collected in file_sizes, in 50 bins.
axes = plt.gca()
axes.hist(file_sizes, bins=50)
plt.show()

In [None]:
# Patient-file size against length of stay, restricted to files of
# 16,000-200,000 tokens and stays of at most 20.
plt.scatter(file_sizes, length_of_stay)
plt.xlim(16000, 200000)
plt.ylim(0, 20)
plt.xlabel("patient file size (tokens)")
plt.ylabel("length of stay (days)")
# Render explicitly, like the other plotting cells, so the cell does not echo
# the return value of plt.ylim.
plt.show()