# Exploratory analysis of Metavision data

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import tiktoken

# NOTE(review): an empty TIKTOKEN_CACHE_DIR presumably switches off tiktoken's
# on-disk cache, so the encoding file is fetched on every load — confirm
# against the tiktoken version in use.
os.environ["TIKTOKEN_CACHE_DIR"] = ""

In [None]:
# Load the pseudonymised Metavision export into a DataFrame.
df = pd.read_parquet(path="../data/raw/pseudonomised_metavision_data.parquet")

In [None]:
# Preview the first five rows to get a feel for the columns and their values.
display(df.head(n=5))

In [None]:
# For every column except the admission period bounds, list the distinct
# values and how many of them there are.
skipped_columns = ("period_start", "period_end")
for column in df.columns:
    if column in skipped_columns:
        continue
    distinct = df[column].unique()
    print(f"Column: {column}")
    print(f"Unique Values: {distinct}")
    print(f"Number of Unique Values: {len(distinct)}")
    print()

In [None]:
# PROCESSING: give the location column a shorter, readable name
df = df.rename({"location_Location_value_original": "department"}, axis="columns")

In [None]:
# Distribution of data points and patients over the departments
datapoints_per_department = df["department"].value_counts()
print("Totaal aantal datapunten:")
print(datapoints_per_department)

# Count each patient once per department and show the shares as a pie chart.
patients_per_department = df.groupby("department")["pseudo_id"].nunique()
print("Aantal patienten:")
patients_per_department.plot(kind="pie", autopct="%d")
plt.show()

In [None]:
# Compare pseudo_id and enc_id: how do patients and encounters map onto each other?
# NOTE: the conclusions printed below are fixed texts written for the data at
# hand; read them together with the True/False printed just above each one.

# Number of distinct patients seen under each encounter.
enc_id_pseudo_id_count = df.groupby("enc_id")["pseudo_id"].nunique()
every_encounter_has_one_patient = (enc_id_pseudo_id_count == 1).all()
print(every_encounter_has_one_patient)
print("so all encounters have a specific patient")

# Number of distinct encounters for each patient.
pseudo_id_enc_id_count = df.groupby("pseudo_id")["enc_id"].nunique()
every_patient_has_one_encounter = (pseudo_id_enc_id_count == 1).all()
print(every_patient_has_one_encounter)
print("so not all patients have a single encounter")

pseudo_id_enc_id_count.plot(kind="hist")
plt.xlabel("number of enc_id per pseudo_id")
plt.show()

# Single out the patients with exactly two encounters.
has_two_encounters = pseudo_id_enc_id_count == 2
print(f"{has_two_encounters.sum()} patients have two encounters")

print("patients that have two encounters:")
patients_with_two_encounters = pseudo_id_enc_id_count[has_two_encounters].index
print(patients_with_two_encounters)

In [None]:
# Length of stay:
# Derive the length of stay from the admission period and inspect its
# distribution, overall and per department.
df["period_start"] = pd.to_datetime(df["period_start"])
df["period_end"] = pd.to_datetime(df["period_end"])
# .dt.days keeps only the whole-day component of the duration, so a stay
# shorter than 24 hours is recorded as 0 days.
df["length_of_stay"] = (df["period_end"] - df["period_start"]).dt.days
print(df.groupby("enc_id")["length_of_stay"].nunique().eq(1).all())
print(
    "so all encounters have a single length of stay, which is to be expected as"
    + " an encounter is a single admission"
)
df.groupby(["enc_id"])["length_of_stay"].mean().plot(kind="hist", bins=50)
plt.xlabel("length of stay (days)")
plt.show()


def _plot_length_of_stay_by_department(title, alpha, xlim=None):
    """Overlay one length-of-stay histogram per department and show the figure.

    Args:
        title: Title of the figure.
        alpha: Opacity of each histogram, so overlapping departments stay visible.
        xlim: Optional ``[low, high]`` x-axis limits; ``None`` keeps the
            automatic limits.
    """
    for category in df.department.unique():
        subset = df[df["department"] == category]
        # One value per encounter, so long stays with many rows do not dominate.
        subset.groupby(["enc_id"])["length_of_stay"].mean().plot(
            kind="hist", bins=50, label=category, alpha=alpha
        )

    plt.xlabel("Length of Stay (days)")
    plt.ylabel("Frequency")
    plt.title(title)
    if xlim is not None:
        plt.xlim(xlim)
    plt.legend()
    plt.show()


# Full range first, then the same plot zoomed in on the first 20 days.
_plot_length_of_stay_by_department(
    "Histogram of Length of Stay by Location", alpha=0.2
)
_plot_length_of_stay_by_department(
    "Histogram of Length of Stay by Location - zoomed in", alpha=0.4, xlim=[0, 20]
)

In [None]:
# PROCESSING: drop all rows that belong to a stay of 0 days
encounters_before = df.enc_id.nunique()
print(f"before removing encounters with a length of stay of 0:{encounters_before}")

is_nonzero_stay = df["length_of_stay"].ne(0)
df = df.loc[is_nonzero_stay]

encounters_after = df.enc_id.nunique()
print(f"after removing encounters with a length of stay of 0:{encounters_after}")

In [None]:
# valueString column: per-encounter statistics of the free-text values.
# Each aggregate is computed once here and reused by all plots below.
value_strings_per_encounter = df.groupby("enc_id")["valueString"]
unique_strings_per_encounter = value_strings_per_encounter.nunique()
mean_length_per_encounter = value_strings_per_encounter.apply(
    lambda x: x.str.len().mean()
)
total_length_per_encounter = value_strings_per_encounter.apply(
    lambda x: x.str.len().sum()
)

unique_strings_per_encounter.plot(kind="hist", bins=100)
plt.xlabel("number of unique valueStrings per encounter")
plt.show()

mean_length_per_encounter.plot(kind="hist", bins=100)
plt.xlabel("mean length of valueString per encounter")
plt.show()

total_length_per_encounter.plot(kind="hist", bins=100)
plt.xlabel("total length of valueString per encounter")
plt.show()

# All three aggregates come from the same groupby, so they share the enc_id
# order and can be plotted against each other directly.
plt.scatter(mean_length_per_encounter, total_length_per_encounter)
plt.ylabel("Total length of valueString per encounter")
plt.xlabel("Mean length of valueString per encounter")
plt.show()

plt.scatter(unique_strings_per_encounter, total_length_per_encounter)
plt.ylabel("Total length of valueString per encounter")
plt.xlabel("Number of unique valueStrings per encounter")
plt.show()

In [None]:
# Relate the amount of text of an encounter to its length of stay
per_encounter = df.groupby("enc_id")
stay_per_encounter = per_encounter["length_of_stay"].mean()
text_per_encounter = per_encounter["valueString"].apply(
    lambda values: values.str.len().sum()
)

plt.scatter(stay_per_encounter, text_per_encounter)
plt.xlabel("Length of stay per encounter")
plt.ylabel("Total length of valueString per encounter")
plt.show()

# Open question: how much text is everything from one day together? + summary of one day

In [None]:
# Frequency with which each status is filled in
def _entries_per_day(encounter):
    """Return the count of each status in one encounter divided by its mean length of stay."""
    return (
        encounter.code_display_original.value_counts()
        / encounter.length_of_stay.mean()
    )


frequenties = df.groupby("enc_id").apply(_entries_per_day).reset_index()
mean_frequency_per_status = frequenties.groupby("code_display_original")["count"].mean()
# Last expression of the cell: shown as the cell output, least frequent first.
mean_frequency_per_status.sort_values(ascending=True)

# TODO: take the most recent version of the statuses

In [None]:
# Looking for the discharge letter
discharge_letter_code = "Medische Ontslagbrief - Beloop"
ontslag_docu = df[
    df["code_display_original"].isin(
        [discharge_letter_code, "Medische ontslagbrief - Beloop Dictionary"]
    )
]
# Average number of records per encounter for each candidate code.
frequenties = (
    ontslag_docu.groupby("enc_id")["code_display_original"].value_counts().reset_index()
)
print(
    frequenties.groupby("code_display_original")["count"]
    .mean()
    .sort_values(ascending=True)
)
# Working hypothesis: the discharge letter is "Medische Ontslagbrief - Beloop"
medische_ontslagbrief_beloop = ontslag_docu[
    ontslag_docu["code_display_original"] == discharge_letter_code
]
# Print the text of the second matching record as an example. Selecting by
# position always yields a single text, also when index labels repeat, and
# prints nothing when there are fewer than two records.
for value_string in medische_ontslagbrief_beloop["valueString"].iloc[1:2]:
    print(value_string)

# Does every encounter have a discharge letter?
# One boolean per encounter, computed once and reused for count and listing.
has_discharge_letter = df.groupby("enc_id")["code_display_original"].apply(
    lambda codes: discharge_letter_code in codes.values
)
num_encounters_with_brief = has_discharge_letter.sum()
total_encounters = df.enc_id.nunique()
# Guard against an empty frame so the percentage never divides by zero.
percentage_with_brief = (
    (num_encounters_with_brief / total_encounters) * 100 if total_encounters else 0.0
)

# The counts are encounters, not patients: a patient can have several encounters.
print(
    f"In {num_encounters_with_brief} van de {total_encounters} encounters "
    f"({percentage_with_brief:.2f}%) was er een ontslagbrief beloop stuk in Metavision"
)
# The encounters that have a discharge letter are:
encounters_met_ontslag_brief = has_discharge_letter[has_discharge_letter].index
print("Deze encounters zijn:")
print(encounters_met_ontslag_brief)

In [None]:
# How many words, characters and tokens are written per encounter per day?
df["nr_words"] = df["valueString"].str.split().str.len()
df["nr_characters"] = df["valueString"].str.len()

# Tokenise every text exactly once; the token count is the length of the encoding.
# NOTE(review): encode() is applied to every valueString as-is — confirm the
# column contains no missing values at this point.
encoding = tiktoken.get_encoding("cl100k_base")
encodings = df["valueString"].apply(encoding.encode)
df["nr_tokens"] = encodings.apply(len)

# Keep the actual encodings as well
df["encodings"] = encodings

df["date"] = df["period_start"].dt.date


def _plot_daily_totals(column, xlabel, token_limits=()):
    """Plot a histogram of ``column`` summed per encounter per day.

    A dashed black line marks the mean of the daily totals; every value in
    ``token_limits`` is marked with a labelled dashed red line.

    Args:
        column: Name of the numeric column of ``df`` to sum.
        xlabel: Label of the x-axis.
        token_limits: Context sizes to mark on the x-axis.
    """
    daily_totals = df.groupby(["enc_id", "date"])[column].sum()
    daily_totals.plot(kind="hist", bins=50)
    plt.xlabel(xlabel)
    plt.axvline(daily_totals.mean(), color="k", linestyle="dashed", linewidth=1)
    for token_limit in token_limits:
        plt.axvline(token_limit, color="r", linestyle="dashed", linewidth=1)
        plt.text(token_limit, -5, f"token limit {token_limit}", rotation=90)
    plt.show()


_plot_daily_totals("nr_words", "number of words per encounter per day")
# NOTE(review): the limits are token counts but are also drawn on the
# character histogram, as in the original analysis — confirm this is intended.
_plot_daily_totals(
    "nr_characters",
    "number of characters per encounter per day",
    token_limits=(4096, 16384),
)
_plot_daily_totals(
    "nr_tokens",
    "number of tokens per encounter per day",
    token_limits=(4096, 16384),
)

In [None]:
# PROCESSING: keep only the last row for each code_display_original per
# encounter per day
print(f"before removing duplicates on the same day: {df.shape}")

# numeric_only=True: the frame also holds text, date and token-list columns,
# for which a mean is undefined; pandas >= 2.0 raises a TypeError on them
# instead of silently leaving them out.
groupby_end_id = df.groupby(["enc_id", "code_display_original"]).mean(
    numeric_only=True
)
print(groupby_end_id)
print(groupby_end_id.groupby("code_display_original").mean())

# NOTE(review): rows are ordered by date only, so "last" within a day is the
# row that comes last in the existing row order — confirm that this is the
# most recent entry, or add a timestamp column to the sort keys.
df = df.sort_values(by=["enc_id", "date"])
df = df.drop_duplicates(
    subset=["enc_id", "date", "code_display_original"], keep="last"
)
# NOTE(review): this result is not the last expression of the cell, so it is
# computed but never shown — print or display it if the counts are wanted.
df.groupby("enc_id").apply(lambda x: Counter(x.code_display_original))
print(f"after removing duplicates on the same day: {df.shape}")

In [None]:
# Draw the per-encounter-per-day histograms for characters and tokens: a dashed
# black line marks the mean, dashed red lines mark the two token limits.
histograms = (
    ("nr_characters", "number of characters per encounter per day"),
    ("nr_tokens", "number of tokens per encounter per day"),
)
limit_markers = ((4096, "token limit 4096"), (16384, "token limit 16384"))

for column_name, axis_label in histograms:
    daily_totals = df.groupby(["enc_id", "date"])[column_name].sum()
    daily_totals.plot(kind="hist", bins=50)
    plt.xlabel(axis_label)
    plt.axvline(daily_totals.mean(), color="k", linestyle="dashed", linewidth=1)
    for token_limit, caption in limit_markers:
        plt.axvline(token_limit, color="r", linestyle="dashed", linewidth=1)
        plt.text(token_limit, -5, caption, rotation=90)
    plt.show()

In [None]:
# Show the processed table and the pseudonymised patient ids it contains.
display(df)
remaining_patients = df["pseudo_id"].unique()
print(remaining_patients)