In [None]:
from pathlib import Path

import pandas as pd

%load_ext autoreload
%autoreload 2

In [None]:
# The data directory is a sibling of the current working directory
# (i.e. <parent of cwd>/data).
data_folder = Path.cwd().parent / "data"

In [None]:
# Load the pseudonymised MetaVision export from the raw data folder.
df_metavision_new = pd.read_parquet(
    data_folder / "raw" / "pseudonomised_new_metavision_data.parquet"
)

# Assign one enc_id per admission.
# The patient id is part of the grouping key so that two patients who happen to
# share identical admission/discharge timestamps do not receive the same enc_id.
# dropna=False keeps rows with a missing date in a numbered group; with the
# default (dropna=True) ngroup() labels those rows NaN, which also turns the
# whole column into floats.
df_metavision_new["enc_id"] = df_metavision_new.groupby(
    ["pseudo_id", "AddmissionDate", "DischargeDate"], dropna=False
).ngroup()

# Drop the columns that are not used further on and rename the remaining ones
# to the common schema (admissionDate, dischargeDate, department, time,
# description, value). pseudo_id and enc_id already carry their final names,
# so they need no entry in the rename map.
df_metavision_new = df_metavision_new.drop(
    columns=["ParameterID", "ValidationTime"]
).rename(
    columns={
        "AddmissionDate": "admissionDate",
        "DischargeDate": "dischargeDate",
        "Name": "department",
        "Time": "time",
        "Abbreviation": "description",
        "Value": "value",
    }
)
display(df_metavision_new)


In [None]:
# Read the pseudonymised HiX discharge data from the raw data folder and show it.
hix_discharge_path = data_folder / "raw" / "pseudonomised_HiX_discharge_data_2.parquet"
df_HiX_discharge = pd.read_parquet(hix_discharge_path)
display(df_HiX_discharge)

In [None]:
# Boolean mask selecting the rows whose specialty_Organization_value is NEO.
is_neo = df_HiX_discharge["specialty_Organization_value"] == "NEO"

df_HiX_discharge_NEO = (
    df_HiX_discharge[is_neo]
    # Per enc_id keep a single row: the last one after sorting on "created".
    .sort_values(by="created")
    .drop_duplicates(subset="enc_id", keep="last")
    # Remove the columns that are not needed downstream.
    .drop(columns=["status", "enc2_id", "location_Location_value", "docStatus"])
    # Rename to the common schema; pseudo_id, description and enc_id already
    # have their final names and are therefore left out of the mapping.
    .rename(
        columns={
            "period_start": "admissionDate",
            "period_end": "dischargeDate",
            "specialty_Organization_value": "department",
            "created": "time",
            "content_attachment1_plain_data": "value",
        }
    )
    .assign(
        # Replace the department value NEO with Neonatologie.
        department=lambda frame: frame["department"].replace("NEO", "Neonatologie"),
        # Replace the description Ontslagbericht with Volledig Ontslagbericht.
        description=lambda frame: frame["description"].replace(
            "Ontslagbericht", "Volledig Ontslagbericht"
        ),
        # Lower-case the pseudo ids.
        pseudo_id=lambda frame: frame["pseudo_id"].str.lower(),
    )
)

display(df_HiX_discharge_NEO)

In [None]:
# Inner-join the NEO discharge rows with the MetaVision rows on pseudo_id only:
# every discharge row is paired with every MetaVision row of the same pseudo_id.
# Admission and discharge dates are NOT part of the join key; columns that occur
# in both frames get pandas' default "_x" (HiX) / "_y" (MetaVision) suffixes.
merged_df = df_HiX_discharge_NEO.merge(
    df_metavision_new,
    how="inner",
    on="pseudo_id",
)

display(merged_df)

# NOTE: an alternative outer merge of df_metavision_new with df_HiX_discharge_NEO
# (optionally keyed on admissionDate/dischargeDate) was left disabled in this cell.

In [None]:
# Show every merged row that belongs to the same pseudo_id as the row labelled 74250.
example_pseudo_id = merged_df.loc[74250, "pseudo_id"]
display(merged_df[merged_df["pseudo_id"] == example_pseudo_id])

In [None]:
# Locations of the pseudo tables for the MetaVision and HiX exports.
pseudo_list_metavision = "/mapr/administratielast/administratielast_datamanager/ontslagdocumentatie/new_metavision_pseudo_table.csv"
pseudo_list_dp = "/mapr/administratielast/administratielast_datamanager/ontslagdocumentatie/HiX_patient_files_2pseudo_table.csv"

# Load both pseudo tables and show them one after the other.
df_pseudo_list_metavision = pd.read_csv(pseudo_list_metavision)
df_pseudo_list_dp = pd.read_csv(pseudo_list_dp)
display(df_pseudo_list_metavision, df_pseudo_list_dp)

# Keep only the rows whose subject_Patient_value occurs in both tables.
merges_pseudolist = df_pseudo_list_metavision.merge(
    df_pseudo_list_dp,
    how="inner",
    on="subject_Patient_value",
)

display(merges_pseudolist)