# Use DEDUCE to pseudonymise text

In [None]:
from hashlib import sha256
from pathlib import Path

import deduce
import pandas as pd
from IPython.display import display
from tqdm.notebook import tqdm

# Register tqdm with pandas so that `progress_apply` (used in `apply_deduce`)
# is available and shows a progress bar.
tqdm.pandas()

In [None]:
def apply_deduce(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """De-identify the free text in one column of ``df`` with DEDUCE.

    Every value in ``df[col_name]`` is annotated with
    ``deduce.annotate_text(..., dates=False)`` and the annotated text is then
    passed through ``deduce.deidentify_annotations``. Progress is reported via
    ``progress_apply``, so ``tqdm.pandas()`` must have been called beforehand.

    Missing values (``None``, ``pd.NA`` and float ``NaN``) are passed through
    unchanged instead of being handed to DEDUCE, which works on text.

    Args:
        df: Frame that holds the text column. The column is overwritten in
            place, i.e. the caller's frame is modified.
        col_name: Name of the column to de-identify.

    Returns:
        The same ``df`` object, with ``col_name`` replaced by the
        de-identified text.
    """

    def _deidentify(text):
        # Only genuine missing-value markers are skipped. Any other value is
        # still sent to DEDUCE so that nothing bypasses de-identification.
        # `text != text` is the NaN test (NaN is the only float unequal to itself).
        if text is None or text is pd.NA or (isinstance(text, float) and text != text):
            return text
        annotated = deduce.annotate_text(text, dates=False)
        return deduce.deidentify_annotations(annotated)

    df[col_name] = df[col_name].progress_apply(_deidentify)
    return df

In [None]:
# Base folder: every input file in this notebook is read from, and every
# output file is written to, a path relative to this folder.
data_folder = Path(
    "/mapr/administratielast/administratielast_datamanager/ontslagdocumentatie/"
)

## Apply DEDUCE to Metavision data
The code below applies DEDUCE to the Metavision data. However, since the Metavision data is not completely loaded into the data platform, we need to work with a separate export; see [Apply DEDUCE to new Metavision data](#apply-deduce-to-new-metavision-data).

### Load data and apply DEDUCE

In [None]:
# Read the Metavision extraction; the patient identifier is kept as a string
# and the listed columns are parsed as dates.
metavision_data = pd.read_json(
    data_folder.joinpath("metavision_extraction.json"),
    dtype={"subject_Patient_value": str},
    convert_dates=["period_start", "period_end", "effectiveDateTime"],
)
# De-identify the free-text column and show the result.
metavision_data = apply_deduce(df=metavision_data, col_name="valueString")
display(metavision_data)

### Save pseudonymised data

In [None]:
# Write the distinct (pseudo_id, subject_Patient_value) pairs to CSV.
# NOTE(review): presumably this is the key table that links pseudonyms back to
# patient identifiers - confirm it is stored with appropriate access control.
(
    metavision_data.loc[:, ["pseudo_id", "subject_Patient_value"]]
    .drop_duplicates()
    .to_csv(data_folder.joinpath("pseudo_table.csv"), index=False)
)

In [None]:
# Export the data without the patient identifier column, once as CSV and once
# as Parquet.
metavision_data.drop("subject_Patient_value", axis="columns").to_csv(
    data_folder.joinpath("pseudonomised_metavision_data.csv"), index=False
)
metavision_data.drop("subject_Patient_value", axis="columns").to_parquet(
    data_folder.joinpath("pseudonomised_metavision_data.parquet")
)

## Apply DEDUCE to HiX data

### Load discharge letters and apply DEDUCE

In [None]:
# Read the HiX discharge documents; the patient identifier is kept as a string
# and the listed columns are parsed as dates.
discharge_data = pd.read_json(
    data_folder.joinpath("hix_discharge_docs.json"),
    dtype={"subject_Patient_value": str},
    convert_dates=["period_start", "period_end", "created"],
)
# De-identify the plain-text letter content and show the result.
discharge_data = apply_deduce(
    df=discharge_data, col_name="content_attachment1_plain_data"
)
display(discharge_data)

### Save pseudonymised data

In [None]:
# Write the distinct (pseudo_id, subject_Patient_value) pairs of the HiX
# discharge data to CSV.
(
    discharge_data.loc[:, ["pseudo_id", "subject_Patient_value"]]
    .drop_duplicates()
    .to_csv(data_folder.joinpath("HiX_pseudo_table.csv"), index=False)
)

In [None]:
# Export the discharge letters without the patient identifier column, once as
# CSV and once as Parquet.
discharge_data.drop("subject_Patient_value", axis="columns").to_csv(
    data_folder.joinpath("pseudonomised_HiX_discharge_data.csv"), index=False
)
discharge_data.drop("subject_Patient_value", axis="columns").to_parquet(
    data_folder.joinpath("pseudonomised_HiX_discharge_data.parquet")
)

### Load patient files and apply DEDUCE

In [None]:
# Read the HiX patient files; the patient identifier is kept as a string and
# the listed columns are parsed as dates.
hix_patient_files = pd.read_json(
    data_folder.joinpath("hix_patient_files.json"),
    dtype={"subject_Patient_value": str},
    convert_dates=["period_start", "period_end", "created", "authored"],
)
# De-identify the free-text answers and show the result.
hix_patient_files = apply_deduce(
    df=hix_patient_files, col_name="item_answer_value_valueString"
)
display(hix_patient_files)

In [None]:
# Export the patient files without the patient identifier column, once as CSV
# and once as Parquet.
hix_patient_files.drop("subject_Patient_value", axis="columns").to_csv(
    data_folder.joinpath("pseudonomised_HiX_patient_files.csv"),
    index=False,
)
hix_patient_files.drop("subject_Patient_value", axis="columns").to_parquet(
    data_folder.joinpath("pseudonomised_HiX_patient_files.parquet"),
)

## Apply DEDUCE to new Metavision data

### Load data and fix types

In [None]:
# Both exports are semicolon-separated CSV files.
# Admissions: the hospital number is read as a string; the two date columns
# are parsed as dates (column names are spelled as in the call below).
metavision_admissions = pd.read_csv(
    data_folder.joinpath("2024-01-09 MV6 admissions LMM.csv"),
    sep=";",
    dtype={"HospitalNumber": str},
    parse_dates=["AddmissionDate", "DischargeDate"],
)
# Free texts: the two timestamp columns are parsed as dates.
metavision_freetext = pd.read_csv(
    data_folder.joinpath("2024-01-09 MV6 freetexts LMM.csv"),
    parse_dates=["Time", "ValidationTime"],
    sep=";",
)

### Create pseudo_id and fix columns

In [None]:
# Derive pseudo_id as the SHA-256 hex digest of the hospital number with the
# suffix "aiva" appended, encoded as UTF-16LE.
# NOTE(review): presumably suffix and encoding have to match the way pseudo_id
# was produced for the other datasets - confirm before changing either.
metavision_admissions["pseudo_id"] = (
    metavision_admissions["HospitalNumber"]
    .astype(str)
    .map(lambda number: sha256((number + "aiva").encode("utf-16le")).hexdigest())
)

# Drop the unused column and give the hospital number the same column name as
# the patient identifier in the other datasets.
metavision_admissions = metavision_admissions.drop(columns=["LogicalUnitID"]).rename(
    columns={"HospitalNumber": "subject_Patient_value"}
)

In [None]:
# Remove the columns that are not carried over into the merged dataset.
metavision_freetext = metavision_freetext.drop(
    ["LogicalUnitID", "CategoryID"], axis="columns"
)

### Merge datasets and apply DEDUCE

In [None]:
# Join admissions and free texts on PatientID (pandas' default join type, i.e.
# an inner join), then discard the join key.
metavision_new = pd.merge(metavision_admissions, metavision_freetext, on="PatientID")
metavision_new = metavision_new.drop("PatientID", axis="columns")
# De-identify the free-text column.
metavision_new = apply_deduce(df=metavision_new, col_name="Value")

### Save pseudonymised data

In [None]:
# Write the distinct (pseudo_id, subject_Patient_value) pairs of the new
# Metavision export to CSV.
(
    metavision_new.loc[:, ["pseudo_id", "subject_Patient_value"]]
    .drop_duplicates()
    .to_csv(data_folder.joinpath("new_metavision_pseudo_table.csv"), index=False)
)

In [None]:
# Export the merged data without the patient identifier column, once as CSV
# and once as Parquet.
metavision_new.drop("subject_Patient_value", axis="columns").to_csv(
    data_folder.joinpath("pseudonomised_new_metavision_data.csv"),
    index=False,
)
metavision_new.drop("subject_Patient_value", axis="columns").to_parquet(
    data_folder.joinpath("pseudonomised_new_metavision_data.parquet"),
)