# Use DEDUCE to pseudonymise text

In [None]:
from pathlib import Path

import deduce
import pandas as pd
from IPython.display import display

In [None]:
def apply_deduce(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """De-identify the free text in one column of ``df`` with DEDUCE.

    Each non-missing value is annotated with ``deduce.annotate_text`` (called
    with ``dates=False``) and the result is passed through
    ``deduce.deidentify_annotations``. Missing values (``None``/``NaN``) are
    returned untouched instead of being handed to DEDUCE.

    Args:
        df: Frame holding the text column. It is modified in place.
        col_name: Name of the column whose values are de-identified.

    Returns:
        The same ``df`` object, with ``col_name`` overwritten.
    """

    def _deidentify(text):
        # A missing value is not a string and carries no text to scrub, so it
        # is skipped rather than passed to DEDUCE.
        if pd.isna(text):
            return text
        return deduce.deidentify_annotations(deduce.annotate_text(text, dates=False))

    df[col_name] = df[col_name].apply(_deidentify)
    return df

In [None]:
# Directory that every input JSON file below is read from and every output CSV
# is written to.
data_folder = Path(
    "/mapr/administratielast/administratielast_datamanager/ontslagdocumentatie/"
)

## Apply DEDUCE to Metavision data

In [None]:
# Read the Metavision extraction and de-identify its "valueString" column,
# then show the resulting frame in the notebook.
metavision_data = apply_deduce(
    pd.read_json(data_folder / "metavision_extraction.json"),
    "valueString",
)
display(metavision_data)

In [None]:
# Export only the "pseudo_id" and "subject_Patient_value" columns, without the
# row index (presumably the pseudonym-to-patient lookup table; confirm).
metavision_data.loc[:, ["pseudo_id", "subject_Patient_value"]].to_csv(
    data_folder / "pseudo_table.csv", index=False
)

In [None]:
# Export the Metavision frame with the "subject_Patient_value" column removed,
# without the row index.
metavision_data.drop(labels="subject_Patient_value", axis="columns").to_csv(
    data_folder / "pseudonomised_metavision_data.csv", index=False
)

## Apply DEDUCE to HiX data

In [None]:
# Read the HiX discharge documents and de-identify the plain-text attachment
# column, then show the resulting frame in the notebook.
discharge_data = apply_deduce(
    df=pd.read_json(data_folder / "hix_discharge_docs.json"),
    col_name="content_attachment1_plain_data",
)
display(discharge_data)

In [None]:
# Export only the "pseudo_id" and "subject_Patient_value" columns of the
# discharge data, without the row index (presumably the lookup table; confirm).
discharge_data.loc[:, ["pseudo_id", "subject_Patient_value"]].to_csv(
    data_folder / "HiX_pseudo_table.csv", index=False
)

In [None]:
# Export the discharge frame with the "subject_Patient_value" column removed,
# without the row index.
discharge_data.drop(labels="subject_Patient_value", axis="columns").to_csv(
    data_folder / "pseudonomised_HiX_discharge_data.csv", index=False
)

In [None]:
# Read the HiX patient files and de-identify the answer-text column, then show
# the resulting frame in the notebook.
hix_patient_files = apply_deduce(
    pd.read_json(data_folder / "hix_patient_files.json"),
    "item_answer_value_valueString",
)
display(hix_patient_files)

In [None]:
# Write the de-identified patient-file records to CSV without the row index.
# NOTE(review): unlike the Metavision and discharge exports, no
# "subject_Patient_value" column is dropped here; confirm this frame does not
# contain it.
hix_patient_files.to_csv(
    path_or_buf=data_folder / "pseudonomised_HiX_patient_files.csv",
    index=False,
)