In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from tqdm import tqdm
import math

# 1. Cohort selection


Selecting all ICU-stays that had Pulmonary Embolism (PE) as their main (APACHE) admission diagnosis:

In [3]:
# Load the eICU patient table from the raw-data folder; low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
df_pat_big = pd.read_csv("../../eICU_data/patient.csv", low_memory=False)

In [4]:
# List every distinct APACHE admission diagnosis label that contains "Embolus"
# (value_counts() drops missing values, so only real labels are scanned).
[i for i in df_pat_big.apacheadmissiondx.value_counts().index.to_numpy() if "Embolus" in i]

['Embolus, pulmonary']

In [5]:
# Keep only the stays whose APACHE admission diagnosis is exactly "Embolus, pulmonary";
# .copy() detaches the cohort frame from the full patient table.
df_pat_big_pe = df_pat_big[df_pat_big.apacheadmissiondx == "Embolus, pulmonary"].copy()

In [6]:
# patientunitstayids of the PE cohort, used below to reduce the eICU tables.
lst_pat = df_pat_big_pe.patientunitstayid.to_list()

# 2. Reducing the file-size


## 2.1 Function(s)

In [7]:
def make_reduced_files(import_path, export_path,  notation, lst_ids):
    """
    Write cohort-only copies of the eICU tables.

    Reads every eICU table that carries a "patientunitstayid" column (i.e. all tables except the
    hospital table) in chunks, keeps only the rows of the target population and saves the result as
    "<export_path>/<table>_<notation>.csv".

    :param import_path: String - Folder where all the eICU data is stored

    :param export_path: String - Folder the reduced files are written to (folder only, no filename). A
    trailing "/" is optional and the folder is created if it does not exist yet

    :param notation: String - short notation that will be added to each file-end for future identification

    :param lst_ids: List - list of target population patientunitstayids

    :return: None - the reduced tables are saved to the specified export_path
    """
    import os

    # all tables except the hospital as that is not connected to patientunitstayid
    lst_tables = ["admissionDrug", "admissionDx", "allergy", "apacheApsVar", "apachePatientResult", "apachePredVar",
                  "carePlanCareProvider", "carePlanEOL", "carePlanGeneral", "carePlanGoal", "carePlanInfectiousDisease", "customLab",
                  "diagnosis", "infusionDrug", "intakeOutput", "lab", "medication", "microLab", "note",
                  "nurseAssessment", "nurseCare", "nurseCharting", "pastHistory", "patient", "physicalExam",
                  "respiratoryCare", "respiratoryCharting", "treatment", "vitalAperiodic", "vitalPeriodic"]

    # create the target folder up front so a missing folder does not abort the run
    # only after the first (potentially very large) table has already been read
    if export_path:
        os.makedirs(export_path, exist_ok=True)

    for table in tqdm(lst_tables):
        source_file = os.path.join(import_path, "{}.csv".format(table))
        target_file = os.path.join(export_path, "{}_{}.csv".format(table, notation))

        # chunked reading keeps the memory footprint small for the big tables
        reader = pd.read_csv(source_file, chunksize=100000, low_memory=False)
        try:
            lst_dataframes = [chunk[chunk["patientunitstayid"].isin(lst_ids)] for chunk in reader]
        finally:
            # release the file handle even if a chunk could not be processed
            reader.close()

        if lst_dataframes:
            df_small = pd.concat(lst_dataframes)
        else:
            # no chunk at all (table without data rows): still export the header
            df_small = pd.read_csv(source_file, nrows=0)

        df_small.to_csv(target_file, index=False)

## 2.2. Implementation

In [8]:
# Write PE-cohort copies of all patientunitstayid-linked eICU tables into PE_data/
# (each file is named "<table>_PE.csv").
make_reduced_files(
    import_path = "../../eICU_data/",
    export_path = "PE_data/",
    notation = "PE",
    lst_ids = lst_pat
)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [08:00<00:00, 16.02s/it]


# 3. Data extraction


## 3.1 Functions

In [9]:
def get_basic_patient_info(df_pat, lst_ids):
    """
    Extract cleaned demographic and outcome information for the target ICU stays.

    Cleaning steps: gender values "Unknown"/"Other" become missing, the age label "> 89" is
    mapped to 90 and the column is made numeric, implausible weights (<= 20 or >= 300) and heights
    (< 100 or > 210) become missing, and a BMI column is derived from admission weight and height
    (BMI values above 100 or below 12 become missing).

    :param df_pat: Dataframe - Patient dataframe of eICU (or abbreviated)

    :param lst_ids: List - list of populations patientunitstayids

    :return: Dataframe with the columns patientunitstayid, gender, age, ethnicity, BMI,
    hospitaldischargestatus and unitdischargestatus
    """

    needed_columns = ["patientunitstayid", "uniquepid", "gender", "age", "ethnicity", "hospitalid", "wardid",
                      "unittype", "apacheadmissiondx", "admissionheight", "admissionweight", "hospitaldischargestatus",
                      'hospitaladmittime24', "hospitaladmitsource", "hospitaldischargelocation", 'unitadmittime24',
                      'unitadmitsource', 'unitstaytype', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus', "unitvisitnumber"]

    output_columns = ["patientunitstayid", "gender", "age", "ethnicity", "BMI", "hospitaldischargestatus", "unitdischargestatus"]

    # work on a private copy that only holds the cohort rows
    cohort_rows = df_pat["patientunitstayid"].isin(lst_ids)
    df_info = df_pat.loc[cohort_rows, needed_columns].copy()

    # gender: anything but the two main categories is treated as missing
    df_info["gender"] = df_info["gender"].replace({"Unknown": np.nan, "Other": np.nan})

    # age: the top-coded label "> 89" is mapped to 90 before the numeric conversion
    df_info["age"] = pd.to_numeric(df_info["age"].replace("> 89", "90"))

    # weights outside the open interval (20, 300) are discarded
    for weight_clm in ("admissionweight", "dischargeweight"):
        weight = df_info[weight_clm]
        df_info.loc[(weight <= 20) | (weight >= 300), [weight_clm]] = np.nan

    # heights outside [100, 210] are discarded
    height = df_info["admissionheight"]
    df_info.loc[(height < 100) | (height > 210), ["admissionheight"]] = np.nan

    # BMI = weight / (height / 100)^2, implausible results are discarded
    height_scaled = df_info["admissionheight"] / 100
    df_info["BMI"] = df_info["admissionweight"] / height_scaled ** 2
    bmi = df_info["BMI"]
    df_info.loc[(bmi > 100) | (bmi < 12), "BMI"] = np.nan

    return df_info[output_columns].copy()

In [10]:
def group_pmh_subcat(x, df, clm_name):
    """
    Join all distinct entries of one column for a single ICU stay.

    :param x: patientunitstayid whose rows are collected

    :param df: Dataframe - needs a "patientunitstayid" column and the column named by clm_name

    :param clm_name: String - column holding the text entries

    :return: String - the distinct entries in order of first appearance, separated by "|"
    (empty string if the stay has no rows)
    """
    belongs_to_stay = df["patientunitstayid"] == x
    distinct_entries = df.loc[belongs_to_stay, clm_name].unique()
    return "|".join(distinct_entries.tolist())

def split_and_rejoin_for_output(str_pmh):
    """
    Strip the first six "/"-separated segments from a past-history path.

    :param str_pmh: String - full path, segments separated by "/"

    :return: String - the remaining segments re-joined with "/", or None if the path has
    six segments or fewer
    """
    remaining_segments = str_pmh.split("/")[6:]

    # nothing left after the prefix -> no return value, exactly like an empty result
    if not remaining_segments:
        return None

    return "/".join(remaining_segments)

def get_pastHistory(df_pmh, lst_ids):
    """
    Build one column per past-medical-history (PMH) category for the target ICU stays.

    Receives the pastHistory dataframe from eICU (or abbreviated) and a list of target patientunitstayids.
    For every category defined below, all rows of a stay whose "pasthistorypath" contains both the
    organ-system prefix and the category phrase are collected; the path segments behind the category are
    kept as the detail text.

    The path texts are normalised on an internal copy before matching ("Hematology/Oncology" ->
    "Hematology-Oncology", "s/p" -> "s_p", "TIA(s)" -> "TIAs", "HIV (only)" -> "HIV only",
    "Recent Steroid Use (for > 10 days)" -> "Recent Steroid Use for > 10 days") so that the extra "/" and
    the parentheses do not disturb the segment split and the phrase matching.

    :param df_pmh: Dataframe - pastHistory dataframe from eICU, needs the columns "patientunitstayid",
    "pasthistorypath" and "pasthistoryvalue". The passed dataframe is not modified

    :param lst_ids: List - list of patientunitstayids from the target population

    :return: Dataframe with "patientunitstayid" as a regular column followed by the selected "pmh_*"
    columns. A cell holds the distinct detail texts of the stay joined by "|", or 0 if nothing was found
    for that category

    """

    # column name -> [organ-system path prefix, category phrase]; both are matched as plain substrings.
    # NOTE(review): because of the substring match a phrase also hits longer phrases that contain it
    # (e.g. 'Insulin Dependent Diabetes' is part of 'Non-Insulin Dependent Diabetes', 'Cancer' is part of
    # 'Cancer Therapy') - confirm that this overlap is intended.
    dict_subcat_clms = {
        'pmh_HT_with_treatment': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Hypertension Requiring Treatment'],
        'pmh_cancer': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Cancer'],
        'pmh_non_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Non-Insulin Dependent Diabetes'],
        'pmh_COPD': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'COPD'],
        'pmh_CHF': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Congestive Heart Failure'],
        'pmh_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Insulin Dependent Diabetes'],
        'pmh_arrhythmias': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Arrhythmias'],
        'pmh_hypothyroidism': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hypothyroidism'],
        'pmh_MI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Myocardial Infarction'],
        'pmh_strokes': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Strokes'],
        'pmh_renal_insuff': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Insufficiency'],
        'pmh_PCI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Procedural Coronary Intervention'],
        'pmh_card_valvular': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Valve disease'],
        'pmh_asthma': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Asthma'],
        'pmh_liver_cirrhosis': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Cirrhosis'],
        'pmh_renal_failure': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Failure'],
        'pmh_CA_bypass': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Coronary Artery Bypass'],
        'pmh_seizures': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Seizures'],
        'pmh_periph_vasc_disease': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Peripheral Vascular Disease'],
        'pmh_home_o2': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Home Oxygen'],
        'pmh_venous_thrombosis': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Venous Thrombosis'],
        'pmh_dementia': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Dementia'],
        'pmh_pacemaker': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Pacemaker'],
        'pmh_cancer_therapy': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Cancer Therapy'],
        'pmh_angina': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Angina'],
        'pmh_peptic_ulcer_disease': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Peptic Ulcer Disease'],
        'pmh_TIAs': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'TIAs'],
        'pmh_PFTs': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Pulmonary Function Tests'],
        'pmh_resp_failure': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Respiratory Failure'],
        'pmh_AICD': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'AICD'],
        'pmh_PE': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Pulmonary Embolism'],
        'pmh_RA': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Rheumatoid Arthritis'],
        'pmh_mmunosuppression_last_6m': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'Immunosuppression within past 6 months'],
        'pmh_chronic_kidney_stones': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Chronic Stone Disease'],
        'pmh_neuromusk_disease': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Neuromuscular Disease'],
        'pmh_restrictive_lung_disease': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Restrictive Disease'],
        'pmh_s_p_NTx': ['notes/Progress Notes/Past History/Organ Systems/Renal', 's_p Renal Transplant'],
        'pmh_HIV_only': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'HIV only'],
        'pmh_hemolytic _anemia': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Hemolytic Anemia'],
        'pmh_SLE': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'SLE'],
        'pmh_exercise_tolerance': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Exercise Tolerance'],
        'pmh_intracranial_mass': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Intracranial Mass'],
        'pmh_hyperthyroidism': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hyperthyroidism'],
        'pmh_recent_steroids_>10d': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Recent Steroid Use for > 10 days'],
        'pmh__petite_mal_seizures': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Seizures_petite mal seizures'],
        'pmh_s_p_LTx': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 's_p Liver Transplant'],
        'pmh_hypercoagulable_condition': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Hypercoagulable Condition'],
        'pmh_ITP': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'ITP'],
        'pmh_neurogenic_bladder': ['notes/Progress Notes/Past History/Organ Systems/Renal ', 'Neurogenic Bladder'],
        'pmh_sickle_cells': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Sickle Cell Disease'],
        'pmh_clotting_disorder': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Clotting Disorder'],
        'pmh_AIDS': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'AIDS'],
        'pmh_sarcoidosis': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Sarcoidosis'],
        'pmh_vasculitis': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Vasculitis'],
        'pmh_myeloproliferative_disease': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Myeloproliferative Disease'],
        'pmh_s_p_HTx': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 's_p Heart Transplant'],
        'pmh_aplastic_anemia': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Aplastic Anemia'],
        'pmh_hypercalcemia': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hypercalcemia'],
        'pmh_hypersplenism': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Hypersplenism'],
        'pmh_scleroderma': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Scleroderma'],
        'pmh_RTA': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'RTA'],
        'pmh_s_p_lungTx': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 's_p Lung Transplant'],
        "pmh_cushings": ["notes/Progress Notes/Past History/Organ Systems/Endocrine", "Cushing's Syndrome"],
        'pmh_dermato': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Dermato']
    }

    # number of leading path segments that are dropped from the output text
    # (assumes paths look like notes/Progress Notes/Past History/Organ Systems/<system>/<category>/<detail...>)
    n_prefix_segments = 6

    # private working copy restricted to the cohort - the caller's dataframe stays untouched
    df_temp = df_pmh.loc[df_pmh["patientunitstayid"].isin(lst_ids),
                         ["patientunitstayid", "pasthistorypath", "pasthistoryvalue"]].copy()

    # the texts are replaced literally (regex=False): with a regular expression the parentheses in
    # e.g. "TIA(s)" would be read as a group and the literal text would never be found
    lst_replacements = [
        ("Hematology/Oncology", "Hematology-Oncology"),
        ("s/p", "s_p"),
        ("TIA(s)", "TIAs"),
        ("HIV (only)", "HIV only"),
        ("Recent Steroid Use (for > 10 days)", "Recent Steroid Use for > 10 days"),
    ]
    for raw_text, clean_text in lst_replacements:
        df_temp["pasthistorypath"] = df_temp["pasthistorypath"].str.replace(raw_text, clean_text, regex=False)

    lst_columns = []

    # for the individual (upper diseases)
    for clm_name, key_phrases in tqdm(dict_subcat_clms.items()):
        key1, key2 = key_phrases

        paths = df_temp["pasthistorypath"]
        is_hit = (paths.str.contains(key1, na=False, regex=False) &
                  paths.str.contains(key2, na=False, regex=False))
        df_hits = df_temp.loc[is_hit, ["patientunitstayid", "pasthistorypath"]].drop_duplicates()

        # stay id -> distinct detail texts, both in order of first appearance
        dict_details = {}
        for stay_id, path in zip(df_hits["patientunitstayid"], df_hits["pasthistorypath"]):
            segments = path.split("/")
            # a path without anything behind the category falls back to its last segment
            detail = "/".join(segments[n_prefix_segments:]) or segments[-1]
            lst_details = dict_details.setdefault(stay_id, [])
            if detail not in lst_details:
                lst_details.append(detail)

        lst_pat_no_data = [x for x in lst_ids if x not in dict_details]

        # stays with data first, then the remaining stays marked with 0
        df_clm_final = pd.DataFrame(
            {clm_name: ["|".join(lst_details) for lst_details in dict_details.values()] + [0] * len(lst_pat_no_data)},
            index=pd.Index(list(dict_details) + lst_pat_no_data, name="patientunitstayid"))

        lst_columns.append(df_clm_final)

    df_final = pd.concat(lst_columns, axis=1)

    df_final = df_final.rename_axis("patientunitstayid").reset_index()

    df_final = df_final[['patientunitstayid', 'pmh_HT_with_treatment', 'pmh_cancer',
       'pmh_non_insulin_dep_DM', 'pmh_COPD', 'pmh_CHF', 'pmh_insulin_dep_DM',
       'pmh_arrhythmias', 'pmh_MI', 'pmh_strokes', "pmh_hypothyroidism", 
       'pmh_renal_insuff', 'pmh_PCI', 'pmh_card_valvular', 'pmh_asthma',
       'pmh_liver_cirrhosis', 'pmh_renal_failure', 'pmh_CA_bypass',
       'pmh_seizures', 'pmh_periph_vasc_disease', 'pmh_home_o2',
       'pmh_venous_thrombosis', 'pmh_dementia', 'pmh_pacemaker',
       'pmh_cancer_therapy', 'pmh_angina', 
       'pmh_TIAs', 'pmh_resp_failure', 'pmh_AICD', 'pmh_PE',
       'pmh_neuromusk_disease', 'pmh_restrictive_lung_disease', 'pmh_hemolytic _anemia', 'pmh_intracranial_mass',
       'pmh_hyperthyroidism', 'pmh__petite_mal_seizures', 'pmh_hypercoagulable_condition', 'pmh_ITP', 
       'pmh_sickle_cells', 'pmh_clotting_disorder','pmh_aplastic_anemia', 'pmh_s_p_lungTx']]

    return df_final

In [11]:
def get_and_initial_clean_apacheApsVar(df_apsVar, lst_ids):
    """
    Receives the apacheApsVar Dataframe of the eICU database or abbreviated and a list of target patientunistayids and
    returns a dataframe with initially "cleaned" data: The -1 values in the columns eyes, motor, verbal, meds,
    temperature, respiratoryrate, heartrate and meanbp, which denote "no data was entered", are set to np.nan.
    Additionally, a GCS column (eyes + motor + verbal) is added. Target stays without a row in df_apsVar are
    appended with np.nan in all value columns.

    :param df_apsVar: Dataframe - apacheApsVar Dataframe from eICU or abreviated

    :param lst_ids: List - list of target patientunitstayids

    :return: Dataframe with the column "patientunitstayid" and the value columns prefixed with "aps_"
    (aps_dialysis, aps_eyes, aps_motor, aps_verbal, aps_meds, aps_temperature, aps_respiratoryrate,
    aps_heartrate, aps_meanbp, aps_GCS)

    """

    id_clm = "patientunitstayid"

    # columns in which -1 stands for "not entered"
    lst_clms = ['eyes', 'motor', 'verbal', 'meds', 'temperature',
                'respiratoryrate',  'heartrate', 'meanbp']

    df_temp = df_apsVar.loc[df_apsVar[id_clm].isin(lst_ids), [id_clm, 'dialysis'] + lst_clms].copy()

    # mask() returns a new (if necessary float) column; writing np.nan into an integer column
    # through .loc relies on silent upcasting, which pandas has deprecated
    for clm in lst_clms:
        df_temp[clm] = df_temp[clm].mask(df_temp[clm] == -1)

    df_temp["GCS"] = df_temp["eyes"] + df_temp["motor"] + df_temp["verbal"]

    # set lookup instead of scanning a list for every id
    set_pat_with_data = set(df_temp[id_clm].unique().tolist())
    lst_pat_without_data = [x for x in lst_ids if x not in set_pat_with_data]

    if lst_pat_without_data:
        # only the id column is given; concat fills all other columns of these rows with np.nan
        df_pat_without_data = pd.DataFrame({id_clm: lst_pat_without_data})
        df_final = pd.concat([df_temp, df_pat_without_data])
    else:
        df_final = df_temp

    clms = ['dialysis'] + lst_clms + ["GCS"]

    df_final = df_final[[id_clm] + clms].rename(columns={clm: "aps_{}".format(clm) for clm in clms})

    return df_final

In [12]:
def get_cleaned_apachePredVar_basics(df_predvar, lst_ids):
    """
    Put in the apachePredVar Dataframe (or abbreviated) and a list of patientunitstayids and get the selected
    prediction variables for those unitstays. Target stays without a row in df_predvar are appended with np.nan
    in all value columns.

    :param df_predvar: Dataframe - apachePredVar Dataframe or abbreviated

    :param lst_ids: List - List of patientunitstayids of the target population

    :return: Dataframe with the column "patientunitstayid" and the value columns prefixed with "pred_"
    (pred_thrombolytics, pred_hepaticfailure, pred_lymphoma, pred_metastaticcancer, pred_leukemia,
    pred_midur, pred_oobintubday1)
    """

    id_clm = "patientunitstayid"
    clms = ['thrombolytics','hepaticfailure', 'lymphoma', 'metastaticcancer', 'leukemia', 'midur', 'oobintubday1']

    df_temp = df_predvar.loc[df_predvar[id_clm].isin(lst_ids), [id_clm] + clms].copy()

    # set lookup instead of scanning a list for every id
    set_pat_with_data = set(df_temp[id_clm].unique().tolist())
    lst_pat_without_data = [x for x in lst_ids if x not in set_pat_with_data]

    if lst_pat_without_data:
        # only the id column is given; concat fills all other columns of these rows with np.nan
        df_pat_without_data = pd.DataFrame({id_clm: lst_pat_without_data})
        df_final = pd.concat([df_temp, df_pat_without_data])
    else:
        df_final = df_temp

    df_final = df_final.rename(columns={clm: "pred_{}".format(clm) for clm in clms})

    return df_final

In [13]:
def get_cleaned_apachePatientResult_basics(df_apacheresult, lst_ids):
    """
    Put in the apachePatientResult Dataframe (or abbreviated) and a list of patientunitstayids and get an initially
    cleaned Dataframe with the APACHE IVa results for those unitstays. Only rows with apacheversion "IVa" are used.
    The -1 values in acutephysiologyscore, apachescore, predictedicumortality and predictedhospitalmortality are set
    to np.nan. Target stays without an "IVa" row are appended with np.nan in all value columns.

    :param df_apacheresult: Dataframe - apachePatientResult Dataframe or abbreviated

    :param lst_ids: List - List of patientunitstayids of the target population

    :return: Dataframe with the columns patientunitstayid, acutephysiologyscore, apachescore, actualiculos,
    predictedhospitalmortality, predictedicumortality and unabridgedhosplos
    """

    id_clm = "patientunitstayid"
    lst_result_clms = ['acutephysiologyscore', 'apachescore','actualiculos', 'predictedhospitalmortality',"predictedicumortality", 'unabridgedhosplos']

    is_target_row = (df_apacheresult["apacheversion"] == "IVa") & (df_apacheresult[id_clm].isin(lst_ids))
    df_temp = df_apacheresult.loc[is_target_row, [id_clm] + lst_result_clms].copy()

    # columns in which -1 stands for "not available"
    lst_minus_ones = ['acutephysiologyscore', 'apachescore',"predictedicumortality", 'predictedhospitalmortality']

    # mask() returns a new (if necessary float) column; writing np.nan into an integer column
    # through .loc relies on silent upcasting, which pandas has deprecated
    for clm in lst_minus_ones:
        df_temp[clm] = df_temp[clm].mask(df_temp[clm] == -1)

    # set lookup instead of scanning a list for every id
    set_pat_with_data = set(df_temp[id_clm].unique().tolist())
    lst_pat_without_data = [x for x in lst_ids if x not in set_pat_with_data]

    if lst_pat_without_data:
        # only the id column is given; concat fills all other columns of these rows with np.nan
        df_pat_without_data = pd.DataFrame({id_clm: lst_pat_without_data})
        df_final = pd.concat([df_temp, df_pat_without_data])
    else:
        df_final = df_temp

    return df_final


In [14]:

def get_infusion_drugs_prototype(df_infusion, lst_ids, additional_dict=None, timeframe=(0, 1440)):
    """
    Get whether patients received a certain medication class in the specified timeframe. Currently includes the following:

    "infusion_anticoag": "heparin|argatroban|angiomax|hepain",
    "infusion_vaso_ino": 'epinephrine|adrenaline|norepinephrine|levophed|dobutamine|dobutrex|vasopressin|isoprotenerol|isuprel|phenylephrine|neo-synephrine|dopamine|milrinone',
    "infusion_thrombolytic": "alteplase|activase|tpa|altaplase|altepase"

    The drug names are compared in lower case; every pattern is a regular expression in which "|" separates
    the alternative spellings. Rows without a drug name never count as a match.

    If additional meds/categories should be added, the "additional_dict" parameter can be used for that (an
    entry with an already existing column name replaces the preset pattern).

    :param df_infusion: Dataframe - infusionDrug Dataframe from eICU

    :param lst_ids: List - list of patientunitstayids

    :param additional_dict: Dict - additional/custom dict with the "clm_name": "drugname|drugname|drugname" format

    :param timeframe: Tuple, preset (0, 1440) - (lower offset, upper offset), offset bounds (both inclusive)

    :return: Dataframe indexed by patientunitstayid with one 0/1 column per medication class

    """

    lower_offset, upper_offset = timeframe

    in_scope = (df_infusion["patientunitstayid"].isin(lst_ids) &
                (df_infusion["infusionoffset"] >= lower_offset) &
                (df_infusion["infusionoffset"] <= upper_offset))
    df_reduced = df_infusion.loc[in_scope, ["patientunitstayid", "drugname"]].copy()

    df_reduced["drugname"] = df_reduced["drugname"].str.lower()

    dict_drugs = {
        "infusion_anticoag": "heparin|argatroban|angiomax|hepain",
        "infusion_vaso_ino": 'epinephrine|adrenaline|norepinephrine|levophed|dobutamine|dobutrex|vasopressin|isoprotenerol|isuprel|phenylephrine|neo-synephrine|dopamine|milrinone',
        "infusion_thrombolytic": "alteplase|activase|tpa|altaplase|altepase"}

    if additional_dict is not None:
        dict_drugs.update(additional_dict)

    lst_columns = []

    for clm_name, drug_str in dict_drugs.items():

        # na=False: a missing drug name would otherwise yield a NaN in the mask and break the row selection
        got_drug = df_reduced["drugname"].str.contains(drug_str, na=False)
        lst_pat_drug = df_reduced.loc[got_drug, "patientunitstayid"].unique().tolist()

        # set lookup instead of scanning a list for every id
        set_pat_drug = set(lst_pat_drug)
        lst_pat_wo_drug = [i for i in lst_ids if i not in set_pat_drug]

        # stays with the drug first (1), then all remaining stays (0)
        df_clm_final = pd.DataFrame(
            {clm_name: [1] * len(lst_pat_drug) + [0] * len(lst_pat_wo_drug)},
            index=pd.Index(lst_pat_drug + lst_pat_wo_drug, name="patientunitstayid"))

        lst_columns.append(df_clm_final)

    df_final = pd.concat(lst_columns, axis=1).rename_axis("patientunitstayid")

    return df_final

In [15]:
def _summarise_lab_values(lst_labvalues, lab_bounds, lab_realistic):
    """Reduce the lab values of one stay to a single cleaned value (np.nan if nothing usable is left)."""
    lower_bound, upper_bound = lab_bounds
    lower_realistic, upper_realistic = lab_realistic

    lst_labvalues_bounds = [x for x in lst_labvalues if lower_bound <= x <= upper_bound]
    length = len(lst_labvalues_bounds)

    if length == 0:
        return np.nan

    if length >= 6:
        labvalues_array = np.array(lst_labvalues_bounds)

        # to detect outliers for the patients own baseline: keep values within median +/- 2 IQR
        median_labs = np.median(labvalues_array)
        q25, q75 = np.percentile(labvalues_array, [25, 75])
        iqr = q75 - q25
        lower_outlier_bound = median_labs - iqr * 2
        upper_outlier_bound = median_labs + iqr * 2

        is_inlier = (labvalues_array >= lower_outlier_bound) & (labvalues_array <= upper_outlier_bound)
        return np.median(labvalues_array[is_inlier])

    # 1 to 5 possible values: too few for a per-patient outlier rule, use the realistic range instead
    lst_labvalues_realistic = [x for x in lst_labvalues if lower_realistic <= x <= upper_realistic]

    if length == 1:
        return lst_labvalues_realistic[0] if len(lst_labvalues_realistic) == 1 else np.nan

    if len(lst_labvalues_realistic) == 0:
        return np.nan
    if len(lst_labvalues_realistic) == 1:
        return lst_labvalues_realistic[0]
    return np.median(np.array(lst_labvalues_realistic))


def clean_median_labs(df_labs, dict_stayids_offsets, offset_relative, lab_name, lab_bounds, lab_realistic):
    """
    Get one cleaned (median) value of a lab parameter per ICU stay for a time window around an individual event.

    For every stay the results of lab_name inside the window [event offset + earlier offset,
    event offset + later offset] (both ends inclusive) are collected. Values outside lab_bounds are not counted.
    Depending on how many values remain:
    - 0 values: np.nan
    - 1 value: the value if exactly one value of the window lies within lab_realistic, otherwise np.nan
    - 2 to 5 values: median of the values within lab_realistic (np.nan if there are none)
    - 6 or more values: median after discarding values outside median +/- 2 IQR of the stay's own values

    :param df_labs: Dataframe -  dataframe containing the labs (labs dataframe from eICU or abbreviated)

    :param dict_stayids_offsets: Dict - dictionary with the unique patientunitstayids of the population as keys and the
    individual offsets of the events as values

    :param offset_relative: Tuple, (earlier offset, later offset) - target time period relative to the individual
    offsets in dict_stayids_offsets

    :param lab_name: String - name of the lab value (contained in the "labname" column of df_labs)

    :param lab_bounds: Tuple, (lower bound, upper bound) - range for possible values (eg. for AST 0-1000000)

    :param lab_realistic: Tuple, (lower bound, upper bound) - range for realistic values (eg. for AST 1-10000)

    :return: Dataframe with patientunitstayid as index and one column "labvalues"

    """

    relative_lower_offset, relative_upper_offset = offset_relative

    # reduce the initial dataframe once (target stays and target lab only) instead of once per stay
    lst_ids = list(dict_stayids_offsets.keys())
    is_relevant = df_labs["patientunitstayid"].isin(lst_ids) & (df_labs["labname"] == lab_name)
    df_lab = df_labs.loc[is_relevant, ["patientunitstayid", "labresultoffset", "labresult"]]

    # stay id -> rows of that stay, so every stay is looked up directly
    dict_rows_per_stay = {stay_id: df_stay for stay_id, df_stay in df_lab.groupby("patientunitstayid")}

    final_dict = {}

    for stay_id, offset in dict_stayids_offsets.items():

        lower_offset = offset + relative_lower_offset
        upper_offset = offset + relative_upper_offset

        df_stay = dict_rows_per_stay.get(stay_id)

        if df_stay is None:
            lst_labvalues = []
        else:
            in_window = (df_stay["labresultoffset"] >= lower_offset) & (df_stay["labresultoffset"] <= upper_offset)
            lst_labvalues = df_stay.loc[in_window, "labresult"].tolist()

        final_dict[stay_id] = _summarise_lab_values(lst_labvalues, lab_bounds, lab_realistic)

    df_final = pd.DataFrame(final_dict.items(), columns=["patientunitstayid", "labvalues"])
    df_final.set_index("patientunitstayid", inplace=True)

    return df_final

In [16]:
def clean_data(lst_values):
    """
    Remove outliers from a list of values: everything above median + 2 IQR or below
    median - 2 IQR of the passed values is discarded.

    :param lst_values: List - list of values (eg. mean BPs from a certain timeframe of a patient)

    :return: Array - numpy array with the remaining values; an array holding a single np.nan
    if the passed list is empty

    """
    values = np.array(lst_values)

    # nothing to clean: signal "no data" with a single nan
    if values.size == 0:
        return np.array([np.nan])

    # limits are derived from the passed values themselves (the patients own baseline)
    centre = np.median(values)
    first_quartile, third_quartile = np.percentile(values, [25, 75])
    margin = (third_quartile - first_quartile) * 2
    lowest_allowed = centre - margin
    highest_allowed = centre + margin

    return np.array([value for value in values.tolist() if lowest_allowed <= value <= highest_allowed])


def aggregated_vitals_function_for_map(x, df_reference, vital_name, offset, agg_timeunit, agg_total, timeunit=60):
    """
    Function for mapping (a lot faster) the values of patientunitstayid (x) in a dataframe to the resulting vital value. See the "parent
    function" get_cleaned_periodic_vitals_vectorized for more info

    If the window described by offset is longer than one timeunit, the values are first cleaned (clean_data) and
    aggregated per timeunit-long sub-window with agg_timeunit, and the sub-window results (windows without data
    are ignored) are then aggregated with agg_total. Otherwise all values of the stay in df_reference are cleaned
    and aggregated directly with agg_total.

    :param x: patientunitstayid passed by the mapping
    :param df_reference: Dataframe - needs the columns "patientunitstayid", vital_name and (for windows longer
    than one timeunit) "observationoffset"
    :param vital_name: String - name of the column holding the vital sign
    :param offset: Tuple - (lower offset, upper offset) of the target window
    :param agg_timeunit: String, picklist - "median", "max" or "min"; aggregation within one sub-window
    :param agg_total: String, picklist - "median", "max" or "min"; aggregation of the final value
    :param timeunit: int - length of one sub-window, preset 60
    :return: float - aggregated vital value, np.nan if the stay has no usable values
    :raises ValueError: if agg_total (or, for windows longer than one timeunit, agg_timeunit) is not one of
    "median", "max", "min"
    """
    dict_aggregators = {"median": np.median, "max": np.max, "min": np.min}

    if agg_total not in dict_aggregators:
        raise ValueError("agg_total must be one of 'median', 'max', 'min', got {!r}".format(agg_total))

    # open the tuples, calculate duration
    lower_offset, upper_offset = offset
    duration = upper_offset - lower_offset

    # rows of this stay are selected once instead of scanning the whole dataframe for every sub-window
    df_pat = df_reference.loc[df_reference.patientunitstayid == x]

    # aggregate the values by timeunit (typically 1 hour first if whole duration is longer than 1 timeunit)
    if duration > timeunit:
        if agg_timeunit not in dict_aggregators:
            raise ValueError("agg_timeunit must be one of 'median', 'max', 'min', got {!r}".format(agg_timeunit))
        aggregate_hour = dict_aggregators[agg_timeunit]

        hours = int(duration) // timeunit
        lst_hourly_values = []

        # note: both window edges are inclusive, so a value exactly on an edge counts for two
        # neighbouring windows, and hours + 1 windows means the last one starts at
        # lower_offset + hours * timeunit
        for hour in range(hours + 1):

            new_lower_offset = lower_offset + hour * timeunit
            new_upper_offset = lower_offset + (hour + 1) * timeunit

            in_window = ((df_pat["observationoffset"] >= new_lower_offset) &
                         (df_pat["observationoffset"] <= new_upper_offset))
            lst_vitals_pat = df_pat.loc[in_window, vital_name].dropna().tolist()

            # clean_data returns [nan] for an empty window, so the aggregate is nan in that case
            lst_hourly_values.append(aggregate_hour(clean_data(lst_vitals_pat)))

        all_values_array = np.array(lst_hourly_values)
        all_values_array = all_values_array[~np.isnan(all_values_array)]

        if all_values_array.size == 0:
            all_values_array = np.array([np.nan])

    else:
        # NOTE(review): no offset filter is applied here - presumably df_reference is already
        # restricted to the target window by the caller; confirm against the parent function
        all_values_array = clean_data(df_pat[vital_name].dropna().tolist())

    return dict_aggregators[agg_total](all_values_array)


def spikes_function_for_map(x, df_reference, vital_name, offset, threshold, agg_unit, side_spike, timeunit):
    """
    Mapping function for the count_spikes functions. For a single patientunitstayid it splits the offset
    range into windows of length timeunit, aggregates the vital inside each window and counts the windows
    whose aggregated value lies beyond the threshold.

    :param x: patientunitstayid passed to the mapping function
    :param df_reference: Dataframe - has to contain the columns "patientunitstayid", "observationoffset" and vital_name
    :param vital_name: String - name of the column holding the vital
    :param offset: Tuple - (lower offset, upper offset)
    :param threshold: Int or float - value an aggregated window has to exceed / fall below to count as a spike
    :param agg_unit: String, picklist - "median", "max" or "min", how the values inside one window are aggregated
    :param side_spike: String, picklist - "above" counts windows > threshold, every other value counts windows < threshold
    :param timeunit: int - length of one window, in the same unit as observationoffset
    :return: spike count for the single patientunitstayid; np.nan if no window produced a usable value or
        if the offset duration is not longer than one timeunit
    :raises ValueError: if agg_unit is not one of "median", "max", "min"
    """

    aggregators = {"median": np.median, "max": np.max, "min": np.min}
    if agg_unit not in aggregators:
        # an unknown picklist value used to surface as an UnboundLocalError inside the loop
        raise ValueError('agg_unit has to be "median", "max" or "min", got {!r}'.format(agg_unit))
    aggregate = aggregators[agg_unit]

    # open the tuples, calculate duration
    lower_offset, upper_offset = offset
    duration = upper_offset - lower_offset

    if duration <= timeunit:
        # this path used to fall through to `return spike_count` with spike_count never assigned
        # (UnboundLocalError); report the problem as before and return "no result" instead
        print("ERROR: offset too small for timeunit")
        return np.nan

    # the patient filter does not depend on the window, so apply it once
    df_patient = df_reference.loc[df_reference.patientunitstayid == x, ["observationoffset", vital_name]]

    hours = int(duration) // timeunit
    lst_hourly_values = []

    # NOTE(review): both window bounds are inclusive and hours + 1 windows are built, so an observation
    # exactly on a boundary belongs to two windows and the last window starts at (or after) the upper
    # offset. Kept unchanged so the windows match the sibling aggregation function.
    for hour in range(hours + 1):

        new_lower_offset = lower_offset + hour * timeunit
        new_upper_offset = lower_offset + (hour + 1) * timeunit

        in_window = ((df_patient["observationoffset"] >= new_lower_offset) &
                     (df_patient["observationoffset"] <= new_upper_offset))

        lst_vitals_pat = df_patient.loc[in_window, vital_name].dropna().tolist()

        # clean_data is defined elsewhere in the notebook; presumably it returns an array that the
        # numpy aggregators accept even for an empty window - TODO confirm
        vitals_array = clean_data(lst_vitals_pat)

        lst_hourly_values.append(aggregate(vitals_array))

    lst_hourly_no_na = [value for value in lst_hourly_values if not np.isnan(value)]

    if len(lst_hourly_no_na) == 0:
        # no window had a usable value -> the count is unknown, not zero
        return np.nan

    if side_spike == "above":
        lst_spikes = [value for value in lst_hourly_no_na if value > threshold]
    else:
        lst_spikes = [value for value in lst_hourly_no_na if value < threshold]

    return len(lst_spikes)


def get_cleaned_periodic_vitals_vectorized(df_periodic, lst_ids, vital_name, realistic_bounds, offset, agg_timeunit="median",
                                           agg_total="median", timeunit=60):
    """
    Aggregates one vital of the eICU vitalPeriodic table per patientunitstayid. Rows are first restricted to
    the target stays, to values inside the realistic bounds and to observations inside the offset range; the
    per-stay aggregation itself is delegated to aggregated_vitals_function_for_map. Possible vital names:

    temperature
    sao2
    heartrate
    respiration
    cvp
    etco2
    systemicsystolic
    systemicdiastolic
    systemicmean
    pasystolic
    padiastolic
    pamean
    icp

    :param df_periodic: Dataframe - abbreviated (!) Dataframe (unless lots of free RAM) of the vitalPeriodic table of eICU

    :param lst_ids: List - list of patientunitstayids of the target population

    :param vital_name: String - column name of the vital you are looking for

    :param realistic_bounds: Tuple - realistic values of the vital as (lower bound, upper bound), both inclusive

    :param offset: Tuple - (lower offset, upper offset), both inclusive

    :param agg_timeunit: String, picklist, preset "median" - "median", "max" or "min", forwarded to the mapping function
        for the aggregation per timeunit

    :param agg_total: String, picklist, preset "median" - "median", "max" or "min", forwarded to the mapping function
        for the aggregation to the final value

    :param timeunit: Int, preset 60 - length of one timeunit, forwarded to the mapping function

    :return: Dataframe - one column named "<vital_name>_<lower offset>to<upper offset>", patientunitstayids as the index
    """

    value_min, value_max = realistic_bounds
    offset_min, offset_max = offset

    # keep only rows of the target population
    df_population = df_periodic[df_periodic["patientunitstayid"].isin(lst_ids)]

    # keep only plausible values that were observed inside the requested offset range
    vital_values = df_population[vital_name]
    observation_offsets = df_population["observationoffset"]
    keep_row = ((vital_values >= value_min) & (vital_values <= value_max) &
                (observation_offsets >= offset_min) & (observation_offsets <= offset_max))
    df_window = df_population.loc[keep_row, ["patientunitstayid", "observationoffset", vital_name]].copy()

    def aggregate_single_stay(stay_id):
        return aggregated_vitals_function_for_map(stay_id, df_window, vital_name, offset, agg_timeunit, agg_total, timeunit)

    # one row per requested id, in the order of lst_ids
    result_column = "{}_{}to{}".format(vital_name, offset_min, offset_max)
    df_result = pd.DataFrame({'patientunitstayid': lst_ids})
    df_result[result_column] = df_result["patientunitstayid"].map(aggregate_single_stay)

    return df_result.set_index("patientunitstayid")


def count_spikes_periodic(df_periodic, lst_ids, vital_name, realistic_bounds, offset, threshold, agg_unit="median", side_spike="below", timeunit=60):
    """
    Counts, per patientunitstayid, how many timeunits inside the offset range had an aggregated vital value
    above/below a threshold. Rows are first restricted to the target stays, to values inside the realistic
    bounds and to observations inside the offset range; the counting is delegated to spikes_function_for_map.

    :param df_periodic: Dataframe - vitalPeriodic Dataframe of the eICU Database (abbreviated!! if you dont have loads of RAM)

    :param lst_ids: List - list of patientunitstayids

    :param vital_name: String - name of the column of the vital

    :param realistic_bounds: Tuple - (lower realistic bound, upper realistic bound), both inclusive

    :param offset: Tuple - (lower offset bound, upper offset bound), both inclusive

    :param threshold: int (or float) - value the aggregated timeunit is compared against; also part of the column name

    :param agg_unit: String, picklist, preset "median" - how to aggregate one timeunit ("median", "max" or "min")

    :param side_spike: String, picklist, preset "below" - either "above" or "below" -> in which direction spikes are detected

    :param timeunit: Int, preset 60 - number of minutes for the timeunit

    :return: Dataframe - one column named "spikes_<side_spike>_<threshold>_<vital_name>_<lower offset>to<upper offset>"
        with the patientunitstayids as index
    """
    value_min, value_max = realistic_bounds
    offset_min, offset_max = offset

    # keep only rows of the target population
    df_population = df_periodic[df_periodic["patientunitstayid"].isin(lst_ids)]

    # keep only plausible values that were observed inside the requested offset range
    vital_values = df_population[vital_name]
    observation_offsets = df_population["observationoffset"]
    keep_row = ((vital_values >= value_min) & (vital_values <= value_max) &
                (observation_offsets >= offset_min) & (observation_offsets <= offset_max))
    df_window = df_population.loc[keep_row, ["patientunitstayid", "observationoffset", vital_name]].copy()

    def count_single_stay(stay_id):
        return spikes_function_for_map(stay_id, df_window, vital_name, offset, threshold, agg_unit, side_spike, timeunit)

    # one row per requested id, in the order of lst_ids
    result_column = "spikes_{}_{}_{}_{}to{}".format(side_spike, threshold, vital_name, offset_min, offset_max)
    df_result = pd.DataFrame({'patientunitstayid': lst_ids})
    df_result[result_column] = df_result["patientunitstayid"].map(count_single_stay)

    return df_result.set_index("patientunitstayid")

In [17]:
def get_cleaned_combined_vitals_vectorized(df_periodic, df_aperiodic, lst_ids, vital_name, realistic_bounds, offset, agg_timeunit="median",
                                           agg_total="median", timeunit=60):
    """
    Aggregates one blood pressure vital per patientunitstayid using both the vitalPeriodic and the
    vitalAperiodic table of eICU. Each table is restricted to the target stays, to values inside the realistic
    bounds and to observations inside the offset range; the two selections are stacked under the common
    column name vital_name and the per-stay aggregation is delegated to aggregated_vitals_function_for_map.

    :param df_periodic: Dataframe - vitalPeriodic Dataframe of the eICU Database (abbreviated!! if you dont have loads of RAM)

    :param df_aperiodic: Dataframe - vitalAperiodic Dataframe

    :param lst_ids: List - list of patientunitstayids of the target population

    :param vital_name: String, picklist - options "systolic", "diastolic", "mean_bp" (any other value raises a KeyError)

    :param realistic_bounds: Tuple - realistic values of the vital as (lower bound, upper bound), both inclusive

    :param offset: Tuple - (lower offset, upper offset), both inclusive

    :param agg_timeunit: String, picklist, preset "median" - "median", "max" or "min", forwarded to the mapping function
        for the aggregation per timeunit

    :param agg_total: String, picklist, preset "median" - "median", "max" or "min", forwarded to the mapping function
        for the aggregation to the final value

    :param timeunit: Int, preset 60 - length of one timeunit, forwarded to the mapping function

    :return: Dataframe - one column named "<vital_name>_<lower offset>to<upper offset>", patientunitstayids as the index
    """

    # vital_name -> (column in vitalPeriodic, column in vitalAperiodic)
    source_columns = {
        "systolic": ("systemicsystolic", "noninvasivesystolic"),
        "diastolic": ("systemicdiastolic", "noninvasivediastolic"),
        "mean_bp": ("systemicmean", "noninvasivemean")
    }
    column_periodic, column_aperiodic = source_columns[vital_name]

    value_min, value_max = realistic_bounds
    offset_min, offset_max = offset

    def select_valid_rows(df_source, source_column):
        # target population, plausible values, requested offset range; value column renamed to vital_name
        df_population = df_source[df_source["patientunitstayid"].isin(lst_ids)]
        keep_row = ((df_population[source_column] >= value_min) & (df_population[source_column] <= value_max) &
                    (df_population["observationoffset"] >= offset_min) & (df_population["observationoffset"] <= offset_max))
        df_selected = df_population.loc[keep_row, ["patientunitstayid", "observationoffset", source_column]].copy()
        return df_selected.rename(columns={source_column: vital_name})

    # periodic rows first, aperiodic rows second
    df_window = pd.concat([select_valid_rows(df_periodic, column_periodic),
                           select_valid_rows(df_aperiodic, column_aperiodic)])

    def aggregate_single_stay(stay_id):
        return aggregated_vitals_function_for_map(stay_id, df_window, vital_name, offset, agg_timeunit, agg_total, timeunit)

    # one row per requested id, in the order of lst_ids
    result_column = "{}_{}to{}".format(vital_name, offset_min, offset_max)
    df_result = pd.DataFrame({'patientunitstayid': lst_ids})
    df_result[result_column] = df_result["patientunitstayid"].map(aggregate_single_stay)

    return df_result.set_index("patientunitstayid")


def count_spikes_combined(df_periodic, df_aperiodic, lst_ids, vital_name, realistic_bounds, offset, threshold, agg_unit="median", side_spike="below",
                 timeunit=60):
    """
    Counts, per patientunitstayid, how many timeunits inside the offset range had an aggregated blood pressure
    value above/below a threshold, using both the vitalPeriodic and the vitalAperiodic table of eICU. Each
    table is restricted to the target stays, to values inside the realistic bounds and to observations inside
    the offset range; the two selections are stacked under the common column name vital_name and the counting
    is delegated to spikes_function_for_map.

    :param df_periodic: Dataframe - vitalPeriodic Dataframe of the eICU Database (abbreviated!! if you dont have loads of RAM)

    :param df_aperiodic: Dataframe - vitalAperiodic Dataframe

    :param lst_ids: List - list of patientunitstayids

    :param vital_name: String, picklist - options "systolic", "diastolic", "mean_bp" (any other value raises a KeyError)

    :param realistic_bounds: Tuple - (lower realistic bound, upper realistic bound), both inclusive

    :param offset: Tuple - (lower offset bound, upper offset bound), both inclusive

    :param threshold: int (or float) - value the aggregated timeunit is compared against; also part of the column name

    :param agg_unit: String, picklist, preset "median" - how to aggregate one timeunit ("median", "max" or "min")

    :param side_spike: String, picklist, preset "below" - either "above" or "below" -> in which direction spikes are detected

    :param timeunit: Int, preset 60 - number of minutes for the timeunit

    :return: Dataframe - one column named "spikes_<side_spike>_<threshold>_<vital_name>_<lower offset>to<upper offset>"
        with the patientunitstayids as index
    """

    # vital_name -> (column in vitalPeriodic, column in vitalAperiodic)
    source_columns = {
        "systolic": ("systemicsystolic", "noninvasivesystolic"),
        "diastolic": ("systemicdiastolic", "noninvasivediastolic"),
        "mean_bp": ("systemicmean", "noninvasivemean")
    }
    column_periodic, column_aperiodic = source_columns[vital_name]

    value_min, value_max = realistic_bounds
    offset_min, offset_max = offset

    def select_valid_rows(df_source, source_column):
        # target population, plausible values, requested offset range; value column renamed to vital_name
        df_population = df_source[df_source["patientunitstayid"].isin(lst_ids)]
        keep_row = ((df_population[source_column] >= value_min) & (df_population[source_column] <= value_max) &
                    (df_population["observationoffset"] >= offset_min) & (df_population["observationoffset"] <= offset_max))
        df_selected = df_population.loc[keep_row, ["patientunitstayid", "observationoffset", source_column]].copy()
        return df_selected.rename(columns={source_column: vital_name})

    # periodic rows first, aperiodic rows second
    df_window = pd.concat([select_valid_rows(df_periodic, column_periodic),
                           select_valid_rows(df_aperiodic, column_aperiodic)])

    def count_single_stay(stay_id):
        return spikes_function_for_map(stay_id, df_window, vital_name, offset, threshold, agg_unit, side_spike, timeunit)

    # one row per requested id, in the order of lst_ids
    result_column = "spikes_{}_{}_{}_{}to{}".format(side_spike, threshold, vital_name, offset_min, offset_max)
    df_result = pd.DataFrame({'patientunitstayid': lst_ids})
    df_result[result_column] = df_result["patientunitstayid"].map(count_single_stay)

    return df_result.set_index("patientunitstayid")

In [18]:
def apply_icu_free_days(x, df_pat_ref):
    """
    Row-wise helper for get_icu_free_days: calculates the ICU-free days of one unit stay within a window of
    30 days that starts at the admission to that unit.

    Rules:
      * patient died in the unit (unitdischargestatus == 1) -> 0
      * hospitaldischargestatus is neither 0 nor 1 -> np.nan
      * otherwise: (end of window - unitdischargeoffset - minutes of later unit stays) / minutes per day,
        floored at 0. The window ends at hospitaldischargeoffset if the patient died in hospital before
        day 30 and at day 30 in every other case.

    Later unit stays are rows of df_pat_ref with the same uniquepid, hospitalid, hospitaladmittime24,
    hospitaladmitsource, hospitaldischargeyear and hospitaldischargetime24, a higher unitvisitnumber, a start
    after the discharge of this stay and a start within 30 days of the admission to this stay.

    :param x: Series - one row of the patient table with unitdischargestatus / hospitaldischargestatus
        already encoded as 0 (alive) / 1 (expired)

    :param df_pat_ref: Dataframe - patient table rows in which the later unit stays are looked up

    :return: 0, a float number of ICU-free days, or np.nan
    """

    minutes_d = 60 * 24
    horizon = 30 * minutes_d

    # all offsets below are minutes relative to the admission to THIS unit stay
    unit_death = x.unitdischargestatus
    hospital_death = x.hospitaldischargestatus
    hosp_discharge = x.hospitaldischargeoffset
    orig_unit_discharge = x.unitdischargeoffset
    index_admit_offset = x.hospitaladmitoffset

    if unit_death == 1:
        return 0

    if not (hospital_death == 0 or hospital_death == 1):
        # unknown hospital outcome (e.g. NaN): no statement possible, with or without later stays
        return np.nan

    # hospitaladmitoffset is the (negative) distance from the unit admission back to the hospital admission,
    # so a later stay of the same hospitalisation has a SMALLER (more negative) hospitaladmitoffset.
    # negative "minutes after hospital admission" at which this stay was discharged:
    dischargeoffset_from_hosp_admission = index_admit_offset - orig_unit_discharge
    # later stays have to start within 30 days of the admission to this stay:
    maximum_hospoffset = index_admit_offset - horizon

    is_later_stay = ((df_pat_ref.uniquepid == x.uniquepid) &
                     (df_pat_ref.hospitalid == x.hospitalid) &
                     (df_pat_ref.hospitaladmittime24 == x.hospitaladmittime24) &
                     (df_pat_ref.hospitaladmitsource == x.hospitaladmitsource) &
                     (df_pat_ref.hospitaldischargeyear == x.hospitaldischargeyear) &
                     (df_pat_ref.hospitaldischargetime24 == x.hospitaldischargetime24) &
                     (df_pat_ref.unitvisitnumber > x.unitvisitnumber) &
                     (df_pat_ref.hospitaladmitoffset < dischargeoffset_from_hosp_admission) &
                     (df_pat_ref.hospitaladmitoffset > maximum_hospoffset))

    # descending hospitaladmitoffset = chronological order, so the last entry is the latest stay
    later_stays = df_pat_ref.loc[is_later_stay, ["hospitaladmitoffset", "unitdischargeoffset"]].sort_values(
        "hospitaladmitoffset", ascending=False)
    lst_other_stays = list(zip(later_stays["hospitaladmitoffset"], later_stays["unitdischargeoffset"]))
    durations = [duration for _, duration in lst_other_stays]

    if hospital_death == 1 and hosp_discharge < horizon:
        # died in hospital before day 30: only the time until death can be ICU-free. An unconditional
        # early return used to shadow this branch whenever later stays existed, so those patients were
        # credited with free days up to day 30.
        window_end = hosp_discharge
        icu_minutes_other = sum(durations)
    else:
        window_end = horizon
        icu_minutes_other = sum(durations)

        if lst_other_stays:
            last_stay_offset, last_stay_duration = lst_other_stays[-1]
            # end of the latest stay relative to the admission of THIS stay. The old code measured it from
            # the hospital admission and compared it with a horizon measured from the unit admission.
            end_last_stay = (index_admit_offset - last_stay_offset) + last_stay_duration

            if not end_last_stay < horizon:
                # NOTE(review): a latest stay that reaches beyond day 30 is left out completely instead of
                # being counted up to day 30 - kept as in the original calculation
                icu_minutes_other = sum(durations[:-1])

    icu_free_days = (window_end - orig_unit_discharge - icu_minutes_other) / minutes_d

    if icu_free_days < 0:
        return 0

    return icu_free_days


def get_icu_free_days(df_pat, lst_ids):
    """
    Calculates the ICU-free days for a list of patientunitstayids by applying apply_icu_free_days to every
    selected row of the patient table.

    :param df_pat: Dataframe - Patient dataframe of eICU (or abbreviated); has to contain all unit stays of the
        selected patients, because other stays of the same uniquepid are looked up in it

    :param lst_ids: List - list of the population's patientunitstayids

    :return: Dataframe - columns "patientunitstayid" and "ICU_free_days"

    """

    status_codes = {"Alive": 0, "Expired": 1}
    needed_columns = ["patientunitstayid", "hospitaldischargestatus",
                      "unitdischargestatus", "uniquepid", "unitvisitnumber", "hospitalid", "hospitaladmittime24",
                      "hospitaladmitsource", "hospitaldischargetime24", "hospitaldischargeyear", "hospitaladmitoffset",
                      "unitdischargeoffset", "hospitaldischargeoffset"]

    # rows of the target population, discharge status encoded as 0 / 1 (anything else becomes NaN)
    in_population = df_pat.patientunitstayid.isin(lst_ids)
    df_stays = df_pat.loc[in_population, needed_columns].copy()
    df_stays["unitdischargestatus"] = df_stays["unitdischargestatus"].map(status_codes)
    df_stays["hospitaldischargestatus"] = df_stays["hospitaldischargestatus"].map(status_codes)

    # every unit stay of the patients in the population, used to look up the other stays
    population_pids = list(df_stays.uniquepid.unique())
    df_reference = df_pat[df_pat.uniquepid.isin(population_pids)].copy()

    def free_days_of_row(row):
        return apply_icu_free_days(row, df_reference)

    df_stays["ICU_free_days"] = df_stays.apply(free_days_of_row, axis=1)

    return df_stays[["patientunitstayid", "ICU_free_days"]].copy()

## 3.2  Demographic Data

In [19]:
df_pat = pd.read_csv("PE_data/patient_PE.csv", low_memory=False)

In [20]:
df_patinfo_PE = get_basic_patient_info(df_pat, lst_pat)

In [21]:
# Collapse every ethnicity outside the two most frequent categories into one label. Missing values are
# included, because NaN is never a member of the top-2 list.
df_patinfo_PE.loc[~(df_patinfo_PE.ethnicity.isin(df_patinfo_PE.ethnicity.value_counts().index[:2].to_list())), "ethnicity"] = "Unknown/Other"
# Safety net for remaining NaN; uses the SAME label as above (it used to read "Other/Unknown", which would
# have created a second, separate category if it ever applied).
df_patinfo_PE.ethnicity = df_patinfo_PE.ethnicity.fillna("Unknown/Other")
# Encode the discharge status as 1 (expired) / 0 (alive); any other value becomes NaN.
# NOTE: not idempotent - running this cell a second time maps the 0/1 codes to NaN.
df_patinfo_PE.hospitaldischargestatus = df_patinfo_PE.hospitaldischargestatus.map({"Expired": 1, "Alive": 0})
df_patinfo_PE.unitdischargestatus = df_patinfo_PE.unitdischargestatus.map({"Expired": 1, "Alive": 0})

## 3.3 Comorbidities 

In [22]:
df_pmh = pd.read_csv("PE_data/pastHistory_PE.csv", low_memory=False)

In [23]:
df_pmhinfo = get_pastHistory(df_pmh, lst_pat)

100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [00:03<00:00, 19.11it/s]


In [24]:
df_pe = pd.merge(
    left=df_patinfo_PE,
    right=df_pmhinfo,
    how="left",
    on="patientunitstayid"
)

## 3.4 APACHE IVa, APS and associated variables

In [25]:
df_apsVar = pd.read_csv("PE_data/apacheApsVar_PE.csv", low_memory=False)

In [26]:
df_apsvarinfo = get_and_initial_clean_apacheApsVar(df_apsVar, lst_pat)

In [27]:
df_pe = df_pe.merge(
    right=df_apsvarinfo,
    how="left",
    on="patientunitstayid"
)

In [28]:
df_predVar = pd.read_csv("PE_data/apachePredVar_PE.csv", low_memory=False)

In [29]:
df_predvarinfo = get_cleaned_apachePredVar_basics(df_predVar, lst_pat)

In [30]:
df_pe = df_pe.merge(
    right=df_predvarinfo,
    how="left",
    on="patientunitstayid"
)

In [31]:
df_apacheresult = pd.read_csv("PE_data/apachePatientResult_PE.csv", low_memory=False)

In [32]:
df_apacheresult_info = get_cleaned_apachePatientResult_basics(df_apacheresult, lst_pat)

In [33]:
df_pe = df_pe.merge(
    right=df_apacheresult_info,
    how="left",
    on="patientunitstayid"
)

## 3.5 Infusions

In [34]:
df_infusions = pd.read_csv("PE_data/infusionDrug_PE.csv", low_memory=False)

In [35]:
df_infusions_info = get_infusion_drugs_prototype(df_infusions, lst_pat).reset_index()

In [36]:
df_pe = df_pe.merge(
    right=df_infusions_info,
    how="left",
    on="patientunitstayid"
)

## 3.6 Laboratory values

In [37]:
df_labs = pd.read_csv("PE_data/lab_PE.csv", low_memory=False)

In [38]:
dict_pat_w_offset = {pat: 0 for pat in lst_pat}

In [39]:
# lab name -> [bounds handed to clean_median_labs as lab_bounds, bounds handed over as lab_realistic]
dict_labs = {
    "potassium": [(0, 100), (1, 10)],
    "bedside glucose": [(0, 100000), (35, 2000)],
    "glucose": [(0, 100000), (35, 2000)],
    "Hgb": [(0, 100), (4, 20)],
    "creatinine": [(0, 100), (0.1, 20)],
    "BUN": [(0, 1000), (1, 150)],
    "platelets x 1000": [(0, 10000), (5, 3000)],
    "bicarbonate": [(0, 1000), (5, 40)],
}

lst_lab_clms = []

# one column per lab, calculated over the offsets 0 to 1440 relative to each stay's entry in dict_pat_w_offset
for lab, bounds in tqdm(dict_labs.items()):
    lab_bounds, realistic_bounds = bounds
    new_column_name = "{}_median_0to1440".format(lab)

    df_new_lab_clm = clean_median_labs(
        df_labs=df_labs,
        dict_stayids_offsets=dict_pat_w_offset,
        offset_relative=(0, 1440),
        lab_name=lab,
        lab_bounds=lab_bounds,
        lab_realistic=realistic_bounds,
    ).rename(columns={"labvalues": new_column_name})

    lst_lab_clms.append(df_new_lab_clm)

# put the lab columns side by side, aligned on the index
df_labsinfo = pd.concat(lst_lab_clms, axis=1)
# where no "glucose" value exists, fall back to the "bedside glucose" value
df_labsinfo["glucose_median_0to1440"] = df_labsinfo["glucose_median_0to1440"].fillna(df_labsinfo["bedside glucose_median_0to1440"])
df_labsinfo = df_labsinfo.reset_index()


100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [03:33<00:00, 26.63s/it]


In [40]:
df_pe = df_pe.merge(
    right=df_labsinfo,
    how="left",
    on="patientunitstayid"
)

## 3.7 Own vitals

In [41]:
df_periodic = pd.read_csv("PE_data/vitalPeriodic_PE.csv", low_memory=False)
df_a_periodic = pd.read_csv("PE_data/vitalAperiodic_PE.csv", low_memory=False)

In [42]:
# Builds one 0/1 flag column per vital ("pesi_<vital>_30_median"): 1 if at least one 30-minute median within
# the offsets 0 to 1440 lies beyond the threshold, else 0.
# Entry layout: vital column -> [realistic bounds (lower, upper), threshold, side of the threshold that counts]
dict_vitals = {
    "heartrate": [(20, 200), 109, "above"],
    "temperature": [(32, 43), 36, "below"],
    "respiration": [(3, 80), 29, "above"],
    "sao2": [(50, 100), 91, "below"]
}

# same layout; the key is one of the combined (periodic + aperiodic) vital names of count_spikes_combined
dict_vitals_combined = {
    "systolic": [(20, 250), 100, "below"]
}

lst_clms_pesi_vitals = []

# NOTE: the loop variables below (key, values, realistic, threshold, side, ...) are notebook globals and keep
# their last value after this cell has run.

# for the values that only need the vitals periodic datafile
for key, values in tqdm(dict_vitals.items()):
    realistic = values[0]
    threshold = values[1]
    side = values[2]
    clm_name = "pesi_{}_30_median".format(key)
    # has to match the column name that count_spikes_periodic builds from side, threshold, vital and offset
    internal_clm_name = "spikes_{}_{}_{}_0to1440".format(side, threshold, key)

    df_temp = count_spikes_periodic(df_periodic=df_periodic,
                                   lst_ids=lst_pat,
                                   vital_name=key,
                                   realistic_bounds=realistic,
                                   offset=(0, 1440),
                                   threshold=threshold,
                                   agg_unit="median",
                                   side_spike=side,
                                   timeunit=30)

    # NOTE(review): a NaN spike count (no usable measurement for the stay) also becomes 0 here, because
    # `nan > 0` is False - confirm that "no data" is meant to be treated like "no spike"
    df_temp[clm_name] = df_temp[internal_clm_name].map(lambda x: 1 if x > 0 else 0)

    df_clm = df_temp.loc[:, [clm_name]].copy()

    lst_clms_pesi_vitals.append(df_clm)

# for the systolic blood pressure that needs the combined data files
for key, values in tqdm(dict_vitals_combined.items()):
    realistic = values[0]
    threshold = values[1]
    side = values[2]
    clm_name = "pesi_{}_30_median".format(key)
    # has to match the column name that count_spikes_combined builds from side, threshold, vital and offset
    internal_clm_name = "spikes_{}_{}_{}_0to1440".format(side, threshold, key)

    df_temp = count_spikes_combined(df_periodic=df_periodic,
                                     df_aperiodic=df_a_periodic,
                                     lst_ids=lst_pat,
                                     vital_name=key,
                                     realistic_bounds=realistic,
                                     offset=(0, 1440),
                                     threshold=threshold,
                                     agg_unit="median",
                                     side_spike=side,
                                     timeunit=30)

    # NOTE(review): same NaN -> 0 behaviour as in the loop above
    df_temp[clm_name] = df_temp[internal_clm_name].map(lambda x: 1 if x > 0 else 0)

    df_clm = df_temp.loc[:, [clm_name]].copy()

    lst_clms_pesi_vitals.append(df_clm)

# all flag frames are indexed by patientunitstayid, so they can be put side by side
df_final_vitals_pesi = pd.concat(lst_clms_pesi_vitals, axis=1)

# turn the index back into a column for the merge in the next cell
df_final_vitals_pesi = df_final_vitals_pesi.reset_index().rename(columns={"index": "patientunitstayid"})

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [13:02<00:00, 195.57s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [02:07<00:00, 127.58s/it]


In [43]:
df_pe = df_pe.merge(
    right=df_final_vitals_pesi,
    how="left",
    on="patientunitstayid"
)

In [44]:
# respiratory rate, SaO2, HR, Sys, mean, dias BP
# Median of the 30-minute medians within the offsets 0 to 1440, one column per vital.
# vital column of vitalPeriodic -> realistic bounds (lower, upper)
dict_vitals = {
    "heartrate": (20, 200),
    "respiration": (3, 80),
    "sao2": (50, 100)
}

# combined (periodic + aperiodic) vital name -> realistic bounds (lower, upper)
dict_vitals_combined = {
    "systolic": (20, 250),
    "diastolic": (5, 180),
    "mean_bp": (10, 200)
}

lst_clms_vitals_general = []

# for the values that only need the vitals periodic datafile
for key, values in tqdm(dict_vitals.items()):

    # pass the bounds of THIS vital (`values`). The call used to pass `realistic`, a variable that is not
    # assigned in this loop and therefore held whatever an earlier cell left behind, so every vital was
    # filtered with the same, unrelated bounds.
    df_temp = get_cleaned_periodic_vitals_vectorized(df_periodic=df_periodic,
                                                     lst_ids=lst_pat,
                                                     vital_name=key,
                                                     realistic_bounds=values,
                                                     offset=(0, 1440),
                                                     agg_timeunit="median",
                                                     agg_total="median",
                                                     timeunit=30)

    # column name produced by the function -> name that also states aggregation and timeunit
    internal_clm_name = "{}_0to1440".format(key)
    new_clm_name = "{}_median_0to1440_u30".format(key)

    df_temp = df_temp.rename(columns={internal_clm_name: new_clm_name})
    lst_clms_vitals_general.append(df_temp)

# for the blood pressures that need the combined data files
for key, realistic in tqdm(dict_vitals_combined.items()):
    df_temp = get_cleaned_combined_vitals_vectorized(df_periodic=df_periodic,
                                                   df_aperiodic=df_a_periodic,
                                                   lst_ids=lst_pat,
                                                   vital_name=key,
                                                   realistic_bounds=realistic,
                                                   offset=(0, 1440),
                                                   agg_timeunit="median",
                                                   agg_total="median",
                                                   timeunit=30)

    internal_clm_name = "{}_0to1440".format(key)
    new_clm_name = "{}_median_0to1440_u30".format(key)

    df_temp = df_temp.rename(columns={internal_clm_name: new_clm_name})
    lst_clms_vitals_general.append(df_temp)


# all frames are indexed by patientunitstayid, so they can be put side by side
df_final_vitals_general = pd.concat(lst_clms_vitals_general, axis=1)

# turn the index back into a column for the merge in the next cell
df_final_vitals_general = df_final_vitals_general.reset_index().rename(columns={"index": "patientunitstayid"})

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [09:28<00:00, 189.52s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [07:20<00:00, 146.76s/it]


In [45]:
df_pe = df_pe.merge(
    right=df_final_vitals_general,
    how="left",
    on="patientunitstayid"
)

In [46]:
df_pe.shape

(1697, 93)

## 3.8 ICU free days

In [47]:
df_icufree_d_info = get_icu_free_days(df_pat_big, lst_pat)

In [48]:
df_pe = df_pe.merge(
    right=df_icufree_d_info,
    how="left",
    on="patientunitstayid"
)

# 4. PESI score calculation

## 4.1 Functions

In [49]:
def map_vitals_from_two_sources(x, df_ref_data, clm_aps, clm_own, threshhold, counterthreshold, side):
    """
    Derive a binary vital-sign flag for one patientunitstayid from two sources: primarily
    the APS column and, when the APS value lies at the opposite extreme, an own column.

    The flag is 1 if the APS value is beyond ``threshhold`` on the requested ``side``.
    If the APS value is instead beyond ``counterthreshold`` on the opposite side, the
    own value is compared against ``threshhold``. If both values are missing, the
    result is NaN instead of 0.

    :param x: patientunitstayid (index label of ``df_ref_data``); intended for use inside ``Series.map``
    :param df_ref_data: Dataframe indexed by patientunitstayid
    :param clm_aps: String - name of the APS column
    :param clm_own: String - name of the self-derived column
    :param threshhold: Int/Float - cut-off that defines a positive flag
    :param counterthreshold: Int/Float - cut-off on the opposite side that triggers the fallback to ``clm_own``
    :param side: String, picklist - "above" (flag if value > threshhold) or "below" (flag if value < threshhold);
        any other value yields 0
    :return: 1, 0 or np.nan
    """

    value_aps = df_ref_data.loc[x, clm_aps]
    value_own = df_ref_data.loc[x, clm_own]

    if side == "above":
        aps_positive = value_aps > threshhold
        aps_at_opposite_extreme = value_aps < counterthreshold
        own_positive = value_own > threshhold
    elif side == "below":
        aps_positive = value_aps < threshhold
        aps_at_opposite_extreme = value_aps > counterthreshold
        own_positive = value_own < threshhold
    else:
        # unknown side: keep the historical behaviour of returning 0
        return 0

    if aps_positive:
        return 1

    # NaN never compares equal to anything (not even to itself), so "== np.nan" can
    # never be True; missingness has to be tested with pd.isna.
    if pd.isna(value_aps) and pd.isna(value_own):
        return np.nan

    if aps_at_opposite_extreme and own_positive:
        return 1

    return 0

## 4.2 Calculation

In [50]:
# Demographic PESI components: age enters the score as is, male sex is coded as 1.
df_pe = df_pe.assign(
    PESI_age=df_pe["age"],
    PESI_gender=df_pe["gender"].map({"Male": 1, "Female": 0}),
)

In [51]:
## the pmh components
# A component is 1 if at least one source column is present AND non-zero, 0 otherwise,
# and NaN only if every source column is missing. "!= 0" alone is not enough because
# NaN != 0 evaluates to True and would flag patients with a missing entry as positive.

# the chronic pulmonary column
df_pulm_src = df_pe[["pmh_COPD", "pmh_asthma", "pmh_home_o2",
                     "pmh_restrictive_lung_disease", "pmh_s_p_lungTx"]]
df_pe["PESI_pulm"] = (
    (df_pulm_src.notna() & df_pulm_src.ne(0)).any(axis=1)
    .astype(float)
    .mask(df_pulm_src.isna().all(axis=1))
)

# HF column
pmh_chf = df_pe["pmh_CHF"]
df_pe["PESI_hf"] = (pmh_chf.notna() & pmh_chf.ne(0)).astype(float).mask(pmh_chf.isna())

# cancer column: cancer therapy, a cancer diagnosis "within past ...", or one of the
# APACHE prediction flags for lymphoma / metastatic cancer / leukemia
df_cancer_src = df_pe[["pmh_cancer_therapy", "pred_lymphoma",
                       "pred_metastaticcancer", "pred_leukemia"]]
# na=False: entries that are not strings (0 or NaN) count as "no match"
cancer_within_past = (
    df_pe["pmh_cancer"]
    .str.contains("within past", case=False, regex=False, na=False)
    .astype(bool)
)
cancer_all_missing = df_cancer_src.isna().all(axis=1) & df_pe["pmh_cancer"].isna()
df_pe["PESI_cancer"] = (
    ((df_cancer_src.notna() & df_cancer_src.ne(0)).any(axis=1) | cancer_within_past)
    .astype(float)
    .mask(cancer_all_missing)
)


In [52]:
# Snapshot of the cohort keyed by stay id; used for the per-patient value look-ups below.
df_pe_ref = df_pe.copy().set_index("patientunitstayid")

# Pulse >= 110/min: primarily from aps_heartrate (midpoint 75 for the APS variable);
# for APS values of 40 and below the own 30-min median has to be checked instead.
df_pe["PESI_pulse"] = [
    map_vitals_from_two_sources(stay_id, df_pe_ref, "aps_heartrate", "pesi_heartrate_30_median", 109, 41, "above")
    for stay_id in df_pe["patientunitstayid"]
]

# Systolic BP: not part of APS, taken only from the own combined vitals.
df_pe["PESI_systolic"] = df_pe["pesi_systolic_30_median"]

# Temperature < 36: primarily from aps_temperature; for APS values of 40 and above
# the own 30-min median has to be checked instead.
df_pe["PESI_temp"] = [
    map_vitals_from_two_sources(stay_id, df_pe_ref, "aps_temperature", "pesi_temperature_30_median", 36, 40, "below")
    for stay_id in df_pe["patientunitstayid"]
]

# Respiratory rate >= 30/min: primarily from aps_respiratoryrate; for APS values of
# 8 and below the own 30-min median has to be checked instead.
df_pe["PESI_resp"] = [
    map_vitals_from_two_sources(stay_id, df_pe_ref, "aps_respiratoryrate", "pesi_respiration_30_median", 29, 9, "above")
    for stay_id in df_pe["patientunitstayid"]
]

# SpO2 < 90
df_pe["PESI_o2"] = df_pe["pesi_sao2_30_median"]

# Altered mental status from aps_verbal: 5 -> 0, below 5 -> 1, anything else
# (including a missing value) stays NaN.
aps_verbal = df_pe["aps_verbal"]
df_pe["PESI_ams"] = np.select([aps_verbal == 5, aps_verbal < 5], [0, 1], default=np.nan)


In [53]:
## PESI score and PESI classes
# Points per positive component; age is added as is. The order matches the summation order.
PESI_POINTS = (
    ("PESI_gender", 10), ("PESI_cancer", 30), ("PESI_hf", 10), ("PESI_pulm", 10),
    ("PESI_pulse", 20), ("PESI_systolic", 30), ("PESI_resp", 20), ("PESI_temp", 20),
    ("PESI_ams", 60), ("PESI_o2", 20),
)


def _pesi_score(row):
    """Sum age and the weighted PESI components of one row (NaN in any component gives NaN)."""
    total = row["age"]
    for component, points in PESI_POINTS:
        total = total + row[component] * points
    return total


# calculate the score
df_pe["PESI_score"] = df_pe.apply(_pesi_score, axis=1)

# calculate the PESI classes (I: < 66, II: 66-85, III: 86-105, IV: 106-125, V: > 125);
# rows without a score keep NaN
pesi_score = df_pe["PESI_score"]
df_pe["PESI_class"] = np.select(
    [
        pesi_score > 125,
        pesi_score.between(106, 125),
        pesi_score.between(86, 105),
        pesi_score.between(66, 85),
        pesi_score < 66,
    ],
    [5, 4, 3, 2, 1],
    default=np.nan,
)

# 5. sPESI score calculation

In [54]:
# sPESI age column: 1 for age above 80, otherwise 0 (a missing age also yields 0)
df_pe["sPESI_age"] = (df_pe["PESI_age"] > 80).astype("int64")

# taken over unchanged from the PESI components
df_pe = df_pe.assign(
    sPESI_systolic=df_pe["PESI_systolic"],
    sPESI_pulse=df_pe["PESI_pulse"],
    sPESI_o2=df_pe["PESI_o2"],
)


In [55]:
# cancer and cardiopulm clms
# History columns count as positive when they are present AND non-zero. A plain "!= 0"
# is True for NaN and would score a missing history as disease; the "> 0" tests on the
# numeric columns are already False for NaN. Rows where everything is missing score 0.
df_cancer_hist = df_pe[["pmh_cancer_therapy", "pmh_cancer"]]
df_cancer_pred = df_pe[["pred_metastaticcancer", "pred_lymphoma", "pred_leukemia"]]
df_pe["sPESI_cancer"] = (
    (df_cancer_hist.notna() & df_cancer_hist.ne(0)).any(axis=1)
    | df_cancer_pred.gt(0).any(axis=1)
).astype("int64")

df_cardio_hist = df_pe[["pmh_MI", "pmh_pacemaker", "pmh_AICD", "pmh_CA_bypass"]]
df_cardio_numeric = df_pe[["PESI_pulm", "PESI_hf", "pred_midur"]]
df_pe["sPESI_cardiopulm"] = (
    (df_cardio_hist.notna() & df_cardio_hist.ne(0)).any(axis=1)
    | df_cardio_numeric.gt(0).any(axis=1)
).astype("int64")


In [56]:
# sPESI score = sum of the six binary components (a NaN in any component gives NaN)
lst_spesi_components = ["sPESI_age", "sPESI_cancer", "sPESI_cardiopulm",
                        "sPESI_pulse", "sPESI_systolic", "sPESI_o2"]
df_pe["sPESI_score"] = sum(
    (df_pe[clm] for clm in lst_spesi_components[1:]),
    df_pe[lst_spesi_components[0]],
)

# 6. Patient exclusion

In [57]:
# Exclude stays that lack any of the fields required for the analysis.
lst_required_clms = [
    "gender",
    "predictedicumortality",
    "predictedhospitalmortality",
    "aps_verbal",
    "hospitaldischargestatus",
]
df_pe = df_pe.dropna(subset=lst_required_clms)

In [58]:
# Keep adults only (age of 18 or above); rows with a missing age are dropped as well.
df_pe = df_pe.loc[df_pe["age"].ge(18)].copy()

In [59]:
# Sanity check: (rows, columns) of the cohort frame after the exclusion steps.
df_pe.shape

(1340, 114)

# 7. Further data processing

## 7.1 Functions


In [64]:
def get_dict_for_categorical_with_0(df, column, printoption=False):
    """
    Build a value -> integer code dictionary for one column, ordered by frequency.

    The most frequent non-zero value gets code 1, the next one code 2, and so on
    (order as returned by ``value_counts``). The value 0 is always mapped to code 0.

    :param df: Dataframe
    :param column: String - column name
    :param printoption: Boolean - whether to print the table of values, counts and codes
    :return: Dictionary {value: code}
    """

    counts = df[column].value_counts()
    counts_nonzero = counts[counts.index != 0]

    # same table the codes are read from: codes are the 1-based row labels
    df_ranked = pd.DataFrame(
        {"unique_values": counts_nonzero.index, "counts": counts_nonzero.to_numpy()},
        index=np.arange(1, len(counts_nonzero) + 1),
    )

    if printoption == True:
        print(df_ranked)

    dict_to_cat = {value: code for code, value in zip(df_ranked.index, df_ranked["unique_values"])}
    dict_to_cat[0] = 0

    return dict_to_cat

## 7.2 Data processing

In [60]:
# Encode gender numerically. Running this cell a second time maps every value to NaN.
dict_gender_codes = {"Female": 0, "Male": 1}
df_pe["gender"] = df_pe["gender"].map(dict_gender_codes)

In [61]:
# Remove the stray blank from the hemolytic-anemia column name.
dict_clm_renames = {"pmh_hemolytic _anemia": "pmh_hemolytic_anemia"}
df_pe = df_pe.rename(columns=dict_clm_renames)

In [62]:
# Columns that are collapsed in place to 0/1 (every present, non-zero entry becomes 1).
lst_clms_binary = ["pmh_HT_with_treatment", "pmh_MI", "pmh_angina", "pmh_strokes", "pmh_periph_vasc_disease", "pmh_CA_bypass",
                  "pmh_PCI", "pmh_pacemaker", "pmh_AICD", "pmh_venous_thrombosis", "pmh_asthma", "pmh_hemolytic_anemia",
                  "pmh_aplastic_anemia", "pmh_clotting_disorder", "pmh_hypercoagulable_condition", "pmh_hypothyroidism", "pmh_hyperthyroidism",
                  "pmh_CHF", "pmh_restrictive_lung_disease", "pmh_card_valvular", 'pmh_home_o2', 'pmh_seizures', 'pmh_dementia', 'pmh_neuromusk_disease',
                  'pmh_intracranial_mass', 'pmh_sickle_cells', 'pmh_liver_cirrhosis', 'pmh_ITP']

for clm in lst_clms_binary:
    df_pe.loc[(df_pe[clm] != 0) & (df_pe[clm].notna()), clm] = 1


# Columns that keep their detailed values and get an additional 0/1 companion column.
dict_clms_to_binary = {
    "pmh_cancer": "pmh_cancer_binary",
    "pmh_insulin_dep_DM": "pmh_diabetes_binary",
    "pmh_COPD": "pmh_COPD_binary",
    "pmh_arrhythmias": "pmh_arrhythmias_binary",
    "pmh_renal_insuff": "pmh_renal_insuff_binary",
    "pmh_renal_failure": "pmh_renal_failure_binary",
}

for orig_clm, new_clm in dict_clms_to_binary.items():
    # "x != np.nan" is always True (NaN never compares equal), so missing values have
    # to be excluded with notna(); otherwise NaN would be coded as 1.
    source = df_pe[orig_clm]
    df_pe[new_clm] = (source.notna() & source.ne(0)).astype("int64")


In [65]:
def map_pmh_cancer(x):
    """
    Reduce a past-history cancer entry to its primary site.

    :param x: 0 (no cancer) or a "|"-separated string of cancer entries
    :return: 0 for no cancer, "other" if no primary site is given, "multiple" if more
        than one primary site is given, otherwise the text after the first "/" of the
        primary-site entry
    """
    if x == 0:
        return 0

    site_marker = "Cancer-Primary Site/"
    if site_marker not in x:
        return "other"

    site_entries = [entry for entry in x.split("|") if site_marker in entry]
    if len(site_entries) > 1:
        return "multiple"

    return site_entries[0].split("/")[1]

# Primary cancer site per patient, then replaced by its frequency-rank code
# (1 = most frequent site in this cohort, 0 = no cancer).
df_pe["pmh_cancer_grouped"] = df_pe["pmh_cancer"].map(map_pmh_cancer)
dict_pmhcancer_to_cat = get_dict_for_categorical_with_0(df_pe, "pmh_cancer_grouped")
df_pe["pmh_cancer_grouped"] = df_pe["pmh_cancer_grouped"].map(dict_pmhcancer_to_cat)

# Organ-system group per rank code.
# FIXME: the codes are frequency ranks of THIS cohort, so the table is only valid for
# this exact data set; re-check it whenever the cohort or the exclusion criteria change.
dict_cancer_code_to_group = {
    0: "No_cancer",
    1: "Respiratory",
    2: "Breast",
    3: "other",
    4: "GI",
    5: "Genitourinary",
    6: "GI",
    7: "Genitourinary",
    8: "other",
    9: "Genitourinary",
    10: "GI",
    11: "other",
    12: "Genitourinary",
    13: "Genitourinary",
    14: "GI",
    15: "GI",
    16: "other",
    17: "other",
    18: "GI",
    19: "other",
    20: "other",
    21: "other",
    22: "Genitourinary",
}

# One lookup instead of writing strings row-wise into an integer column (which newer
# pandas versions reject as an incompatible dtype). Codes without an entry keep 0.
df_pe["pmh_cancer_grouped_v2"] = df_pe["pmh_cancer_grouped"].map(
    lambda code: dict_cancer_code_to_group.get(code, 0)
)

In [66]:
# Check whether pmh_non_insulin_dep_DM holds any patient that pmh_insulin_dep_DM does not.
df_non = df_pe.loc[df_pe["pmh_non_insulin_dep_DM"] != 0, "patientunitstayid"].copy()
df_insulin = df_pe.loc[df_pe["pmh_insulin_dep_DM"] != 0, "patientunitstayid"].copy()

lst_non = df_non.tolist()
lst_insulin = df_insulin.tolist()

# set for the membership test; order of lst_non is kept
set_insulin = set(lst_insulin)
lst_diff = [stay_id for stay_id in lst_non if stay_id not in set_insulin]

print(len(lst_diff))
# -> a count of 0 means the non-insulin_dep column is obsolete!

def map_pmh_diabetes(x):
    """
    Group a past-history diabetes entry by treatment.

    :param x: 0 (no diabetes) or the diabetes entry as string
    :return: 0, "dm_without_treatment", "medication_only", "Insulin",
        "Insulin_and_medication", or None if the entry matches none of the groups
    """
    if x == 0:
        return 0

    # must be tested first: "non-medication" also contains "medication"
    if "non-medication" in x:
        return "dm_without_treatment"

    if x == "medication dependent":
        return "medication_only"

    if "insulin" not in x:
        return None

    return "Insulin_and_medication" if "medication" in x else "Insulin"


# Treatment group per patient, derived from the insulin-dependent DM column.
df_pe["pmh_diabetes"] = df_pe["pmh_insulin_dep_DM"].map(map_pmh_diabetes)

0


In [67]:
# COPD severity labels. Values that are not listed here become NaN, and running the
# cell a second time maps the already converted labels to NaN as well.
dict_pmhCOPD = {0: 0}
dict_pmhCOPD.update(dict.fromkeys(["COPD  - no limitations"], "COPD_mild"))
dict_pmhCOPD.update(dict.fromkeys(["COPD  - moderate"], "COPD_moderate"))
dict_pmhCOPD.update(dict.fromkeys(["COPD  - severe", "COPD  - moderate|COPD  - severe"], "COPD_severe"))

df_pe["pmh_COPD"] = df_pe["pmh_COPD"].map(dict_pmhCOPD)

In [68]:
def map_pmh_arrhythmias(x):
    """
    Group a past-history arrhythmia entry.

    :param x: 0 (no arrhythmia) or the arrhythmia entry as string
    :return: 0, "Afib_orwith" if atrial fibrillation is part of the entry, else "other_arrhythmia"
    """
    if x == 0:
        return 0

    return "Afib_orwith" if "atrial fibrillation" in x else "other_arrhythmia"


# Replace the detailed arrhythmia entries by their group (in place).
df_pe["pmh_arrhythmias"] = df_pe["pmh_arrhythmias"].map(map_pmh_arrhythmias)

In [69]:
# Renal insufficiency grouped by baseline creatinine; values not listed here become NaN.
dict_renal_insuff = {0: 0}
dict_renal_insuff.update(dict.fromkeys(
    ["renal insufficiency - creatinine 1-2", "renal insufficiency - creatinine 2-3"],
    "renal_insuff_crea<3"))
dict_renal_insuff.update(dict.fromkeys(
    ["renal insufficiency - creatinine 3-4", "renal insufficiency - creatinine 4-5"],
    "renal_insuff_crea>3"))
dict_renal_insuff.update(dict.fromkeys(
    ["renal insufficiency - baseline creatinine unknown"],
    "renal_insuff_creaunknown"))

df_pe["pmh_renal_insuff"] = df_pe["pmh_renal_insuff"].map(dict_renal_insuff)

In [70]:
# Renal failure grouped by dialysis status; values not listed here become NaN.
dict_renal_failure = {0: 0}
dict_renal_failure.update(dict.fromkeys(
    ["renal failure - hemodialysis", "renal failure - peritoneal dialysis"],
    "renal_fail_w_dialysis"))
dict_renal_failure.update(dict.fromkeys(
    ["renal failure- not currently dialyzed"],
    "renal_fail_no_dialysis"))

df_pe["pmh_renal_failure"] = df_pe["pmh_renal_failure"].map(dict_renal_failure)

In [71]:
def map_pmh_PE(x):
    """
    Group a past-history pulmonary-embolism entry.

    :param x: 0 (no previous PE) or the PE entry as string
    :return: 0, "multiple_PE" if the entry contains "multiple", else "single_PE"
    """
    if x == 0:
        return 0

    return "multiple_PE" if "multiple" in x else "single_PE"

# Replace the detailed PE history entries by their group (in place).
df_pe["pmh_PE"] = df_pe["pmh_PE"].map(map_pmh_PE)

In [72]:
# Combined past-history flags: 1 if any of the listed source columns is positive, else 0.
df_pe["pmh_CAD_and_other_large_vessel"] = (
    df_pe[["pmh_MI", "pmh_angina", "pmh_strokes", "pmh_periph_vasc_disease", "pmh_CA_bypass", "pmh_PCI"]]
    .eq(1).any(axis=1).astype("int64")
)

df_pe["pmh_any_pacemaker"] = df_pe[["pmh_pacemaker", "pmh_AICD"]].eq(1).any(axis=1).astype("int64")

# pmh_PE and pmh_COPD hold group labels (or 0), so "positive" means present AND non-zero.
# "x != np.nan" is always True and cannot detect missing values, hence notna().
pmh_pe_positive = df_pe["pmh_PE"].notna() & df_pe["pmh_PE"].ne(0)
df_pe["pmh_venous_thromb_and_PE"] = (df_pe["pmh_venous_thrombosis"].eq(1) | pmh_pe_positive).astype("int64")

# NOTE(review): COPD entries that the COPD dictionary could not map are NaN at this
# point and now count as "no COPD" here; pmh_COPD_binary still reflects the raw entry.
pmh_copd_positive = df_pe["pmh_COPD"].notna() & df_pe["pmh_COPD"].ne(0)
df_pe["pmh_obstructive_LD"] = (pmh_copd_positive | df_pe["pmh_asthma"].eq(1)).astype("int64")

df_pe["pmh_anemias"] = df_pe[["pmh_hemolytic_anemia", "pmh_aplastic_anemia"]].eq(1).any(axis=1).astype("int64")

df_pe["pmh_clotting_disorders_gen"] = (
    df_pe[["pmh_clotting_disorder", "pmh_hypercoagulable_condition"]].eq(1).any(axis=1).astype("int64")
)

df_pe["pmh_thyroid_disease"] = df_pe[["pmh_hypothyroidism", "pmh_hyperthyroidism"]].eq(1).any(axis=1).astype("int64")

# 8. Export 


In [74]:
# Write the processed cohort to Excel (without the row index).
export_file = "PE_data/PE_processed_data.xlsx"
df_pe.to_excel(export_file, index=False)