Import the necessary python modules

In [2]:
# Import the necessary python modules
import math # python v.3.9.0
import os # python v.3.9.0

import numpy as np # v.1.23.0
import pandas as pd # v.1.4.3
from tqdm import tqdm # v.4.65.0

# Cohort selection


Selecting all ICU-stays that had Pulmonary Embolism (PE) as their main (APACHE) admission diagnosis:

In [3]:
# Load the complete eICU patient table from the data folder two levels above the notebook.
# low_memory=False makes pandas parse the file in one pass instead of in internal chunks.
df_pat_big = pd.read_csv("../../eICU_data/patient.csv", low_memory=False)

In [4]:
# List every distinct APACHE admission diagnosis label that contains the substring "Embolus"
[i for i in df_pat_big.apacheadmissiondx.value_counts().index.to_numpy() if "Embolus" in i]

['Embolus, pulmonary']

In [5]:
# Keep only the rows whose APACHE admission diagnosis is exactly "Embolus, pulmonary",
# collect their patientunitstayids (used as the cohort id list below) and display the cohort size.
df_pat_big_pe = df_pat_big[df_pat_big.apacheadmissiondx == "Embolus, pulmonary"].copy()
lst_pat = df_pat_big_pe.patientunitstayid.to_list()
len(lst_pat)

1697

&rarr; 1697 ICU-admissions with a primary diagnosis of PE

# Reducing the file-size


Because the original eICU files are very large, we save a reduced copy of every table that contains only the rows belonging to our PE patients. This speeds up data extraction and calculations down the line.

## Function

In [6]:
def make_reduced_files(import_path, export_path, notation, lst_ids):
    """
    Takes an import path (folder where all the eICU files are), an export path, a "notation" (will be added behind
    the file name as an identifier) and a list of ids, and then writes the reduced files of eICU for only that
    population (except for the hospital table).

    Each table is read from "<import_path><table>.csv" in chunks of 100000 rows, filtered to the rows whose
    patientunitstayid is in lst_ids and written to "<export_path><table>_<notation>.csv".

    :param import_path: String - Folder where all the eICU data is stored, should end with "/".

    :param export_path: String - Should only include the folder, not the filename, and should end with "/".
                        The folder is created if it does not exist yet.

    :param notation: String - Short notation that will be added to each file name for future identification.

    :param lst_ids: List - List of target population patientunitstayids.

    :return: None - Saves one abbreviated csv file per table to the specified export_path.
    """

    # List of all tables in the eICU database, except the hospital table as it is not connected to patientunitstayid
    lst_tables = ["admissionDrug", "admissionDx", "allergy", "apacheApsVar", "apachePatientResult", "apachePredVar",
                  "carePlanCareProvider", "carePlanEOL", "carePlanGeneral", "carePlanGoal", "carePlanInfectiousDisease", "customLab",
                  "diagnosis", "infusionDrug", "intakeOutput", "lab", "medication", "microLab", "note",
                  "nurseAssessment", "nurseCare", "nurseCharting", "pastHistory", "patient", "physicalExam",
                  "respiratoryCare", "respiratoryCharting", "treatment", "vitalAperiodic", "vitalPeriodic"]

    # Create the export folder up front: otherwise the first to_csv call fails only after a
    # whole table has already been streamed. The folder is derived from a real target file name
    # so that export_path keeps its plain string-prefix semantics.
    export_dir = os.path.dirname(f"{export_path}{lst_tables[0]}_{notation}.csv")
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)

    # Looping over all tables, selecting only the data pertaining to the cohort of interest and then saving these files in a specified location
    for table in tqdm(lst_tables):
        # The context manager closes the underlying file handle even if filtering raises
        with pd.read_csv(f"{import_path}{table}.csv", chunksize=100000, low_memory=False) as df_chunk:
            lst_dataframes = [chunk[chunk["patientunitstayid"].isin(lst_ids)] for chunk in df_chunk]

        df_small = pd.concat(lst_dataframes)
        df_small.to_csv(f"{export_path}{table}_{notation}.csv", index=False)

## Implementation

In [7]:
# Write the reduced tables for the PE cohort (ids in lst_pat): every table read from "../../eICU_data/"
# is saved as "PE_data/<table>_PE.csv".
make_reduced_files(
    import_path = "../../eICU_data/",
    export_path = "PE_data/",
    notation = "PE",
    lst_ids = lst_pat
)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [07:30<00:00, 15.02s/it]


To make this code work in your environment, the complete unpacked eICU data has to be located at the relative path "../../eICU_data/" in your project.

Now all of the data for our PE patients is in the folder at the relative path "PE_data/" and can be accessed from there.


# Data extraction


## Functions

In [8]:
def get_basic_patient_info(df_pat, lst_ids):
    """
    Builds a cleaned table of basic patient information for the requested patientunitstayids.

    Cleaning that is applied:
    - gender: "Unknown" and "Other" are set to NaN
    - age: "> 89" is replaced by 90 and the column is converted to numeric
    - admission/discharge weight: values <= 20 or >= 300 are set to NaN
    - admission height: values < 100 or > 210 are set to NaN
    - BMI: admissionweight / (admissionheight / 100) ** 2, values > 100 or < 12 are set to NaN

    :param df_pat: DataFrame - Patient dataframe of eICU (or abbreviated).

    :param lst_ids: List - List of the population patientunitstayids.

    :return: DataFrame with the columns patientunitstayid, gender, age, ethnicity, BMI,
             hospitaldischargestatus and unitdischargestatus.
    """

    needed_clms = ["patientunitstayid", "uniquepid", "gender", "age", "ethnicity", "hospitalid", "wardid",
                   "unittype", "apacheadmissiondx", "admissionheight", "admissionweight", "hospitaldischargestatus",
                   'hospitaladmittime24', "hospitaladmitsource", "hospitaldischargelocation", 'unitadmittime24',
                   'unitadmitsource', 'unitstaytype', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus', "unitvisitnumber"]
    output_clms = ["patientunitstayid", "gender", "age", "ethnicity", "BMI", "hospitaldischargestatus", "unitdischargestatus"]

    # work on a copy that only holds the cohort rows and the needed columns
    in_cohort = df_pat["patientunitstayid"].isin(lst_ids)
    df_info = df_pat.loc[in_cohort, needed_clms].copy()

    # gender: uninformative categories become missing values
    df_info["gender"] = df_info["gender"].replace({"Unknown": np.nan, "Other": np.nan})

    # age: the capped category "> 89" is mapped to 90 before the numeric conversion
    df_info["age"] = pd.to_numeric(df_info["age"].replace("> 89", "90"))

    # weights: discard values outside of (20, 300)
    for weight_clm in ("admissionweight", "dischargeweight"):
        implausible_weight = (df_info[weight_clm] <= 20) | (df_info[weight_clm] >= 300)
        df_info.loc[implausible_weight, [weight_clm]] = np.nan

    # height: discard values outside of [100, 210]
    implausible_height = (df_info["admissionheight"] < 100) | (df_info["admissionheight"] > 210)
    df_info.loc[implausible_height, ["admissionheight"]] = np.nan

    # BMI from the cleaned admission weight and height, again discarding implausible results
    height_in_m = df_info["admissionheight"] / 100
    df_info["BMI"] = df_info["admissionweight"] / height_in_m ** 2
    implausible_bmi = (df_info["BMI"] > 100) | (df_info["BMI"] < 12)
    df_info.loc[implausible_bmi, "BMI"] = np.nan

    return df_info[output_clms].copy()

In [9]:
def apply_group_pmh_subcat(x, df, clm_name):
    """
    Collects all distinct entries of one column for a single patientunitstayid and joins them with "|".

    :param x: patientunitstayid to look up.

    :param df: DataFrame - must contain the columns "patientunitstayid" and clm_name.

    :param clm_name: String - Name of the column whose entries are joined.

    :return: String - the distinct entries in order of first appearance, separated by "|"
             (empty string if the id has no rows).
    """
    rows_of_stay = df[df["patientunitstayid"] == x]
    distinct_entries = rows_of_stay[clm_name].unique().tolist()
    return "|".join(distinct_entries)

def apply_split_and_rejoin_for_output(str_pmh):
    """
    Strips the first six "/"-separated components from a path string and returns the remainder.

    :param str_pmh: String - "/"-separated path.

    :return: String with the remaining components re-joined by "/", or None if the path has
             six components or fewer.
    """
    remaining_parts = str_pmh.split("/")[6:]

    # nothing left after the first six components -> no output value
    if not remaining_parts:
        return None

    return "/".join(remaining_parts)

def get_pastHistory(df_pmh, lst_ids):
    """
    Receives the pastHistory Dataframe from eICU or abbreviated and a list of target patientunitstayids and returns
    a wide dataframe with one column per selected past-medical-history (PMH) category.

    For every patient and category the cell holds the "|"-joined path remainders (everything after the sixth
    path component) of all matching pastHistory entries, or 0 if the patient has no matching entry.
    The input dataframe is not modified.

    :param df_pmh: DataFrame - pastHistory dataframe from eICU. Needs the columns patientunitstayid,
                   pasthistorypath and pasthistoryvalue.

    :param lst_ids: List - list of patientunitstayids from the target population.

    :return: DataFrame with patientunitstayid as the first column and one column for each selected category of PMH
    """
    
    # Dictionary of prospective column names (key) and the string keys to the PMH-string-path of each disease (values)
    # NOTE(review): matching below is plain substring matching, so e.g. 'Insulin Dependent Diabetes' also matches
    # 'Non-Insulin Dependent Diabetes' paths and 'Cancer' also matches 'Cancer Therapy' paths - confirm this is intended.
    dict_subcat_clms = {
        'pmh_HT_with_treatment': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Hypertension Requiring Treatment'],
        'pmh_cancer': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Cancer'],
        'pmh_non_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Non-Insulin Dependent Diabetes'],
        'pmh_COPD': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'COPD'],
        'pmh_CHF': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Congestive Heart Failure'],
        'pmh_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Insulin Dependent Diabetes'],
        'pmh_arrhythmias': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Arrhythmias'],
        'pmh_hypothyroidism': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hypothyroidism'],
        'pmh_MI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Myocardial Infarction'],
        'pmh_strokes': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Strokes'],
        'pmh_renal_insuff': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Insufficiency'],
        'pmh_PCI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Procedural Coronary Intervention'],
        'pmh_card_valvular': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Valve disease'],
        'pmh_asthma': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Asthma'],
        'pmh_liver_cirrhosis': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Cirrhosis'],
        'pmh_renal_failure': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Failure'],
        'pmh_CA_bypass': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Coronary Artery Bypass'],
        'pmh_seizures': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Seizures'],
        'pmh_periph_vasc_disease': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Peripheral Vascular Disease'],
        'pmh_home_o2': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Home Oxygen'],
        'pmh_venous_thrombosis': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Venous Thrombosis'],
        'pmh_dementia': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Dementia'],
        'pmh_pacemaker': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Pacemaker'],
        'pmh_cancer_therapy': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Cancer Therapy'],
        'pmh_angina': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Angina'],
        'pmh_peptic_ulcer_disease': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Peptic Ulcer Disease'],
        'pmh_TIAs': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'TIAs'],
        'pmh_PFTs': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Pulmonary Function Tests'],
        'pmh_resp_failure': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Respiratory Failure'],
        'pmh_AICD': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'AICD'],
        'pmh_PE': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Pulmonary Embolism'],
        'pmh_RA': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Rheumatoid Arthritis'],
        'pmh_mmunosuppression_last_6m': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'Immunosuppression within past 6 months'],
        'pmh_chronic_kidney_stones': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Chronic Stone Disease'],
        'pmh_neuromusk_disease': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Neuromuscular Disease'],
        'pmh_restrictive_lung_disease': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Restrictive Disease'],
        'pmh_s_p_NTx': ['notes/Progress Notes/Past History/Organ Systems/Renal', 's_p Renal Transplant'],
        'pmh_HIV_only': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'HIV only'],
        'pmh_hemolytic _anemia': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Hemolytic Anemia'],
        'pmh_SLE': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'SLE'],
        'pmh_exercise_tolerance': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Exercise Tolerance'],
        'pmh_intracranial_mass': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Intracranial Mass'],
        'pmh_hyperthyroidism': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hyperthyroidism'],
        'pmh_recent_steroids_>10d': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Recent Steroid Use for > 10 days'],
        'pmh__petite_mal_seizures': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Seizures_petite mal seizures'],
        'pmh_s_p_LTx': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 's_p Liver Transplant'],
        'pmh_hypercoagulable_condition': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Hypercoagulable Condition'],
        'pmh_ITP': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'ITP'],
        'pmh_neurogenic_bladder': ['notes/Progress Notes/Past History/Organ Systems/Renal ', 'Neurogenic Bladder'],
        'pmh_sickle_cells': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Sickle Cell Disease'],
        'pmh_clotting_disorder': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Clotting Disorder'],
        'pmh_AIDS': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'AIDS'],
        'pmh_sarcoidosis': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Sarcoidosis'],
        'pmh_vasculitis': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Vasculitis'],
        'pmh_myeloproliferative_disease': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Myeloproliferative Disease'],
        'pmh_s_p_HTx': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 's_p Heart Transplant'],
        'pmh_aplastic_anemia': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Aplastic Anemia'],
        'pmh_hypercalcemia': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hypercalcemia'],
        'pmh_hypersplenism': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Hypersplenism'],
        'pmh_scleroderma': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Scleroderma'],
        'pmh_RTA': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'RTA'],
        'pmh_s_p_lungTx': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 's_p Lung Transplant'],
        "pmh_cushings": ["notes/Progress Notes/Past History/Organ Systems/Endocrine", "Cushing's Syndrome"],
        'pmh_dermato': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Dermato']
    }
    
    # reducing the general pastHistory df to reduce the computational load
    df_temp = df_pmh.loc[df_pmh["patientunitstayid"].isin(lst_ids), ["patientunitstayid", "pasthistorypath", "pasthistoryvalue"]].copy()

    # Normalise path fragments that contain "/" or parentheses, so that the "/"-split and the key matching
    # further down work. All replacements are applied to the working copy (not to the caller's df_pmh) and
    # as literal text: with regex=True the parentheses in e.g. "TIA(s)" form a regex group and the literal
    # text "TIA(s)" is never matched.
    dict_replacements = {
        "Hematology/Oncology": "Hematology-Oncology",
        "s/p": "s_p",
        "TIA(s)": "TIAs",
        "HIV (only)": "HIV only",
        "Recent Steroid Use (for > 10 days)": "Recent Steroid Use for > 10 days",
    }
    for old_fragment, new_fragment in dict_replacements.items():
        df_temp["pasthistorypath"] = df_temp["pasthistorypath"].str.replace(old_fragment, new_fragment, regex=False)

    # list for the future columns
    lst_columns = []

    # Iterate over the individual diseases and check whether a patient has this PMH or not then saving this as a column to the list
    for clm_name, key_phrases in tqdm(dict_subcat_clms.items()):
        key1 = key_phrases[0]
        key2 = key_phrases[1]

        # both keys have to occur in the path; literal matching, missing paths never match
        mask_match = (df_temp['pasthistorypath'].str.contains(key1, na=False, regex=False) &
                      df_temp['pasthistorypath'].str.contains(key2, na=False, regex=False))
        df_new_clm_raw = df_temp.loc[mask_match, ["patientunitstayid", "pasthistorypath"]].copy().drop_duplicates()

        # keep only the part of the path behind the category name
        df_new_clm_raw["output_pmh_path"] = df_new_clm_raw["pasthistorypath"].apply(apply_split_and_rejoin_for_output)

        df_new_clm_reference = df_new_clm_raw.copy()

        # one "|"-joined string per patient with all entries of this category
        df_new_clm_raw[clm_name] = df_new_clm_raw["patientunitstayid"].apply(lambda x: apply_group_pmh_subcat(x, df_new_clm_reference, "output_pmh_path"))

        df_pat_w_data = df_new_clm_raw.loc[:, ["patientunitstayid", clm_name]].copy().drop_duplicates()

        # patients without any entry in this category get a 0
        set_pat_w_data = set(df_new_clm_raw.patientunitstayid.unique())
        lst_pat_no_data = [x for x in lst_ids if x not in set_pat_w_data]

        df_pat_without_data = pd.DataFrame(lst_pat_no_data, columns=['patientunitstayid'])
        df_pat_without_data[clm_name] = 0

        df_clm_final = pd.concat([df_pat_w_data, df_pat_without_data])
        df_clm_final = df_clm_final.set_index("patientunitstayid")

        lst_columns.append(df_clm_final)

    # Concatenate all the columns and export this with the ids as another column  
    df_final = pd.concat(lst_columns, axis=1)
    df_final = df_final.reset_index()
    df_final = df_final[['patientunitstayid', 'pmh_HT_with_treatment', 'pmh_cancer',
       'pmh_non_insulin_dep_DM', 'pmh_COPD', 'pmh_CHF', 'pmh_insulin_dep_DM',
       'pmh_arrhythmias', 'pmh_MI', 'pmh_strokes', "pmh_hypothyroidism", 
       'pmh_renal_insuff', 'pmh_PCI', 'pmh_card_valvular', 'pmh_asthma',
       'pmh_liver_cirrhosis', 'pmh_renal_failure', 'pmh_CA_bypass',
       'pmh_seizures', 'pmh_periph_vasc_disease', 'pmh_home_o2',
       'pmh_venous_thrombosis', 'pmh_dementia', 'pmh_pacemaker',
       'pmh_cancer_therapy', 'pmh_angina', 
       'pmh_TIAs', 'pmh_resp_failure', 'pmh_AICD', 'pmh_PE',
       'pmh_neuromusk_disease', 'pmh_restrictive_lung_disease', 'pmh_hemolytic _anemia', 'pmh_intracranial_mass',
       'pmh_hyperthyroidism', 'pmh__petite_mal_seizures', 'pmh_hypercoagulable_condition', 'pmh_ITP', 
       'pmh_sickle_cells', 'pmh_clotting_disorder','pmh_aplastic_anemia', 'pmh_s_p_lungTx']]

    return df_final

In [10]:
def get_and_initial_clean_apacheApsVar(df_apsVar, lst_ids):
    """
    Extracts the APS variables of the target population from the apacheApsVar Dataframe and applies an
    initial cleaning: -1 values (which denote "no data was entered") in the eyes, motor, verbal, meds,
    temperature, respiratoryrate, heartrate and meanbp columns are set to np.nan and a GCS column
    (eyes + motor + verbal) is added. Patients of lst_ids without any row get a row filled with NaN.

    :param df_apsVar: DataFrame - apacheApsVar Dataframe from eICU or abbreviated.

    :param lst_ids: List - list of target patientunitstayids.

    :return: DataFrame with a patientunitstayid column and the data columns prefixed with "aps_".
             The index is not reset (rows of added patients are numbered from 0).
    """

    id_clm = "patientunitstayid"
    # columns in which -1 stands for missing data
    minus_one_clms = ['eyes', 'motor', 'verbal', 'meds', 'temperature',
                      'respiratoryrate', 'heartrate', 'meanbp']
    data_clms = ['dialysis'] + minus_one_clms

    # only keep the cohort rows and the columns of interest
    in_cohort = df_apsVar[id_clm].isin(lst_ids)
    df_aps = df_apsVar.loc[in_cohort, [id_clm] + data_clms].copy()

    # -1 -> NaN
    for clm in minus_one_clms:
        df_aps.loc[df_aps[clm] == -1, [clm]] = np.nan

    # Glasgow Coma Scale as the sum of its three components (NaN if one component is missing)
    df_aps["GCS"] = df_aps.eyes + df_aps.motor + df_aps.verbal

    # append an all-NaN row for every requested patient that has no row in the table
    ids_present = set(df_aps[id_clm].unique())
    ids_absent = [pat_id for pat_id in lst_ids if pat_id not in ids_present]
    df_absent = pd.DataFrame(np.nan, index=list(range(len(ids_absent))),
                             columns=[id_clm] + data_clms + ["GCS"])
    df_absent[id_clm] = ids_absent
    df_all = pd.concat([df_aps, df_absent])

    # prefix all data columns with "aps_"
    return df_all.rename(columns={clm: f"aps_{clm}" for clm in data_clms + ["GCS"]})

In [11]:
def get_cleaned_apachePredVar_basics(df_predvar, lst_ids):
    """
    Extracts a fixed set of columns of the apachePredVar Dataframe (or abbreviated) for the given
    patientunitstayids. Patients of lst_ids without any row get a row filled with NaN.

    :param df_predvar: DataFrame - apachePredVar Dataframe or abbreviated.

    :param lst_ids: List - List of patientunitstayids of the target population.

    :return: DataFrame with a patientunitstayid column and the data columns prefixed with "pred_".
             The index is not reset (rows of added patients are numbered from 0).
    """

    id_clm = "patientunitstayid"
    data_clms = ['thrombolytics', 'hepaticfailure', 'lymphoma', 'metastaticcancer', 'leukemia',
                 'midur', 'oobintubday1']

    # only keep the cohort rows and the columns of interest
    in_cohort = df_predvar[id_clm].isin(lst_ids)
    df_pred = df_predvar.loc[in_cohort, [id_clm] + data_clms].copy()

    # append an all-NaN row for every requested patient that has no row in the table
    ids_present = set(df_pred[id_clm].unique())
    ids_absent = [pat_id for pat_id in lst_ids if pat_id not in ids_present]
    df_absent = pd.DataFrame(np.nan, index=list(range(len(ids_absent))), columns=[id_clm] + data_clms)
    df_absent[id_clm] = ids_absent
    df_all = pd.concat([df_pred, df_absent])

    # prefix all data columns with "pred_"
    return df_all.rename(columns={clm: f"pred_{clm}" for clm in data_clms})

In [12]:
def get_cleaned_apachePatientResult_basics(df_apacheresult, lst_ids):
    """
    Extracts the APACHE IVa results (rows with apacheversion == "IVa") of the given patientunitstayids from the
    apachePatientResult Dataframe (or abbreviated). -1 values in acutephysiologyscore, apachescore,
    predictedicumortality and predictedhospitalmortality are set to np.nan. Patients of lst_ids without
    an IVa row get a row filled with NaN.

    :param df_apacheresult: DataFrame - apachePatientResult Dataframe or abbreviated.

    :param lst_ids: List - List of patientunitstayids of the target population.

    :return: DataFrame with a patientunitstayid column and the data columns.
             The index is not reset (rows of added patients are numbered from 0).
    """

    output_clms = ['patientunitstayid', 'acutephysiologyscore', 'apachescore', 'actualiculos',
                   'predictedhospitalmortality', "predictedicumortality", 'unabridgedhosplos']
    # columns in which -1 stands for missing data
    minus_one_clms = ['acutephysiologyscore', 'apachescore', "predictedicumortality", 'predictedhospitalmortality']

    # only keep the APACHE IVa rows of the cohort and the columns of interest
    is_version_iva = df_apacheresult["apacheversion"] == "IVa"
    in_cohort = df_apacheresult["patientunitstayid"].isin(lst_ids)
    df_result = df_apacheresult.loc[is_version_iva & in_cohort, output_clms].copy()

    # -1 -> NaN
    for clm in minus_one_clms:
        df_result.loc[df_result[clm] == -1, [clm]] = np.nan

    # append an all-NaN row for every requested patient that has no row left
    ids_present = set(df_result.patientunitstayid.unique())
    ids_absent = [pat_id for pat_id in lst_ids if pat_id not in ids_present]
    df_absent = pd.DataFrame(np.nan, index=list(range(len(ids_absent))), columns=output_clms)
    df_absent['patientunitstayid'] = ids_absent

    return pd.concat([df_result, df_absent])


In [1]:

def get_infusion_drugs_prototype(df_infusion, lst_ids, additional_dict=None, timeframe=(0, 1440)):
    """
    This function returns a dataframe indicating whether patients received certain medication classes as an
    infusion within a specified timeframe. Drug names are lower-cased and matched against the "|"-separated
    name patterns of each medication class; rows without a drug name never match.

    :param df_infusion: DataFrame - infusionDrug Dataframe from eICU. Needs the columns patientunitstayid,
                        infusionoffset and drugname.

    :param lst_ids: List - List of patientunitstayids of the target population.

    :param additional_dict: Dict - Additional/custom dictionary with the "clm_name": "drugname|drugname|drugname" format
                            (lower-case names, interpreted as a regular expression).

    :param timeframe: Tuple, default is (0, 1440) - (lower offset, upper offset), offset bounds (both inclusive).

    :return: DataFrame with patientunitstayid as the index and one 0/1 column per medication class.
    """

    lower_offset, upper_offset = timeframe

    # reducing the general df to reduce the computational load
    df_reduced = df_infusion.query("patientunitstayid in @lst_ids and @lower_offset <= infusionoffset <= @upper_offset").copy()

    df_reduced["drugname"] = df_reduced["drugname"].str.lower()

    dict_drugs = {
        "infusion_vaso_ino": 'epinephrine|adrenaline|norepinephrine|levophed|dobutamine|dobutrex|vasopressin|isoprotenerol|isuprel|phenylephrine|neo-synephrine|dopamine|milrinone',
        "infusion_thrombolytic": "alteplase|activase|tpa|altaplase|altepase"}

    if additional_dict is not None:
        dict_drugs.update(additional_dict)

    lst_columns = []
    
    # loop over the the drug groups and look at whether patients received this as an infusion or not
    for clm_name, drug_str in dict_drugs.items():

        # na=False: a missing drug name counts as "no match"; without it the mask contains NaN
        # and boolean indexing raises a ValueError
        mask_drug = df_reduced["drugname"].str.contains(drug_str, na=False)
        lst_pat_drug = df_reduced.loc[mask_drug, "patientunitstayid"].unique().tolist()
        set_pat_drug = set(lst_pat_drug)
        lst_pat_wo_drug = [i for i in lst_ids if i not in set_pat_drug]

        df_pat_w_data = pd.DataFrame(lst_pat_drug, columns=['patientunitstayid'])
        df_pat_w_data[clm_name] = 1

        df_pat_without_data = pd.DataFrame(lst_pat_wo_drug, columns=['patientunitstayid'])
        df_pat_without_data[clm_name] = 0

        df_clm_final = pd.concat([df_pat_w_data, df_pat_without_data])
        df_clm_final = df_clm_final.set_index("patientunitstayid")

        lst_columns.append(df_clm_final)

    df_final = pd.concat(lst_columns, axis=1)

    return df_final

In [14]:
def fast_clean_data(array_vitals):
    """
    Discard outliers that are > (median + 2 IQR) or < (median - 2 IQR)

    :param array_vitals: Array - numpy array of values (eg. mean BPs from a certain timeframe of a patient)

    :return: Array - numpy array of the values that lie within median +/- 2 IQR (bounds inclusive)

    """

    # quartiles of the patient's own values serve as the baseline for the outlier detection
    lower_quartile, median, upper_quartile = np.percentile(array_vitals, [25, 50, 75])
    two_iqr = 2 * (upper_quartile - lower_quartile)

    not_too_low = array_vitals >= (median - two_iqr)
    not_too_high = array_vitals <= (median + two_iqr)

    return array_vitals[not_too_low & not_too_high].copy()


def groupby_vitals_per_hour(x, agg_timeunit):
    """
    Discard outliers (apply fast_clean_data function ()) and then summarize the data in this timeframe as a
    single value per patient

    :param x: Series - values of one vital of one patient within one timeunit

    :param agg_timeunit: String, picklist - "median", "max" or "min"

    :return: the aggregated value of the cleaned data

    :raises ValueError: if agg_timeunit is not one of the supported options
    """
    dict_aggregators = {"median": np.median, "max": np.max, "min": np.min}

    # fail with a clear message instead of an UnboundLocalError further down
    if agg_timeunit not in dict_aggregators:
        raise ValueError(
            "agg_timeunit must be one of {}, got {!r}".format(list(dict_aggregators), agg_timeunit))

    np_hourly_vitals = x.to_numpy()

    cleaned_hourly = fast_clean_data(np_hourly_vitals)

    return dict_aggregators[agg_timeunit](cleaned_hourly)


def groupby_vitals_total(x, agg_total):
    """
    Summarize the data in this timeframe as a single value per patient

    :param x: Series - values of one vital of one patient (one value per timeunit)

    :param agg_total: String, picklist - "median", "max" or "min"

    :return: the aggregated value

    :raises ValueError: if agg_total is not one of the supported options
    """
    dict_aggregators = {"median": np.median, "max": np.max, "min": np.min}

    # fail with a clear message instead of an UnboundLocalError further down
    if agg_total not in dict_aggregators:
        raise ValueError(
            "agg_total must be one of {}, got {!r}".format(list(dict_aggregators), agg_total))

    np_total_vitals = x.to_numpy()

    return dict_aggregators[agg_total](np_total_vitals)


def fast_vitals_periodic(df_periodic, lst_ids, vital_name, realistic_bounds, offset, agg_timeunit="median",
                         agg_total="median", timeunit=60, temp_nurse_bool=False, temp_nursechart=None):
    """
        Takes the vitalPeriodic table of eICU, target patientunitstayids as a list, offset bounds, and realistic bounds and a vitalname and
        returns the aggregated and cleaned value for that offset timeframe. There are different options to choose from on how to aggregate that
        value. If looking at the temperature, there is the option to include all temperature measurements from the nurses charting (which often 
        primarily includes the temperature values of a patient. Pulsepressure is calculated from the respective BP_sys - BP_dia. (will 
        automatically use 20/300 (systolic) and 10/200 (diastolic) as bounds; realistic_bounds is not applied to the pulsepressure itself)
        Accepted vitalnames: temperature, sao2, heartrate, respiration, cvp, etco2, systemicsystolic, systemicdiastolic, systemicmean, pasystolic, 
        padiastolic, pamean, icp, pulsepressure

        The values are first binned into intervals of `timeunit` minutes, cleaned of outliers and aggregated per patient and bin
        (agg_timeunit) and then aggregated over all bins of a patient (agg_total).

        :param df_periodic: Dataframe - abbreviated (! unless a lot of free RAM) Dataframe of the vitalPeriodic table of eICU

        :param lst_ids: List - list of patientunitstayids of the target population

        :param vital_name: String - Column name of the vital 

        :param realistic_bounds: Tuple - realistic values of the vital in the form of (lower bound, upper bound) eg. (20,100) for fio2

        :param offset: Tuple - (lower time offest, upper time offset)

        :param agg_timeunit: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values per timeunit (if the
                offset duration is longer than the timeunit)

        :param agg_total: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values to the final value
        
        :param timeunit: Int, preset 60 - Number of minutes for the timeunit
        
        :param temp_nurse_bool: Boolean, preset False - Whether to use the nurse charting for the temperature
        
        :param temp_nursechart: Dataframe - NurseCharting Dataframe of the eICU if the vital_name is temperature and temp_nurse_bool is True

        :return: Dataframe with one row per id of lst_ids, the column patientunitstayid and the result column
                 "<vital_name>_<agg_total>_<lower offset>to<upper offset>_u<timeunit>" (NaN for patients without usable data)

        :raises ValueError: if temp_nurse_bool is True for the temperature but no temp_nursechart was passed
        """

    # reducing the general df to reduce the computational load
    df_reduced = df_periodic[df_periodic["patientunitstayid"].isin(lst_ids)].copy()

    # open the tuples of the bounds 
    lower_realistic_bound, upper_realistic_bound = realistic_bounds
    lower_offset, upper_offset = offset

    in_offset_window = ((df_reduced["observationoffset"] >= lower_offset) &
                        (df_reduced["observationoffset"] <= upper_offset))

    if vital_name == "pulsepressure":
        # pulsepressure is not a column of the periodic table: it is derived as systolic BP - diastolic BP.
        # This has to be handled before any access to df_reduced[vital_name], which would raise a KeyError.
        df_temp_initial = df_reduced.loc[(df_reduced["systemicsystolic"] >= 20) & (df_reduced["systemicsystolic"] <= 300) &
                                         (df_reduced["systemicdiastolic"] >= 10) & (df_reduced["systemicdiastolic"] <= 200) &
                                         in_offset_window,
                                         ["patientunitstayid", "observationoffset", "systemicsystolic", "systemicdiastolic"]].copy()

        df_temp = (df_temp_initial.assign(pulsepressure=df_temp_initial["systemicsystolic"] - df_temp_initial["systemicdiastolic"])
                   .drop(columns=["systemicsystolic", "systemicdiastolic"])
                   .dropna()
                   )
    else:
        # next reduction, incorporating the big offset bounds and the realistic bounds
        df_temp = df_reduced.loc[
            (df_reduced[vital_name] >= lower_realistic_bound) & (df_reduced[vital_name] <= upper_realistic_bound) &
            in_offset_window,
            ["patientunitstayid", "observationoffset", vital_name]].copy().dropna()

    # add the temperature information from the nurse chart if temperature is selected and this feature is used
    if vital_name == "temperature" and temp_nurse_bool:
        if temp_nursechart is None:
            raise ValueError("temp_nursechart has to be provided if temp_nurse_bool is True")

        # reduce the nurse charting to lessen the computational load
        df_nursechart = temp_nursechart[temp_nursechart["patientunitstayid"].isin(lst_ids)].copy()
        df_n_char_red = df_nursechart.loc[(df_nursechart.nursingchartoffset <= upper_offset) &
                                          (df_nursechart.nursingchartoffset >= lower_offset) &
                                          (df_nursechart.nursingchartcelltypevallabel == "Temperature") &
                                          (df_nursechart.nursingchartcelltypevalname.isin(
                                              ['Temperature (C)', 'Temperature (F)'])),
                                          ["patientunitstayid", "nursingchartoffset", "nursingchartcelltypevalname",
                                           "nursingchartvalue"]].copy()
        
        # adjust for the fact that some values are taken as °F and some as °C
        # (vectorised: a row-wise apply returns a DataFrame for an empty selection and breaks the assignment)
        df_n_char_red["nursingchartvalue"] = df_n_char_red["nursingchartvalue"].astype(float)
        is_fahrenheit = df_n_char_red["nursingchartcelltypevalname"] == 'Temperature (F)'
        df_n_char_red["temperature"] = df_n_char_red["nursingchartvalue"].mask(
            is_fahrenheit, (df_n_char_red["nursingchartvalue"] - 32) * (5 / 9))
        df_n_char_red = df_n_char_red.rename(columns={"nursingchartoffset": "observationoffset"})
        df_n_char_final = df_n_char_red.loc[(df_n_char_red[vital_name] >= lower_realistic_bound) & (
                    df_n_char_red[vital_name] <= upper_realistic_bound),
                                            ["patientunitstayid", "observationoffset",
                                             vital_name]].copy().reset_index(drop=True)

        df_temp = pd.concat([df_temp, df_n_char_final])

    # create bins for the timeunit in the given offset interval and then cut the column to these bins
    lst_bins = list(range(lower_offset, upper_offset + 1, timeunit))
    
    # the last (shorter) bin ends at the upper offset if the interval is not a multiple of the timeunit
    if (upper_offset - lower_offset) % timeunit != 0: 
        lst_bins.append(upper_offset)

    df_temp["observationoffset"] = pd.cut(df_temp["observationoffset"], bins=lst_bins, right=True, include_lowest=True)

    # group first by hour (cleaning the data of outliers) and then over the total offset span
    # observed=True: only patient/bin combinations that actually hold data are aggregated; the unobserved
    # combinations would only yield NaN rows, which are dropped directly afterwards anyway
    df_grouped_hours = (df_temp
                        .groupby(["patientunitstayid", "observationoffset"], observed=True)
                        .agg(lambda x: groupby_vitals_per_hour(x, agg_timeunit=agg_timeunit))
                        .reset_index()
                        .drop(columns=["observationoffset"])
                        .dropna()
                        .groupby(["patientunitstayid"])
                        .agg(lambda x: groupby_vitals_total(x, agg_total=agg_total))
                        .reset_index()
                        )

    # make a dataframe with a single column containing the ids and then merge the results to that
    df_pat = pd.DataFrame({'patientunitstayid': lst_ids})
    clm_name = "{}_{}_{}to{}_u{}".format(vital_name, agg_total, lower_offset, upper_offset, timeunit)
    df_pat_final = df_pat.merge(df_grouped_hours, on="patientunitstayid", how="left").rename(columns={vital_name: clm_name})

    return df_pat_final

In [15]:
def fast_vitals_combined(df_periodic, df_aperiodic, lst_ids, vital_name, realistic_bounds, offset, agg_timeunit="median",
                         agg_total="median", timeunit=60):
    """
    Aggregates a blood-pressure vital per patient from both the vitalPeriodic and the vitalAperiodic table of eICU.

    The measurements of both tables are reduced to the target patients, the offset window and a plausible value range,
    pooled, cut into bins of `timeunit` minutes, aggregated per bin (groupby_vitals_per_hour) and then aggregated over
    all bins of a patient (groupby_vitals_total).
    Possible vitalnames: systolic, diastolic, mean_bp, pulsepressure

    :param df_periodic: Dataframe - vitalPeriodic Dataframe of the eICU Database (abbreviated unless a lot of free RAM)

    :param df_aperiodic: Dataframe - vitalAperiodic Dataframe (abbreviated unless a lot of free RAM)

    :param lst_ids: List - list of patientunitstayids of the target population

    :param vital_name: String, picklist - options "systolic", "diastolic", "mean_bp", "pulsepressure"
    (any other value raises a KeyError)

    :param realistic_bounds: Tuple - realistic values of the vital in the form of (lower bound, upper bound), both
    inclusive, eg. (20, 250) for systolic. NOT applied for "pulsepressure": there the systolic values are limited to
    20-300 and the diastolic values to 10-200 before the difference is taken.

    :param offset: Tuple - (lower time offset, upper time offset), both inclusive

    :param agg_timeunit: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values per timeunit
    (if the offset-duration is longer than the timeunit); handed to groupby_vitals_per_hour

    :param agg_total: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values to the final value;
    handed to groupby_vitals_total

    :param timeunit: Int, preset 60 - Number of minutes for the timeunit

    :return: Dataframe with one row per entry of lst_ids and the columns patientunitstayid and
    "{vital_name}_{agg_total}_{lower_offset}to{upper_offset}_u{timeunit}" (NaN if no usable measurement was found)
    """

    # dictionary that sorts the vital_names to the respective names of the columns in the periodic and aperiodic dfs
    dict_vitalnames = {
        "systolic": ("systemicsystolic", "noninvasivesystolic"),
        "diastolic": ("systemicdiastolic", "noninvasivediastolic"),
        "mean_bp": ("systemicmean", "noninvasivemean")
    }

    # reducing the general dfs to reduce the computational load
    df_reduced_periodic = df_periodic[df_periodic["patientunitstayid"].isin(lst_ids)].copy()
    df_reduced_aperiodic = df_aperiodic[df_aperiodic["patientunitstayid"].isin(lst_ids)].copy()

    # open the tuples
    lower_realistic_bound, upper_realistic_bound = realistic_bounds
    lower_offset, upper_offset = offset

    def _in_offset_window(df):
        """Boolean mask of the measurements taken inside the (inclusive) offset window."""
        return (df["observationoffset"] >= lower_offset) & (df["observationoffset"] <= upper_offset)

    def _pulse_pressure(df, clm_systolic, clm_diastolic):
        """Pulse pressure (systolic - diastolic) of all rows with plausible systolic and diastolic values."""
        mask = ((df[clm_systolic] >= 20) & (df[clm_systolic] <= 300) &
                (df[clm_diastolic] >= 10) & (df[clm_diastolic] <= 200) &
                _in_offset_window(df))
        df_bp = df.loc[mask, ["patientunitstayid", "observationoffset", clm_systolic, clm_diastolic]].copy()
        return (df_bp
                .assign(pulsepressure=df_bp[clm_systolic] - df_bp[clm_diastolic])
                .drop(columns=[clm_systolic, clm_diastolic])
                .dropna()
                )

    def _single_vital(df, clm):
        """Values of one column inside the realistic bounds, with the column renamed to vital_name."""
        mask = (df[clm] >= lower_realistic_bound) & (df[clm] <= upper_realistic_bound) & _in_offset_window(df)
        return (df.loc[mask, ["patientunitstayid", "observationoffset", clm]]
                .copy()
                .dropna()
                .rename(columns={clm: vital_name})
                )

    # next reduction, incorporating the big offset bounds and the realistic bounds into each dataframe
    # calculate the pulsepressure if this is selected; via systolic BP - diastolic BP
    if vital_name == "pulsepressure":
        df_temp_periodic = _pulse_pressure(df_reduced_periodic, "systemicsystolic", "systemicdiastolic")
        df_temp_aperiodic = _pulse_pressure(df_reduced_aperiodic, "noninvasivesystolic", "noninvasivediastolic")

    # path for systolic, diastolic and mean_bp
    else:
        key_periodic, key_aperiodic = dict_vitalnames[vital_name]
        df_temp_periodic = _single_vital(df_reduced_periodic, key_periodic)
        df_temp_aperiodic = _single_vital(df_reduced_aperiodic, key_aperiodic)

    # one united dataframe with all the values of both sources
    df_temp_whole = pd.concat([df_temp_periodic, df_temp_aperiodic])

    # create bins for the timeunit in the given offset interval and then cut the column to these bins
    # (range already stops at upper_offset; a last, shorter bin is added if the window is no multiple of timeunit)
    lst_bins = list(range(lower_offset, upper_offset + 1, timeunit))

    if (upper_offset - lower_offset) % timeunit != 0:
        lst_bins.append(upper_offset)

    df_temp_whole["observationoffset"] = pd.cut(df_temp_whole["observationoffset"], bins=lst_bins, right=True,
                                                include_lowest=True)

    # group first by timeunit bin (cleaning the data of outliers) and then over the total offset span
    # observed=True: only (patient, bin) combinations that contain measurements are built; the unobserved
    # combinations of the categorical bin column would only yield NaN rows that are dropped right afterwards
    df_grouped_all = (df_temp_whole
                      .groupby(["patientunitstayid", "observationoffset"], observed=True)
                      .agg(lambda x: groupby_vitals_per_hour(x, agg_timeunit=agg_timeunit))
                      .reset_index()
                      .drop(columns=["observationoffset"])
                      .dropna()
                      .groupby(["patientunitstayid"])
                      .agg(lambda x: groupby_vitals_total(x, agg_total=agg_total))
                      .reset_index()
                      )

    # make a dataframe with a single column containing the ids and then merge the results to that
    df_pat = pd.DataFrame({'patientunitstayid': lst_ids})
    clm_name = "{}_{}_{}to{}_u{}".format(vital_name, agg_total, lower_offset, upper_offset, timeunit)
    df_pat_final = df_pat.merge(df_grouped_all, on="patientunitstayid", how="left").rename(columns={vital_name: clm_name})

    return df_pat_final



In [16]:
def apply_time_to_death_from_unit_admit(x):
    """
    Row-wise helper: time from ICU admission to death, in the unit of the length-of-stay columns
    (the offset in minutes is divided by 24*60, i.e. converted to days).

    :param x: row with the fields hospitaldischargestatus, unitdischargestatus, actualiculos, unabridgedhosplos
    and hospitaladmitoffset

    :return: Float - time to death; np.nan if the patient was discharged alive or the status is unknown
    """
    # if the patient did not die, return NaN
    if x.hospitaldischargestatus == "Alive":
        return np.nan

    # if the patient died in the ICU then the time on ICU is the time-to-death
    if x.unitdischargestatus == "Expired":
        return x.actualiculos

    # if the patient died in the hospital after leaving the ICU: the hospital stay starts hospitaladmitoffset
    # minutes relative to the ICU admission (eICU stores a NEGATIVE offset if the hospital admission preceded
    # the ICU admission), so the death happens at (hospital admission offset + hospital length of stay).
    # Subtracting the offset would lengthen the interval by the pre-ICU time instead of removing it.
    if x.hospitaldischargestatus == "Expired":
        return x.unabridgedhosplos + x.hospitaladmitoffset / (24 * 60)

    # missing/unknown discharge status
    return np.nan
    

def time_to_death_from_unit_admission(df_pat, df_apache_res, lst_ids):
    """
    Time from ICU admission to death for every stay of a cohort. The value is np.nan for patients who
    did not die and for irregular/unknown discharge information.

    :param df_pat: Dataframe - patient dataframe of the eICU or abbreviated

    :param df_apache_res: Dataframe - apachePatientResult dataframe of the eICU or abbreviated

    :param lst_ids: List - List of patientunitstayids

    return: Dataframe with 2 columns (patientunitstayids and time_to_death_unitadmit)
    """
    clms_patient = ["patientunitstayid", "hospitaldischargestatus", "unitdischargestatus", "hospitaladmitoffset"]
    clms_apache = ["patientunitstayid", 'actualiculos', 'unabridgedhosplos']

    # keep only the cohort rows and the columns needed for the calculation
    in_cohort_pat = df_pat.patientunitstayid.isin(lst_ids)
    in_cohort_apache = df_apache_res.patientunitstayid.isin(lst_ids)
    df_pat_red = df_pat.loc[in_cohort_pat, clms_patient].copy()
    df_apache_res_red = df_apache_res.loc[in_cohort_apache, clms_apache].copy()

    # one row per stay with the status, offset and length-of-stay columns side by side
    df_combined = (df_pat_red
                   .merge(df_apache_res_red, how="left", on="patientunitstayid")
                   .drop_duplicates())

    # row-wise calculation of the time-to-death
    df_combined["time_to_death_unitadmit"] = df_combined.apply(apply_time_to_death_from_unit_admit, axis=1)

    return df_combined[["patientunitstayid", "time_to_death_unitadmit"]].copy()

## Demographic Data

In [17]:
# Load the patient table from the PE_data folder
# (presumably the reduced, PE-only copy written by make_reduced_files - verify)
df_pat = pd.read_csv("PE_data/patient_PE.csv", low_memory=False)

In [18]:
# Basic patient information of the PE cohort (lst_pat), one of the building blocks of df_pe
df_patinfo_PE = get_basic_patient_info(df_pat, lst_pat)

In [19]:
# Process the result:
# - ethnicity: only the 2 most frequent ethnicities stay distinct categories, everything else (incl. missing
#   values) becomes "Other/Unknown"
# - death columns: int-binarized (Expired = 1, Alive = 0)
lst_top_ethnicities = df_patinfo_PE.ethnicity.value_counts().index[:2].to_list()
mask_top_ethnicity = df_patinfo_PE.ethnicity.isin(lst_top_ethnicities)
df_patinfo_PE.loc[~mask_top_ethnicity, "ethnicity"] = "Other/Unknown"
df_patinfo_PE.ethnicity = df_patinfo_PE.ethnicity.fillna("Other/Unknown")

dict_discharge_status = {"Expired": 1, "Alive": 0}
for clm_status in ["hospitaldischargestatus", "unitdischargestatus"]:
    df_patinfo_PE[clm_status] = df_patinfo_PE[clm_status].map(dict_discharge_status)

## Comorbidities 

In [20]:
# Load the pastHistory table from the PE_data folder
df_pmh = pd.read_csv("PE_data/pastHistory_PE.csv", low_memory=False)

In [21]:
# Past medical history (comorbidities) of the PE cohort; the resulting pmh_* columns are used further below
df_pmhinfo = get_pastHistory(df_pmh, lst_pat)

100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [00:03<00:00, 20.22it/s]


In [22]:
# Start the final dataframe: demographic data with the comorbidities attached per ICU stay
df_pe = df_patinfo_PE.merge(df_pmhinfo, how="left", on="patientunitstayid")

## APACHE IVa, APS and associated variables

In [23]:
# Load the three APACHE tables (APS variables, prediction variables, patient results) from the PE_data folder
df_apsVar = pd.read_csv("PE_data/apacheApsVar_PE.csv", low_memory=False)
df_predVar = pd.read_csv("PE_data/apachePredVar_PE.csv", low_memory=False)
df_apacheresult = pd.read_csv("PE_data/apachePatientResult_PE.csv", low_memory=False)

In [24]:
# Extract the APACHE information of the PE cohort from each of the three tables
df_apsvarinfo = get_and_initial_clean_apacheApsVar(df_apsVar, lst_pat)
df_predvarinfo = get_cleaned_apachePredVar_basics(df_predVar, lst_pat)
df_apacheresult_info = get_cleaned_apachePatientResult_basics(df_apacheresult, lst_pat)

In [25]:
# Merge the results dataframes to the growing final dataframe (APS variables, prediction variables, results)
for df_apache_part in (df_apsvarinfo, df_predvarinfo, df_apacheresult_info):
    df_pe = df_pe.merge(right=df_apache_part, how="left", on="patientunitstayid")

## Infusions

In [26]:
# Load the infusionDrug table from the PE_data folder
df_infusions = pd.read_csv("PE_data/infusionDrug_PE.csv", low_memory=False)

In [27]:
# Infusion information of the PE cohort; reset_index turns the index into a regular column so that the
# result can be merged on patientunitstayid (presumably the index holds the patientunitstayid - verify)
df_infusions_info = get_infusion_drugs_prototype(df_infusions, lst_pat).reset_index()

In [28]:
# Attach the infusion information to the growing final dataframe
df_pe = pd.merge(left=df_pe, right=df_infusions_info, how="left", on="patientunitstayid")

## Own vitals

In [29]:
# Load the tables the vitals are calculated from (vitalPeriodic, vitalAperiodic, nurseCharting)
df_periodic = pd.read_csv("PE_data/vitalPeriodic_PE.csv", low_memory=False)
df_a_periodic = pd.read_csv("PE_data/vitalAperiodic_PE.csv", low_memory=False)
df_nurseCharting = pd.read_csv("PE_data/nurseCharting_PE.csv", low_memory=False)

In [30]:
# per vital: [realistic bounds, aggregation over the whole window ("max"/"min")]
dict_vitals = {
    "heartrate": [(20, 200), "max"],
    "temperature": [(32, 43), "min"],
    "respiration": [(3, 80), "max"],
    "sao2": [(50, 100), "min"]
}

dict_vitals_combined = {
    "systolic": [(20, 250), "min"]
}

lst_clms_pesi_vitals = []

# for the "periodic" vitals
for key, values in tqdm(dict_vitals.items()):
    realistic, direction = values

    clm_name = f"pesi_{key}_own"
    # name of the result column as built by the vitals function for offset (0, 1440) and timeunit 30
    internal_clm_name = f"{key}_{direction}_0to1440_u30"

    # only the temperature call additionally hands over the nurse charting table
    dict_kwargs_temp = ({"temp_nurse_bool": True, "temp_nursechart": df_nurseCharting}
                        if key == "temperature" else {})

    df_temp = fast_vitals_periodic(df_periodic=df_periodic,
                                   lst_ids=lst_pat,
                                   vital_name=key,
                                   realistic_bounds=realistic,
                                   offset=(0, 1440),
                                   agg_total=direction,
                                   timeunit=30,
                                   **dict_kwargs_temp)

    df_clm = df_temp.rename(columns={internal_clm_name: clm_name})
    lst_clms_pesi_vitals.append(df_clm)

# for vital "systolic" that uses both the periodic and aperiodic datafiles
for key, values in tqdm(dict_vitals_combined.items()):
    realistic, direction = values

    clm_name = f"pesi_{key}_own"
    internal_clm_name = f"{key}_{direction}_0to1440_u30"

    df_temp = fast_vitals_combined(df_periodic=df_periodic,
                                   df_aperiodic=df_a_periodic,
                                   lst_ids=lst_pat,
                                   vital_name=key,
                                   realistic_bounds=realistic,
                                   offset=(0, 1440),
                                   agg_total=direction,
                                   timeunit=30)

    df_clm = df_temp.rename(columns={internal_clm_name: clm_name})
    lst_clms_pesi_vitals.append(df_clm)

# put the single-vital dataframes side by side, aligned on the patientunitstayid
lst_clms_pesi_vitals_indexed = [df.set_index("patientunitstayid") for df in lst_clms_pesi_vitals]
df_final_vitals_pesi = pd.concat(lst_clms_pesi_vitals_indexed, axis=1).reset_index()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:31<00:00,  7.79s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.96s/it]


In [31]:
# Attach the PESI vitals to the growing final dataframe
df_pe = pd.merge(left=df_pe, right=df_final_vitals_pesi, how="left", on="patientunitstayid")

In [32]:
# general vitals (median over the window): respiratory rate, SaO2, HR, temperature, Sys, mean, dias BP
# per vital: realistic bounds
dict_vitals = {
    "heartrate": (20, 200),
    "temperature": (32, 43),
    "respiration": (3, 80),
    "sao2": (50, 100),
}

dict_vitals_combined = {
    "systolic": (20, 250),
    "diastolic": (5, 180),
    "mean_bp": (10, 200)
}

lst_clms_vitals_general = []

# for the "periodic" vitals
for key, values in tqdm(dict_vitals.items()):

    # only the temperature call additionally hands over the nurse charting table
    dict_kwargs_temp = ({"temp_nurse_bool": True, "temp_nursechart": df_nurseCharting}
                        if key == "temperature" else {})

    df_temp = fast_vitals_periodic(df_periodic=df_periodic,
                                   lst_ids=lst_pat,
                                   vital_name=key,
                                   realistic_bounds=values,
                                   offset=(0, 1440),
                                   agg_total="median",
                                   timeunit=30,
                                   **dict_kwargs_temp)

    lst_clms_vitals_general.append(df_temp)

# for the vitals (the blood pressures) that use both the periodic and aperiodic datafiles
for key, values in tqdm(dict_vitals_combined.items()):

    df_temp = fast_vitals_combined(df_periodic=df_periodic,
                                   df_aperiodic=df_a_periodic,
                                   lst_ids=lst_pat,
                                   vital_name=key,
                                   realistic_bounds=values,
                                   offset=(0, 1440),
                                   agg_total="median",
                                   timeunit=30)

    lst_clms_vitals_general.append(df_temp)

# put the single-vital dataframes side by side, aligned on the patientunitstayid
lst_clms_vitals_general_indexed = [df.set_index("patientunitstayid") for df in lst_clms_vitals_general]
df_final_vitals_general = pd.concat(lst_clms_vitals_general_indexed, axis=1).reset_index()

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:31<00:00,  7.88s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:20<00:00,  6.99s/it]


In [33]:
# Attach the general vitals to the growing final dataframe
df_pe = pd.merge(left=df_pe, right=df_final_vitals_general, how="left", on="patientunitstayid")

## Time-to-death

In [34]:
# Time from ICU admission to death per stay (NaN for survivors), based on the patient table and the
# raw apachePatientResult table
df_ttd_pe = time_to_death_from_unit_admission(df_pat=df_pat, 
                                              df_apache_res=df_apacheresult, 
                                              lst_ids=lst_pat)

In [35]:
# Attach the time-to-death to the growing final dataframe
df_pe = pd.merge(left=df_pe, right=df_ttd_pe, how="left", on="patientunitstayid")

# PESI score calculation

## Functions

In [36]:
def map_vitals_from_two_sources(x, df_ref_data, clm_aps, clm_own, threshold, aps_midpoint, side):
    """
    Decide whether vitals are above or below a certain threshold based on both the APS extreme vitals and under certain conditions based on my own calculations:
    The APS vitals are extremes from a certain midpoint. However, this goes both ways. E.g. heartrate is measured as an extreme from the midpoint 75 in the APS.
    This means that theoretically a recorded APS heartrate of 38 does not exclude that this patient had a heartrate of 111 at some point as 38 is more extreme.
    Therefore, in case the APS value does not cross the threshold itself but lies beyond the "counter-threshold"
    (the threshold mirrored to the opposite side of the midpoint), my own value decides.

    Decision rules:
    - both values missing -> NaN
    - only one value available -> that value is compared to the threshold
    - both available -> 1 if the APS value crosses the threshold, or if the APS value lies beyond the
      counter-threshold and my own value crosses the threshold; 0 otherwise

    :param x: x - lambda x parameter (index label of the patient in df_ref_data)
    :param df_ref_data: Dataframe - indexed by the labels handed over as x
    :param clm_aps: String - column name of the aps vitals
    :param clm_own: String - column name of my vitals
    :param threshold: Int/Float - threshold, is NOT included (is used with > / < and NOT >= / <=! )
    :param aps_midpoint: Int/Float - midpoint value the aps is based on
    :param side: String, picklist - "below" or "above"
    :return: value for mapping - 1, 0 or np.nan (np.nan if no data is available or side is not in the picklist)
    """

    if side not in ("above", "below"):
        return np.nan

    # calculating the counterthreshold: as far away from the midpoint as the threshold, but on the side
    # opposite to the one we are looking at
    distance_to_midpoint = abs(threshold - aps_midpoint)
    if side == "above":
        counterthresh = aps_midpoint - distance_to_midpoint
    else:
        counterthresh = aps_midpoint + distance_to_midpoint

    def crosses_threshold(value):
        """True if the value is strictly above/below the threshold (depending on side)."""
        if side == "above":
            return value > threshold
        return value < threshold

    # values for 1 patient as this function is applied row-wise with 1 row representing 1 patient
    value_aps = df_ref_data.loc[x, clm_aps]
    value_own = df_ref_data.loc[x, clm_own]
    aps_missing = np.isnan(value_aps)
    own_missing = np.isnan(value_own)

    # if there is no data in both calculations return NaN
    if aps_missing and own_missing:
        return np.nan

    # if there is no APS data (but my own data), use only my calculations
    if aps_missing:
        return 1 if crosses_threshold(value_own) else 0

    # if there is no own data (but APS data), use only the APS data
    if own_missing:
        return 1 if crosses_threshold(value_aps) else 0

    # if there is data in both the APS column and my calculations, use first the APS data ...
    if crosses_threshold(value_aps):
        return 1

    # ... and my own calculations only if the APS extreme lies on the other side of the midpoint, far
    # enough out that it could have hidden a value crossing the threshold
    if side == "above":
        aps_beyond_counterthresh = value_aps < counterthresh
    else:
        aps_beyond_counterthresh = value_aps > counterthresh

    if aps_beyond_counterthresh and crosses_threshold(value_own):
        return 1

    # every remaining case is a definite "threshold not crossed"
    return 0


## Calculation

In [37]:
# the demographic components
# age is taken over unchanged; gender is coded as Male = 1, Female = 0 (any other value becomes NaN)
df_pe["PESI_age"] = df_pe["age"]
df_pe["PESI_gender"] = df_pe["gender"].map({"Male": 1, "Female":0})

In [38]:
### the pmh components:
# A pmh entry counts as "present" only if it is recorded AND not 0. A plain `!= 0` is also True for
# missing values (NaN != 0), which would silently count missing history as disease.
# If all source columns of a component are missing, the component is set to NaN.

# chronic pulmonary disease: if the patient has COPD or asthma or restrictive lung disease or uses home-O2 or is s/p LuTx
lst_clms_pulm = ['pmh_COPD', 'pmh_asthma', 'pmh_home_o2', 'pmh_restrictive_lung_disease', 'pmh_s_p_lungTx']
df_pulm_sources = df_pe[lst_clms_pulm]
mask_pulm_present = (df_pulm_sources.notna() & (df_pulm_sources != 0)).any(axis=1)

df_pe["PESI_pulm"] = 0
df_pe.loc[mask_pulm_present, ["PESI_pulm"]] = 1
df_pe.loc[df_pulm_sources.isna().all(axis=1), ["PESI_pulm"]] = np.nan

# heart failure
mask_hf_present = df_pe["pmh_CHF"].notna() & (df_pe["pmh_CHF"] != 0)
df_pe["PESI_hf"] = mask_hf_present.astype(int)
df_pe.loc[df_pe["pmh_CHF"].isna(), ["PESI_hf"]] = np.nan


# cancer: if the patient has any type of cancer or has received cancer therapy in the past
lst_clms_cancer_pmh = ["pmh_cancer_therapy", "pmh_cancer"]
lst_clms_cancer_pred = ["pred_metastaticcancer", "pred_lymphoma", "pred_leukemia"]
df_cancer_pmh = df_pe[lst_clms_cancer_pmh]
mask_cancer_present = ((df_cancer_pmh.notna() & (df_cancer_pmh != 0)).any(axis=1) |
                       (df_pe[lst_clms_cancer_pred] > 0).any(axis=1))

df_pe["PESI_cancer"] = mask_cancer_present.astype(int)
df_pe.loc[df_pe[lst_clms_cancer_pmh + lst_clms_cancer_pred].isna().all(axis=1), ["PESI_cancer"]] = np.nan


In [39]:
# creating a reference dataframe (indexed by patientunitstayid) for the row-wise lookups below
df_pe_ref = df_pe.copy().set_index("patientunitstayid")

# Heartrate component (heartrate >= 110/min), the APS midpoint is at 75
# the threshold is exclusive, therefore 109 is handed over (equals ">= 110" for whole-number values only)
df_pe["PESI_pulse"] = df_pe["patientunitstayid"].map(
    lambda x: map_vitals_from_two_sources(x, df_pe_ref, "aps_heartrate", "pesi_heartrate_own", 109, 75, "above"))

# Systolic BP component (systolic BP < 100 mmHg): Only from our own calculations as this is not included in the APS
df_pe["PESI_systolic"] = df_pe.pesi_systolic_own.map(lambda x: 1 if x < 100 else 0)

# Temperature component (Temp < 36°C), the APS midpoint handed over is 38
df_pe["PESI_temp"] = df_pe["patientunitstayid"].map(
    lambda x: map_vitals_from_two_sources(x, df_pe_ref, "aps_temperature", "pesi_temperature_own", 36, 38, "below"))

# Respiratory rate component (RR >= 30/min), the APS midpoint is 19
# the threshold is exclusive, therefore 29 is handed over (equals ">= 30" for whole-number values only)
df_pe["PESI_resp"] = df_pe["patientunitstayid"].map(
    lambda x: map_vitals_from_two_sources(x, df_pe_ref, "aps_respiratoryrate", "pesi_respiration_own", 29, 19, "above"))

# SpO2 component (SpO2 < 90%): Only from our own calculations as this is not included in the APS
df_pe["PESI_o2"] = df_pe.pesi_sao2_own.map(lambda x: 1 if x < 90 else 0)

# AMS component (GCS verbal < 5), based on the aps_verbal column; missing values are mapped to 0 here
df_pe["PESI_ams"] = df_pe.aps_verbal.map(lambda x: 1 if x < 5 else 0)

# setting any missing values in the vitals to 0, in line with the derivation/validation study of the PESI (Aujesky et al. (2005)) 
df_pe.PESI_pulse = df_pe.PESI_pulse.fillna(value=0)
df_pe.PESI_systolic = df_pe.PESI_systolic.fillna(value=0)
df_pe.PESI_temp = df_pe.PESI_temp.fillna(value=0)
df_pe.PESI_resp = df_pe.PESI_resp.fillna(value=0)
df_pe.PESI_o2 = df_pe.PESI_o2.fillna(value=0)

In [40]:
## PESI score and PESI classes
# points per PESI component (age enters with its value in years)
dict_pesi_points = {
    "PESI_gender": 10,
    "PESI_cancer": 30,
    "PESI_hf": 10,
    "PESI_pulm": 10,
    "PESI_pulse": 20,
    "PESI_systolic": 30,
    "PESI_resp": 20,
    "PESI_temp": 20,
    "PESI_ams": 60,
    "PESI_o2": 20,
}


def calc_pesi_score(row):
    """PESI score of one patient: age plus the points of every component (NaN if any part is missing)."""
    score = row["age"]
    for clm_component, points in dict_pesi_points.items():
        score = score + row[clm_component] * points
    return score


# calculate the PESI score
df_pe["PESI_score"] = df_pe.apply(calc_pesi_score, axis=1)

# calculate the PESI classes (class 5: > 125, class 1: < 66, classes 4-2 via inclusive score ranges)
df_pe["PESI_class"] = np.nan
df_pe.loc[df_pe.PESI_score > 125, ["PESI_class"]] = 5
for pesi_class, (lower_score, upper_score) in {4: (106, 125), 3: (86, 105), 2: (66, 85)}.items():
    mask_class = (df_pe.PESI_score >= lower_score) & (df_pe.PESI_score <= upper_score)
    df_pe.loc[mask_class, ["PESI_class"]] = pesi_class
df_pe.loc[df_pe.PESI_score < 66, ["PESI_class"]] = 1

# sPESI score calculation

In [41]:
# sPESI age component (> 80 years old)
df_pe["sPESI_age"] = df_pe["PESI_age"].map(lambda age: 1 if age > 80 else 0)


# components taken over from the PESI directly
for component in ["systolic", "pulse", "o2", "cancer"]:
    df_pe["sPESI_" + component] = df_pe["PESI_" + component]

In [42]:
# chronic cardiopulmonary disease: if the patient had either chronic pulmonary disease or a range of cardiac PMH
# A pmh entry counts as "present" only if it is recorded AND not 0: a plain `!= 0` is also True for missing
# values (NaN != 0) and would count patients without recorded history as having cardiopulmonary disease.
lst_clms_cardiac_pmh = ["pmh_MI", "pmh_pacemaker", "pmh_AICD", "pmh_CA_bypass"]
df_cardiac_pmh = df_pe[lst_clms_cardiac_pmh]
mask_cardiac_pmh = (df_cardiac_pmh.notna() & (df_cardiac_pmh != 0)).any(axis=1)

mask_cardiopulm = ((df_pe["PESI_pulm"] > 0) |
                   (df_pe["PESI_hf"] > 0) |
                   (df_pe["pred_midur"] > 0) |
                   mask_cardiac_pmh)

df_pe["sPESI_cardiopulm"] = mask_cardiopulm.astype(int)

In [43]:
# calculation of the sPESI score: sum of the six components (NaN if any component is missing)
lst_clms_spesi = ["sPESI_age", "sPESI_cancer", "sPESI_cardiopulm", "sPESI_pulse", "sPESI_systolic", "sPESI_o2"]

df_pe["sPESI_score"] = df_pe[lst_clms_spesi[0]]
for clm_spesi in lst_clms_spesi[1:]:
    df_pe["sPESI_score"] = df_pe["sPESI_score"] + df_pe[clm_spesi]

# ICU-sPESI calculation

In [44]:
# calculation of the ICU-sPESI score: sPESI score plus the AMS, vasopressor/inotrope infusion and
# day-1 intubation components
lst_clms_icu_spesi = ["sPESI_score", "PESI_ams", "infusion_vaso_ino", "pred_oobintubday1"]


def calc_icu_spesi_score(row):
    """Row-wise sum of the ICU-sPESI components (NaN if any component is missing)."""
    score = row[lst_clms_icu_spesi[0]]
    for clm_component in lst_clms_icu_spesi[1:]:
        score = score + row[clm_component]
    return score


df_pe["ICU_sPESI_score"] = df_pe.apply(calc_icu_spesi_score, axis=1)

# Patient exclusion

In [45]:
# (rows, columns) before any exclusion
df_pe.shape

(1697, 108)

&rarr; 1697 patients total

In [46]:
# excluding patients with missing data in the APACHE-IV score, regarding in-hospital mortality, gender and the GCS verbal component
lst_clms_required = ["gender", "apachescore", "aps_verbal", "hospitaldischargestatus"]
df_pe = df_pe.dropna(subset=lst_clms_required)

In [47]:
# (rows, columns) after the exclusion of patients with missing data
df_pe.shape

(1427, 108)

&rarr; 270 patients excluded due to missing data

In [48]:
# excluding patients with an age < 18 (rows with a missing age are dropped as well, as NaN >= 18 is False)
mask_adult = df_pe["age"] >= 18
df_pe = df_pe.loc[mask_adult].copy()

In [49]:
# (rows, columns) after the exclusion of patients younger than 18
df_pe.shape

(1424, 108)

&rarr; 3 patients excluded due to age < 18 years old

# Further data processing

## Functions


In [50]:
def get_dict_for_categorical_with_0(df, column, printoption=False):
    """
    Builds a simple categorical encoding for one column: the non-zero values are numbered 1 to n in the
    order of their frequency (most frequent value = 1), the value 0 is always mapped to 0.

    :param df: Dataframe
    :param column: String - column name
    :param printoption: Boolean - whether to print the resulting dictionary
    :return: Dictionary - {value: category number}
    """

    # value_counts sorts by descending frequency and ignores missing values
    values_by_frequency = df[column].value_counts().index
    nonzero_values = [value for value in values_by_frequency if value != 0]

    dict_to_cat = {value: code for code, value in enumerate(nonzero_values, start=1)}
    dict_to_cat[0] = 0

    if printoption == True:
        print(dict_to_cat)

    return dict_to_cat

In [51]:
def map_pmh_cancer(x):
    """
    Maps the raw cancer string of one patient to a cancer group.

    The cancer site is read from the "Cancer-Primary Site/<site>" entry of the "|"-separated string
    ("multiple" if there is more than one such entry, "other" if there is none) and then grouped.

    :param x: 0 (no cancer) or String - raw pmh cancer entry
    :return: 0 if the patient does not have cancer, otherwise the cancer group as a String
             (a site that is not part of the grouping dictionary raises a KeyError)
    """
    # grouping of the cancer sites
    # the keys of the dictionary represent all unique values for this column for this cohort
    dict_cancer_sites = {
        0: "No_cancer",
        'other': "other",
        'breast': "Breast",
        'lung': "Respiratory",
        'colon': "GI",
        'prostate': "Genitourinary",
        'uterus': "Genitourinary",
        'bladder': "Genitourinary",
        'pancreas - adenocarcinoma': "GI",
        'melanoma': "other",
        'brain': "other",
        'kidney': "Genitourinary",
        'ovary': "Genitourinary",
        'esophagus': "GI",
        'bile duct': "GI",
        'liver': "GI",
        'multiple': "other",
        'bone': "other",
        'head and neck': "other",
        'sarcoma': "other",
        'unknown': "other",
        'testes': "Genitourinary",
        'stomach': "GI"
    }

    # the patient does not have cancer
    if x == 0:
        return 0

    site_marker = "Cancer-Primary Site/"

    if site_marker not in x:
        cancer_site = "other"
    else:
        # every "|"-separated entry that names a primary site
        site_entries = [entry for entry in x.split("|") if site_marker in entry]
        if len(site_entries) > 1:
            cancer_site = "multiple"
        else:
            # the site is the part directly behind the first "/"
            cancer_site = site_entries[0].split("/")[1]

    return dict_cancer_sites[cancer_site]

## Data processing

In [52]:
# int-binarize the gender column (Female = 0, Male = 1; any other value becomes NaN)
# NOTE: this overwrites the column in place - running the cell twice maps the already mapped values to NaN
df_pe.gender = df_pe.gender.map({"Female":0, "Male":1})
# remove the stray space from the column name
df_pe = df_pe.rename(columns={"pmh_hemolytic _anemia": "pmh_hemolytic_anemia"})

In [53]:
# Binarize certain PMH columns (in place): every recorded non-zero entry becomes 1, missing values stay missing
lst_clms_binary = ["pmh_HT_with_treatment", "pmh_MI", "pmh_angina", "pmh_strokes", "pmh_periph_vasc_disease", "pmh_CA_bypass",
                  "pmh_PCI", "pmh_pacemaker", "pmh_AICD", "pmh_venous_thrombosis", "pmh_asthma", "pmh_hemolytic_anemia",
                  "pmh_aplastic_anemia", "pmh_clotting_disorder", "pmh_hypercoagulable_condition", "pmh_hypothyroidism", "pmh_hyperthyroidism", 
                  "pmh_CHF", "pmh_restrictive_lung_disease", "pmh_card_valvular", 'pmh_home_o2', 'pmh_seizures', 'pmh_dementia', 'pmh_neuromusk_disease',
                  'pmh_intracranial_mass', 'pmh_sickle_cells', 'pmh_liver_cirrhosis', 'pmh_ITP']

for i in lst_clms_binary:
    df_pe.loc[(df_pe[i] != 0) & (df_pe[i].notna()), i] = 1


# columns that keep their detailed values and get an additional binary column
dict_clms_to_binary = {
    "pmh_cancer": "pmh_cancer_binary",
    "pmh_insulin_dep_DM" : "pmh_diabetes_binary",
    "pmh_COPD": "pmh_COPD_binary",
    "pmh_arrhythmias": "pmh_arrhythmias_binary",
    "pmh_renal_insuff": "pmh_renal_insuff_binary",
    "pmh_renal_failure": "pmh_renal_failure_binary"   
}

for orig_clm, new_clm in dict_clms_to_binary.items():
    # NaN never compares equal to anything, so `x != np.nan` is True for every x (including NaN itself);
    # missing values have to be detected with pd.notna and are coded as 0 here
    df_pe[new_clm] = df_pe[orig_clm].map(lambda x: 1 if (pd.notna(x) and x != 0) else 0)


In [54]:
# group the cancer column depending on the site of the cancer (for details see function above)
df_pe["pmh_cancer_grouped"] = df_pe["pmh_cancer"].map(map_pmh_cancer)

In [55]:
# grouping the diabetes column depending on the type of diabetes (insulin dependent, only medication dependent and without any medication)
def map_pmh_diabetes(x):
    """
    Process one raw entry of the pmh_insulin_dep_DM column.

    :param x: 0, a raw diabetes description string, or a missing value (NaN/None).

    :return: 0 for 0; "dm_without_treatment", "medication_only" or "including_Insulin" for the
             matching strings; None for a string matching none of the rules; NaN for a missing entry.
    """
    # Missing entries stay missing instead of raising a TypeError on the substring tests below
    if pd.isna(x):
        return np.nan

    if x == 0:
        return 0

    # Must be tested before the exact "medication dependent" comparison and the "insulin" test,
    # so that any entry mentioning "non-medication" lands in this group
    if "non-medication" in x:
        return "dm_without_treatment"

    if x == "medication dependent":
        return "medication_only"

    if "insulin" in x:
        return "including_Insulin"

    # No rule matched
    return None

# Store the grouped diabetes category in a new column; the raw pmh_insulin_dep_DM column is kept
df_pe["pmh_diabetes"] = df_pe["pmh_insulin_dep_DM"].map(map_pmh_diabetes)

In [56]:
# Collapse the raw COPD entries into severity groups.
# Per the original author, the keys cover all unique values of this column in this cohort;
# Series.map turns any value that is not a key into NaN.
dict_pmhCOPD = dict(
    [
        (0, 0),
        ("COPD  - no limitations", "COPD_mild"),
        ("COPD  - moderate", "COPD_moderate"),
        ("COPD  - severe", "COPD_severe"),
        # both grades recorded -> use the more severe one
        ("COPD  - moderate|COPD  - severe", "COPD_severe"),
    ]
)

# The raw column is overwritten with the grouped values
df_pe["pmh_COPD"] = df_pe["pmh_COPD"].map(dict_pmhCOPD)

In [57]:
# grouping the cardiac arrhythmias column depending on whether the recorded arrhythmias included atrial fibrillation or not
def map_pmh_arrhythmias(x):
    """
    Process one raw entry of the pmh_arrhythmias column.

    :param x: 0, a raw arrhythmia description string, or a missing value (NaN/None).

    :return: 0 for 0; "Afib_orwith" if the string mentions atrial fibrillation (alone or together
             with other arrhythmias); "other_arrhythmia" for any other string; NaN for a missing entry.
    """
    # Missing entries stay missing instead of raising a TypeError on the substring test below
    if pd.isna(x):
        return np.nan

    if x == 0:
        return 0

    if "atrial fibrillation" in x:
        return "Afib_orwith"

    return "other_arrhythmia"


# Overwrite the raw arrhythmia entries with their grouped category
df_pe["pmh_arrhythmias"] = df_pe["pmh_arrhythmias"].map(map_pmh_arrhythmias)

In [58]:
# grouping the renal failure column depending on whether the patients were on dialysis or not
# Per the original author, the keys cover all unique values of this column in this cohort;
# Series.map turns any value that is not a key into NaN.
dict_renal_failure = {
    0: 0,
    "renal failure- not currently dialyzed": "renal_fail_no_dialysis",
    # hemodialysis and peritoneal dialysis share one "with dialysis" group
    **dict.fromkeys(
        ("renal failure - hemodialysis", "renal failure - peritoneal dialysis"),
        "renal_fail_w_dialysis",
    ),
}

# The raw column is overwritten with the grouped values
df_pe["pmh_renal_failure"] = df_pe["pmh_renal_failure"].map(dict_renal_failure)

In [59]:
# grouping the previous PE column depending on whether it was a single or multiple previous PE
def map_pmh_PE(x):
    """
    Process one raw entry of the pmh_PE column.

    :param x: 0, a raw previous-PE description string, or a missing value (NaN/None).

    :return: 0 for 0; "multiple_PE" if the string contains "multiple"; "single_PE" for any
             other string; NaN for a missing entry.
    """
    # Missing entries stay missing instead of raising a TypeError on the substring test below
    if pd.isna(x):
        return np.nan

    if x == 0:
        return 0

    if "multiple" in x:
        return "multiple_PE"

    return "single_PE"

# Overwrite the raw previous-PE entries with their grouped category
df_pe["pmh_PE"] = df_pe["pmh_PE"].map(map_pmh_PE)

In [60]:
### grouping other comorbidities
# Every composite flag below is 1 if at least one of its component conditions is present and 0 otherwise.
# Missing component entries never count as "present".

# Coronary artery disease and other large vessel disease: myocardial infarction, angina, strokes, peripheral vascular disease, 
# coronary artery bypass, percutaneous coronary intervention
df_pe["pmh_CAD_and_other_large_vessel"] = (
    df_pe[["pmh_MI", "pmh_angina", "pmh_strokes", "pmh_periph_vasc_disease", "pmh_CA_bypass", "pmh_PCI"]]
    .eq(1)
    .any(axis=1)
    .astype(int)
)

# pacemaker: either a normal pacemaker or an AICD
df_pe["pmh_any_pacemaker"] = df_pe[["pmh_pacemaker", "pmh_AICD"]].eq(1).any(axis=1).astype(int)

# Venous thromboses & PE
# A previous PE counts when pmh_PE holds a value that is neither 0 nor missing. Missing entries have to be
# detected with notna(): the comparison `value != np.nan` is True for every value, NaN included.
df_pe["pmh_venous_thromb_and_PE"] = (
    df_pe["pmh_venous_thrombosis"].eq(1) | (df_pe["pmh_PE"].notna() & df_pe["pmh_PE"].ne(0))
).astype(int)

# obstructive lung disease: COPD and asthma
# Same missing-value handling as for pmh_PE: a missing pmh_COPD entry does not count as COPD.
df_pe["pmh_obstructive_LD"] = (
    (df_pe["pmh_COPD"].notna() & df_pe["pmh_COPD"].ne(0)) | df_pe["pmh_asthma"].eq(1)
).astype(int)

# anemias
df_pe["pmh_anemias"] = df_pe[["pmh_hemolytic_anemia", "pmh_aplastic_anemia"]].eq(1).any(axis=1).astype(int)

# thyroid diseases (both hyper- and hypothyroidism)
df_pe["pmh_thyroid_disease"] = df_pe[["pmh_hypothyroidism", "pmh_hyperthyroidism"]].eq(1).any(axis=1).astype(int)

In [61]:
# creating age categories for a better overview over the distribution of that variable
def get_age_cat_for_table(x):
    """
    Assign a numeric age to one of four age brackets.

    :param x: Age as a number, or a missing value (NaN/None).

    :return: ">80" (age >= 81), "71-80" (age >= 71), "61-70" (age >= 61), "<60" for everything
             below 61, or NaN if the age is missing.
    """
    # A missing age fails every >= comparison and would otherwise be labelled "<60"
    if pd.isna(x):
        return np.nan

    if x >= 81:
        return ">80"
    if x >= 71:
        return "71-80"
    if x >= 61:
        return "61-70"
    # NOTE(review): this bracket also contains age 60 although the label reads "<60";
    # the label text is kept unchanged because code outside this cell may rely on it.
    return "<60"

# Label every stay with the age bracket of its age value
df_pe["age_categories"] = df_pe["age"].map(get_age_cat_for_table)

# Export 


In [62]:
# Write the processed cohort table to an Excel workbook without the DataFrame index
df_pe.to_excel(excel_writer="PE_data/PE_final_data.xlsx", index=False)