Import the necessary python modules

In [1]:
# Import the necessary python modules
import pandas as pd # v.1.4.3
import numpy as np # v.1.23.0

from tqdm import tqdm # v.4.65.0

import math # python v.3.9.0

import warnings
warnings.filterwarnings('ignore') # please feel free to remove this 

# Finding patients with a primary PE admission diagnosis


Selecting all ICU-stays that had Pulmonary Embolism (PE) as their main (APACHE) admission diagnosis:

In [2]:
# Load the complete eICU patient table from the relative data folder.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of chunk by chunk.
df_pat_big = pd.read_csv("../../eICU_data/patient.csv", low_memory=False)

In [3]:
# List every APACHE admission diagnosis label that contains "Embolus"
[dx_label for dx_label in df_pat_big["apacheadmissiondx"].value_counts().index if "Embolus" in dx_label]

['Embolus, pulmonary']

In [4]:
# Keep the ICU stays whose APACHE admission diagnosis is exactly "Embolus, pulmonary"
df_pat_big_pe = df_pat_big.loc[df_pat_big["apacheadmissiondx"].eq("Embolus, pulmonary")].copy()
# Their patientunitstayids are used below to exclude primary PE stays from the secondary cohort
lst_pat_primary = df_pat_big_pe["patientunitstayid"].tolist()
len(lst_pat_primary)

1697

&rarr; 1697 ICU-admissions with a primary diagnosis of PE

# Cohort selection Secondary PE

In [5]:
# Load the complete eICU diagnosis table from the relative data folder.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of chunk by chunk.
df_diagnosis_all = pd.read_csv("../../eICU_data/diagnosis.csv", low_memory=False)

In [6]:
# finding patients with a diagnosis of PE documented during their ICU-stay
# (case-insensitive match on "pulmonary embolism", excluding "r/o pulmonary embolism" entries).
# na=False treats a missing diagnosisstring as "no match"; without it the mask would contain NaN
# and the negation / boolean indexing below would raise.
df_diagnosis_all_pe = df_diagnosis_all[
    (df_diagnosis_all.diagnosisstring.str.contains("pulmonary embolism", case=False, na=False)) &
    (~df_diagnosis_all.diagnosisstring.str.contains("r/o pulmonary embolism", case=False, regex=False, na=False))
].copy()

# exclude all patients that had a primary PE admission diagnosis
df_diagnosis_secondary_PE = df_diagnosis_all_pe[~df_diagnosis_all_pe.patientunitstayid.isin(lst_pat_primary)]

# unique ICU stays with a secondary (i.e. non-admission) PE diagnosis
lst_secondary_pe = list(df_diagnosis_secondary_PE.patientunitstayid.unique())
len(lst_secondary_pe)

1276

In [7]:
# time of diagnosis of PE: the earliest diagnosisoffset among the PE diagnosis rows of each ICU stay
first_pe_diag_secondary_pe = (
    df_diagnosis_secondary_PE
    .groupby('patientunitstayid')['diagnosisoffset']
    .agg("min")
    .reset_index()
)
# lookup: patientunitstayid -> offset of the first PE diagnosis
dict_sec_pe_times = dict(zip(first_pe_diag_secondary_pe.patientunitstayid,
                             first_pe_diag_secondary_pe.diagnosisoffset))

In [8]:
# The data shall come from the 24h around the secondary PE diagnosis time (12h before to 12h after).
# If the PE was diagnosed earlier than 12h (720 min) into the ICU stay, the midpoint used for the
# calculations is moved to 720 minutes so that the 12h-before part of the window starts at offset 0.
minutes = (24 * 60 ) / 2

dict_sec_pe_adjusted = {}
for k, v in dict_sec_pe_times.items():
    # keep the documented diagnosis time unless it lies before the 720-minute mark
    dict_sec_pe_adjusted[k] = int(minutes) if v < minutes else v

# Reducing the file sizes

As the original files are quite large, we save a reduced copy of every table that contains only the rows belonging to our PE patients. This speeds up data extraction and calculations down the line.

## Function

In [9]:
def make_reduced_files(import_path, export_path, notation, lst_ids):
    """
    Writes a cohort-restricted copy of every eICU table that carries a patientunitstayid
    (i.e. all tables except the hospital table).

    Each table is read in chunks of 100,000 rows, reduced to the rows whose patientunitstayid is in
    lst_ids and saved as "<export_path><table>_<notation>.csv".

    :param import_path: String - Folder where all the eICU data is stored. It is concatenated directly
    with the file name, so it should end with "/".

    :param export_path: String - Should only include the folder, not the filename, and should end with "/".

    :param notation: String - Short notation that will be added to each file name for future identification.

    :param lst_ids: List - List of target population patientunitstayids.

    :return: None - the reduced tables are written to export_path as csv files.
    """

    # All tables of the eICU database, except the hospital table as it is not connected to patientunitstayid
    eicu_tables = ["admissionDrug", "admissionDx", "allergy", "apacheApsVar", "apachePatientResult", "apachePredVar",
                   "carePlanCareProvider", "carePlanEOL", "carePlanGeneral", "carePlanGoal", "carePlanInfectiousDisease", "customLab",
                   "diagnosis", "infusionDrug", "intakeOutput", "lab", "medication", "microLab", "note",
                   "nurseAssessment", "nurseCare", "nurseCharting", "pastHistory", "patient", "physicalExam",
                   "respiratoryCare", "respiratoryCharting", "treatment", "vitalAperiodic", "vitalPeriodic"]

    for table_name in tqdm(eicu_tables):
        # stream the table so that the full file never has to be held in memory
        chunk_reader = pd.read_csv(f"{import_path}{table_name}.csv", chunksize=100000, low_memory=False)

        # keep only the cohort rows of each chunk
        cohort_parts = [part[part["patientunitstayid"].isin(lst_ids)] for part in chunk_reader]

        pd.concat(cohort_parts).to_csv(f"{export_path}{table_name}_{notation}.csv", index=False)

## Implementation

In [10]:
#make_reduced_files(
#    import_path = "../../eICU_data/",
#    export_path = "sec_PE_data/",
#    notation = "sec_PE",
#    lst_ids = lst_secondary_pe
#)

To make this code work in your environment, the complete unpacked eICU data has to be located at the relative path "../../eICU_data/" in your project.

Now all of the data for our PE patients is in the folder at the relative path "sec_PE_data/" and can be accessed from there.


# Data extraction


## Functions

In [11]:
def get_basic_patient_info(df_pat, lst_ids):
    """
    Extracts the basic, cleaned patient information for the requested ICU stays.

    Cleaning steps: gender values "Unknown" and "Other" become NaN; the age label "> 89" is replaced
    by "90" and the age column is converted to numeric.

    :param df_pat: DataFrame - Patient dataframe of eICU (or abbreviated). It has to contain all
    columns listed in required_columns below.

    :param lst_ids: List - List of the population patientunitstayids.

    :return: DataFrame with the columns patientunitstayid, apacheadmissiondx, gender, age,
    hospitaldischargestatus and unitdischargestatus for the stays in lst_ids.
    """

    required_columns = ["patientunitstayid", "uniquepid", "gender", "age", "hospitalid", "wardid",
                        "unittype", "apacheadmissiondx", "hospitaldischargestatus",
                        'hospitaladmittime24', "hospitaladmitsource", "hospitaldischargelocation", 'unitadmittime24',
                        'unitadmitsource', 'unitstaytype', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus', "unitvisitnumber"]
    output_columns = ["patientunitstayid", "apacheadmissiondx", "gender", "age",
                      "hospitaldischargestatus", "unitdischargestatus"]

    # work on a copy of the cohort rows so that the input dataframe stays untouched
    in_cohort = df_pat["patientunitstayid"].isin(lst_ids)
    df_cohort = df_pat.loc[in_cohort, required_columns].copy()

    # gender: unspecific categories are treated as missing
    df_cohort["gender"] = df_cohort["gender"].replace({"Unknown": np.nan, "Other": np.nan})

    # age: the capped label "> 89" is mapped to 90 before the numeric conversion
    df_cohort["age"] = pd.to_numeric(df_cohort["age"].replace("> 89", "90"))

    return df_cohort[output_columns].copy()

In [12]:
def apply_group_pmh_subcat(x, df, clm_name):
    """
    Collects the distinct values of one column for a single ICU stay.

    :param x: patientunitstayid to look up.

    :param df: DataFrame - has to contain "patientunitstayid" and clm_name (string values).

    :param clm_name: String - column whose values are collected.

    :return: String - the unique values in order of first appearance, joined by "|"
    (empty string if the stay has no rows).
    """
    values_of_stay = df.loc[df["patientunitstayid"] == x, clm_name]
    return "|".join(values_of_stay.unique().tolist())

def apply_split_and_rejoin_for_output(str_pmh):
    """
    Strips the first six "/"-separated components from a path string.

    :param str_pmh: String - "/"-separated path.

    :return: String with the remaining components re-joined by "/", or None if the path has
    six components or fewer.
    """
    remaining_parts = str_pmh.split("/")[6:]

    if not remaining_parts:
        return None

    return "/".join(remaining_parts)

def get_pastHistory(df_pmh, lst_ids):
    """
    Receives the pastHistory Dataframe from eICU or abbreviated and a list of target patientunitstayids and returns
    a kind of longformat dataframe with the most important categories/PMH.

    For every PMH category, a patient with at least one matching pasthistorypath gets the matching
    paths (without their first six "/"-separated components) joined by "|"; all other patients in
    lst_ids get 0.

    The input dataframe is not modified: all path normalisations are done on an internal copy.

    :param df_pmh: DataFrame - pastHistory dataframe from eICU. 

    :param lst_ids: List - list of patientunitstayids from the target population.

    :return: DataFrame with "patientunitstayid" as a regular column and one column per selected
    PMH category (see the column selection at the end of the function).
    """
    
    # Dictionary of prospective column names (key) and the string keys to the PMH-string-path of each disease (values)
    dict_subcat_clms = {
        'pmh_HT_with_treatment': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Hypertension Requiring Treatment'],
        'pmh_cancer': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Cancer'],
        'pmh_non_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Non-Insulin Dependent Diabetes'],
        'pmh_COPD': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'COPD'],
        'pmh_CHF': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Congestive Heart Failure'],
        'pmh_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Insulin Dependent Diabetes'],
        'pmh_arrhythmias': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Arrhythmias'],
        'pmh_hypothyroidism': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hypothyroidism'],
        'pmh_MI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Myocardial Infarction'],
        'pmh_strokes': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Strokes'],
        'pmh_renal_insuff': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Insufficiency'],
        'pmh_PCI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Procedural Coronary Intervention'],
        'pmh_card_valvular': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Valve disease'],
        'pmh_asthma': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Asthma'],
        'pmh_liver_cirrhosis': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Cirrhosis'],
        'pmh_renal_failure': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Failure'],
        'pmh_CA_bypass': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Coronary Artery Bypass'],
        'pmh_seizures': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Seizures'],
        'pmh_periph_vasc_disease': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Peripheral Vascular Disease'],
        'pmh_home_o2': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Home Oxygen'],
        'pmh_venous_thrombosis': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Venous Thrombosis'],
        'pmh_dementia': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Dementia'],
        'pmh_pacemaker': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Pacemaker'],
        'pmh_cancer_therapy': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Cancer Therapy'],
        'pmh_angina': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Angina'],
        'pmh_peptic_ulcer_disease': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Peptic Ulcer Disease'],
        'pmh_TIAs': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'TIAs'],
        'pmh_PFTs': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Pulmonary Function Tests'],
        'pmh_resp_failure': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Respiratory Failure'],
        'pmh_AICD': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'AICD'],
        'pmh_PE': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Pulmonary Embolism'],
        'pmh_RA': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Rheumatoid Arthritis'],
        'pmh_mmunosuppression_last_6m': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'Immunosuppression within past 6 months'],
        'pmh_chronic_kidney_stones': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Chronic Stone Disease'],
        'pmh_neuromusk_disease': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Neuromuscular Disease'],
        'pmh_restrictive_lung_disease': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Restrictive Disease'],
        'pmh_s_p_NTx': ['notes/Progress Notes/Past History/Organ Systems/Renal', 's_p Renal Transplant'],
        'pmh_HIV_only': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'HIV only'],
        'pmh_hemolytic _anemia': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Hemolytic Anemia'],
        'pmh_SLE': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'SLE'],
        'pmh_exercise_tolerance': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Exercise Tolerance'],
        'pmh_intracranial_mass': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Intracranial Mass'],
        'pmh_hyperthyroidism': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hyperthyroidism'],
        'pmh_recent_steroids_>10d': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Recent Steroid Use for > 10 days'],
        'pmh__petite_mal_seizures': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Seizures_petite mal seizures'],
        'pmh_s_p_LTx': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 's_p Liver Transplant'],
        'pmh_hypercoagulable_condition': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Hypercoagulable Condition'],
        'pmh_ITP': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'ITP'],
        'pmh_neurogenic_bladder': ['notes/Progress Notes/Past History/Organ Systems/Renal ', 'Neurogenic Bladder'],
        'pmh_sickle_cells': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Sickle Cell Disease'],
        'pmh_clotting_disorder': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Clotting Disorder'],
        'pmh_AIDS': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'AIDS'],
        'pmh_sarcoidosis': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Sarcoidosis'],
        'pmh_vasculitis': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Vasculitis'],
        'pmh_myeloproliferative_disease': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Myeloproliferative Disease'],
        'pmh_s_p_HTx': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 's_p Heart Transplant'],
        'pmh_aplastic_anemia': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Aplastic Anemia'],
        'pmh_hypercalcemia': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Hypercalcemia'],
        'pmh_hypersplenism': ['notes/Progress Notes/Past History/Organ Systems/Gastrointestinal', 'Hypersplenism'],
        'pmh_scleroderma': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Scleroderma'],
        'pmh_RTA': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'RTA'],
        'pmh_s_p_lungTx': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 's_p Lung Transplant'],
        "pmh_cushings": ["notes/Progress Notes/Past History/Organ Systems/Endocrine", "Cushing's Syndrome"],
        'pmh_dermato': ['notes/Progress Notes/Past History/Organ Systems/Rheumatic', 'Dermato']
    }
    
    # reducing the general pastHistory df to reduce the computational load
    df_temp = df_pmh.loc[df_pmh["patientunitstayid"].isin(lst_ids), ["patientunitstayid", "pasthistorypath", "pasthistoryvalue"]].copy()

    # Replace substrings in pasthistorypath that would cause issues later on: "/" inside a category
    # name breaks the "/"-split of the path, and parentheses are regex metacharacters in the
    # str.contains lookups below. The replacements are literal (regex=False) - as regex patterns,
    # "TIA(s)" or "HIV (only)" would never match the literal parentheses - and they are applied to
    # the working copy so that they take effect on the searched frame and leave df_pmh unmodified.
    dict_path_replacements = {
        "Hematology/Oncology": "Hematology-Oncology",
        "s/p": "s_p",
        "TIA(s)": "TIAs",
        "HIV (only)": "HIV only",
        "Recent Steroid Use (for > 10 days)": "Recent Steroid Use for > 10 days",
    }
    for str_old, str_new in dict_path_replacements.items():
        df_temp["pasthistorypath"] = df_temp["pasthistorypath"].str.replace(str_old, str_new, regex=False)

    # list for the future columns
    lst_columns = []

    # Iterate over the individual diseases and check whether a patient has this PMH or not then saving this as a column to the list
    for clm_name, key_phrases in tqdm(dict_subcat_clms.items()):
        key1 = key_phrases[0]
        key2 = key_phrases[1]

        # rows whose path contains both the organ-system prefix and the disease key
        df_new_clm_raw = df_temp.loc[(df_temp['pasthistorypath'].str.contains(key1, na=False)) &
                                     (df_temp['pasthistorypath'].str.contains(key2, na=False)), ["patientunitstayid", "pasthistorypath"]].copy().drop_duplicates()

        # shortened path (first six components removed) that will be shown in the output
        df_new_clm_raw["output_pmh_path"] = df_new_clm_raw["pasthistorypath"].apply(apply_split_and_rejoin_for_output)

        df_new_clm_reference = df_new_clm_raw.copy()

        # all shortened paths of one patient joined to a single "|"-separated string
        df_new_clm_raw[clm_name] = df_new_clm_raw["patientunitstayid"].apply(lambda x: apply_group_pmh_subcat(x, df_new_clm_reference, "output_pmh_path"))

        df_pat_w_data = df_new_clm_raw.loc[:, ["patientunitstayid", clm_name]].copy().drop_duplicates()

        # patients without this PMH get a 0 (set lookup instead of scanning a list for every id)
        set_pat_w_data = set(df_new_clm_raw.patientunitstayid.unique())
        lst_pat_no_data = [x for x in lst_ids if x not in set_pat_w_data]

        df_pat_without_data = pd.DataFrame(lst_pat_no_data, columns=['patientunitstayid'])
        df_pat_without_data[clm_name] = 0

        df_clm_final = pd.concat([df_pat_w_data, df_pat_without_data])
        df_clm_final = df_clm_final.set_index("patientunitstayid")

        lst_columns.append(df_clm_final)

    # Concatenate all the columns and export this with the ids as another column  
    df_final = pd.concat(lst_columns, axis=1)
    df_final = df_final.reset_index()
    df_final = df_final[['patientunitstayid', 'pmh_HT_with_treatment', 'pmh_cancer',
       'pmh_non_insulin_dep_DM', 'pmh_COPD', 'pmh_CHF', 'pmh_insulin_dep_DM',
       'pmh_arrhythmias', 'pmh_MI', 'pmh_strokes', "pmh_hypothyroidism", 
       'pmh_renal_insuff', 'pmh_PCI', 'pmh_card_valvular', 'pmh_asthma',
       'pmh_liver_cirrhosis', 'pmh_renal_failure', 'pmh_CA_bypass',
       'pmh_seizures', 'pmh_periph_vasc_disease', 'pmh_home_o2',
       'pmh_venous_thrombosis', 'pmh_pacemaker',
       'pmh_cancer_therapy', 'pmh_angina', 
       'pmh_TIAs', 'pmh_AICD', 'pmh_PE', 'pmh_dementia', 
       'pmh_neuromusk_disease', 'pmh_restrictive_lung_disease', 'pmh_intracranial_mass',
       'pmh__petite_mal_seizures', 'pmh_s_p_lungTx']]

    return df_final

In [13]:
def get_cleaned_apachePredVar_basics(df_predvar, lst_ids):
    """
    Extracts the lymphoma, metastaticcancer, leukemia and midur columns of the apachePredVar table
    for the requested unit stays and prefixes these columns with "pred_".

    Stays from lst_ids that have no row in df_predvar are appended with NaN in all value columns.

    :param df_predvar: DataFrame - apachePredVar Dataframe or abbreviated. 

    :param lst_ids: List - List of patientunitstayids of the target population.

    :return: DataFrame with the columns patientunitstayid, pred_lymphoma, pred_metastaticcancer,
    pred_leukemia and pred_midur (patientunitstayid is a regular column, the index is not reset).
    """
    value_columns = ['lymphoma', 'metastaticcancer', 'leukemia', 'midur']
    selected_columns = ['patientunitstayid'] + value_columns

    # cohort rows only, to reduce the computational load
    in_cohort = df_predvar["patientunitstayid"].isin(lst_ids)
    df_cohort = df_predvar.loc[in_cohort, selected_columns].copy()

    # NaN placeholder rows for the stays that did not have any data at all
    ids_with_rows = list(df_cohort["patientunitstayid"].unique())
    ids_without_rows = [stay_id for stay_id in lst_ids if stay_id not in ids_with_rows]
    df_placeholder = pd.DataFrame(np.nan, index=list(range(len(ids_without_rows))), columns=selected_columns)
    df_placeholder['patientunitstayid'] = ids_without_rows

    df_combined = pd.concat([df_cohort, df_placeholder])

    # mark the value columns as originating from the prediction-variable table
    return df_combined.rename(columns={name: "pred_{}".format(name) for name in value_columns})

In [14]:
def get_cleaned_apachePatientResult_basics(df_apacheresult, lst_ids):
    """
    Extracts apachescore, actualiculos and unabridgedhosplos from the apachePatientResult table
    for the requested unit stays, using only the rows with apacheversion "IVa".

    An apachescore of -1 is set to NaN. Stays from lst_ids without an "IVa" row are appended with
    NaN in all value columns.

    :param df_apacheresult: DataFrame - apachePatientResult Dataframe or abbreviated. 

    :param lst_ids: List - List of patientunitstayids of the target population.

    :return: DataFrame with the columns patientunitstayid, apachescore, actualiculos and
    unabridgedhosplos (patientunitstayid is a regular column, the index is not reset).
    """
    selected_columns = ['patientunitstayid', 'apachescore', 'actualiculos', 'unabridgedhosplos']

    # "IVa" rows of the cohort only, to reduce the computational load
    is_version_iva = df_apacheresult["apacheversion"] == "IVa"
    in_cohort = df_apacheresult["patientunitstayid"].isin(lst_ids)
    df_cohort = df_apacheresult.loc[is_version_iva & in_cohort, selected_columns].copy()

    # -1 marks a missing score
    df_cohort.loc[df_cohort['apachescore'] == -1, ['apachescore']] = np.nan

    # NaN placeholder rows for the stays that did not have any data at all
    ids_with_rows = list(df_cohort["patientunitstayid"].unique())
    ids_without_rows = [stay_id for stay_id in lst_ids if stay_id not in ids_with_rows]
    df_placeholder = pd.DataFrame(np.nan, index=list(range(len(ids_without_rows))), columns=selected_columns)
    df_placeholder['patientunitstayid'] = ids_without_rows

    return pd.concat([df_cohort, df_placeholder])


In [15]:
def get_infusion_drugs_individual_offset(df_infusion, dict_ids, additional_dict=None, relative_offset=(-720, 720)):
    """
    Flags, per patient, whether an infusion of certain drug groups was documented within a time window
    around a patient-specific offset.

    The built-in group "infusion_vaso_ino" covers vasopressors/inotropes; further groups can be passed
    via additional_dict. Drug names are lower-cased before matching, so patterns have to be lower case.

    :param df_infusion: DataFrame - infusionDrug Dataframe from eICU.

    :param dict_ids: Dictionary - keys are the patientunitstayids, values the individual offset

    :param additional_dict: Dict - Additional/custom dictionary with the "clm_name": "drugname|drugname|drugname" format.

    :param relative_offset: Tuple, default is (-720, 720) - (lower relative time offset, upper relative time offset)

    :return: DataFrame indexed by patientunitstayid with one 1/0 column per drug group.
    """
    cohort_ids = list(dict_ids.keys())

    # cohort rows with a documented drug name only
    df_window = df_infusion[df_infusion["patientunitstayid"].isin(cohort_ids)].copy().dropna(subset=["drugname"])

    # attach each patient's individual reference offset
    lower_rel_offset, upper_rel_offset = relative_offset
    df_window["individ_offset"] = df_window["patientunitstayid"].map(dict_ids)

    # keep the infusions documented inside the patient-specific window (both bounds inclusive)
    not_too_late = df_window["infusionoffset"] <= (df_window["individ_offset"] + upper_rel_offset)
    not_too_early = df_window["infusionoffset"] >= (df_window["individ_offset"] + lower_rel_offset)
    df_window = df_window[not_too_late & not_too_early].copy()

    df_window["drugname"] = df_window["drugname"].str.lower()

    drug_groups = {
        "infusion_vaso_ino": 'epinephrine|adrenaline|norepinephrine|levophed|dobutamine|dobutrex|vasopressin|isoprotenerol|isuprel|phenylephrine|neo-synephrine|dopamine|milrinone|isoproterenol',
    }
    if additional_dict is not None:
        drug_groups.update(additional_dict)

    def _flag_drug_group(column_name, pattern):
        # one-column frame (index: patientunitstayid) with 1 for patients who received the group, else 0
        ids_with_drug = df_window.loc[
            df_window["drugname"].str.contains(pattern), "patientunitstayid"].copy().unique().tolist()
        ids_without_drug = [stay_id for stay_id in cohort_ids if stay_id not in ids_with_drug]

        df_with = pd.DataFrame(ids_with_drug, columns=['patientunitstayid'])
        df_with[column_name] = 1

        df_without = pd.DataFrame(ids_without_drug, columns=['patientunitstayid'])
        df_without[column_name] = 0

        return pd.concat([df_with, df_without]).set_index("patientunitstayid")

    flag_columns = [_flag_drug_group(column_name, pattern) for column_name, pattern in drug_groups.items()]

    return pd.concat(flag_columns, axis=1)


In [16]:
def get_mechanically_ventilated_individual_offset(df_resp_chart, df_resp_care, df_physical, df_nurse_chart, dict_ids,
                                                  relative_offset=(-720, 720)):
    """
    Flags, per patient, whether any of four eICU tables contains an entry indicating mechanical
    ventilation within a time window around a patient-specific offset.

    A patient is flagged as soon as one of the tables (nurseCharting, physicalExam, respiratoryCare,
    respiratoryCharting) has a matching entry inside the window (both bounds inclusive).

    :param df_resp_chart: Dataframe - respiratoryCharting dataframe from the eICU
    :param df_resp_care: Dataframe - respiratoryCare dataframe from the eICU
    :param df_physical: Dataframe - physicalExam dataframe from the eICU
    :param df_nurse_chart: Dataframe - nurseCharting dataframe from the eICU
    :param dict_ids: Dictionary - keys are the patientunitstayids, values the individual offset
    :param relative_offset: Tuple, default is (-720, 720) - (lower relative time offset, upper relative time offset)
    :return: DataFrame with the columns "patientunitstayid" and
    "mech_vent_individ_start_{lower_rel_offset}to{upper_rel_offset}" (1 = ventilation indicated, 0 = not)
    """
    cohort_ids = list(dict_ids.keys())
    lower_rel_offset, upper_rel_offset = relative_offset

    def _restrict_to_window(df_table, offset_column):
        # cohort rows only, then the rows documented inside the patient-specific window
        df_cohort = df_table.loc[(df_table.patientunitstayid.isin(cohort_ids)), :].copy()
        df_cohort["individ_offset"] = df_cohort.patientunitstayid.map(dict_ids)
        not_too_late = df_cohort[offset_column] <= (df_cohort["individ_offset"] + upper_rel_offset)
        not_too_early = df_cohort[offset_column] >= (df_cohort["individ_offset"] + lower_rel_offset)
        return df_cohort[not_too_late & not_too_early].copy()

    df_physical_win = _restrict_to_window(df_physical, "physicalexamoffset")
    df_nurse_win = _restrict_to_window(df_nurse_chart, "nursingchartoffset")
    df_chart_win = _restrict_to_window(df_resp_chart, "respchartoffset")
    df_care_win = _restrict_to_window(df_resp_care, "respcarestatusoffset")

    # nurseCharting: a ventilator-type O2 device or any End Tidal CO2 entry
    nurse_device = ((df_nurse_win.nursingchartcelltypevallabel == "O2 Admin Device") &
                    (df_nurse_win.nursingchartvalue.isin(
                        ["ventilator", "trach collar", "vent", "VENT", "vented", "ac 10/400/40+5", "AC10 500 60 5"])))
    nurse_etco2 = df_nurse_win.nursingchartcelltypevallabel == "End Tidal CO2"
    ids_nurse = list(df_nurse_win.loc[nurse_device | nurse_etco2, :].copy().patientunitstayid.unique())

    # physicalExam: airway documented as intubated/tracheostomy or respiratory mode "ventilated"
    exam_airway = ((df_physical_win.physicalexampath.str.contains("Pulmonary/Airway", regex=False, case=False)) &
                   (df_physical_win.physicalexamvalue.isin(["intubated", "tracheostomy"])))
    exam_resp_mode = ((df_physical_win.physicalexampath.str.contains(
        "Constitutional/Vital Sign and Physiological Data/Resp Mode/", regex=False, case=False)) &
                      (df_physical_win.physicalexamvalue == "ventilated"))
    ids_physical = list(df_physical_win.loc[exam_airway | exam_resp_mode, :].copy().patientunitstayid.unique())

    # respiratoryCare: any documented ventilator limit / airway detail, or an invasive airway type
    care_value_columns = ["lowexhmvlimit", "hiexhmvlimit", "lowexhtvlimit", "hipeakpreslimit", "lowpeakpreslimit",
                          "hirespratelimit", "lowrespratelimit", "cuffpressure", "airwaysize", "airwayposition"]
    care_any_value = df_care_win[care_value_columns].notna().any(axis=1)
    care_airway = df_care_win.airwaytype.isin(["Oral ETT", "Tracheostomy"])
    ids_care = list(df_care_win.loc[care_any_value | care_airway, :].copy().patientunitstayid.unique())

    # respiratoryCharting: ventilator settings, ventilator measurements or a ventilator-type O2 device
    chart_settings = ((df_chart_win.respcharttypecat == "respFlowSettings") &
                      (df_chart_win.respchartvaluelabel.isin(
                          ["TV/kg IBW", "Tidal Volume (set)", "Pressure Control", "PEEP"])))
    chart_vent_data = ((df_chart_win.respcharttypecat == "respFlowPtVentData") &
                       (df_chart_win.respchartvaluelabel.isin(
                           ["Peak Insp. Pressure", "Mean Airway Pressure", "Exhaled MV", "Exhaled TV (machine)",
                            "Plateau Pressure", "Compliance"])))
    chart_care_data = ((df_chart_win.respcharttypecat == "respFlowCareData") &
                       (df_chart_win.respchartvaluelabel.isin(
                           ["Set Vt (Servo,LTV)", "Tidal Volume Observed (VT)", "Adult Con Setting Set RR",
                            "Adult Con Setting Set Vt", "Secured at-ETT", "Adult Con Pt/Vent MinuteVentil",
                            "Adult Con Pt/Vent InspiratorTV", "Adult Con Alarms Hi Press Alarm",
                            "Endotracheal Tube Placement", "Tidal Volume, Delivered",
                            "Set Fraction of Inspired Oxygen (FIO2)", "Mechanical Ventilator Compliance",
                            "Mechanical Ventilation Slope", "Mechanical Ventilator Resistance",
                            "Endotracheal Tube Placement Checked", "Mechanical Ventilator High Tidal Volume Alarm",
                            "Mechanical Ventilator Mode"])))
    chart_o2_device = ((df_chart_win.respcharttypecat == "respFlowCareData") &
                       (df_chart_win.respchartvaluelabel == "O2 Device") &
                       df_chart_win.respchartvalue.isin(["Ventilator", "Trach mask/collar", "ETT", "Ambubag"]))
    ids_chart = list(df_chart_win.loc[chart_settings | chart_vent_data | chart_care_data | chart_o2_device,
                                      :].copy().patientunitstayid.unique())

    # union of the four sources -> flagged patients; everyone else in the cohort is unflagged
    lst_all_vent_lsts = [ids_nurse, ids_physical, ids_care, ids_chart]
    ids_ventilated = list({pat_id for sublist_vent in lst_all_vent_lsts for pat_id in sublist_vent})
    ids_not_ventilated = [stay_id for stay_id in cohort_ids if stay_id not in ids_ventilated]

    final_clm_name = f"mech_vent_individ_start_{lower_rel_offset}to{upper_rel_offset}"

    df_ventilated = pd.DataFrame(ids_ventilated, columns=['patientunitstayid'])
    df_ventilated[final_clm_name] = 1

    df_not_ventilated = pd.DataFrame(ids_not_ventilated, columns=['patientunitstayid'])
    df_not_ventilated[final_clm_name] = 0

    return pd.concat([df_ventilated, df_not_ventilated], axis=0)


In [17]:
def get_AMS_from_individual_offset_GCS(df_physical, df_nurse_chart, dict_ids, relative_offset=(-720, 720)):
    """
    This function returns whether patients had AMS (defined as a GCS verbal component of <5) in a time interval
    around a patient-specific offset. Three sources are checked and combined with a logical OR:
    the "Neurologic/GCS/Verbal Score" entries of physicalExam, the "Glasgow coma score"/"Verbal" entries of
    nurseCharting and the "Best Verbal Response" entries of nurseCharting.

    :param df_physical: Dataframe - physicalExam dataframe from the eICU
    :param df_nurse_chart: Dataframe - nurseCharting dataframe from the eICU
    :param dict_ids: Dictionary - keys are the patientunitstayids, values the individual offset
    :param relative_offset: Tuple, default is (-720, 720) - (lower relative time offest, upper relative time offset)
    :return: DataFrame with the columns "patientunitstayid" and "ams_individ_start_{lower}to{upper}" (1 if AMS was
    documented at some point inside the window, otherwise 0); one row per key of dict_ids
    """
    cohort_ids = list(dict_ids.keys())

    # reducing the general df to reduce the computational load
    df_pe_red = df_physical.loc[df_physical.patientunitstayid.isin(cohort_ids), :].copy()
    df_nc_red = df_nurse_chart.loc[df_nurse_chart.patientunitstayid.isin(cohort_ids), :].copy()

    # add the individual offsets
    lower_rel_offset, upper_rel_offset = relative_offset

    df_pe_red["individ_offset"] = df_pe_red.patientunitstayid.map(dict_ids)
    df_nc_red["individ_offset"] = df_nc_red.patientunitstayid.map(dict_ids)

    # filter for only data in the individual timeframes (both window edges are inclusive)
    df_pe_red = df_pe_red[
        (df_pe_red["physicalexamoffset"] <= (df_pe_red["individ_offset"] + upper_rel_offset)) &
        (df_pe_red["physicalexamoffset"] >= (df_pe_red["individ_offset"] + lower_rel_offset))
        ].copy()
    df_nc_red = df_nc_red[
        (df_nc_red["nursingchartoffset"] <= (df_nc_red["individ_offset"] + upper_rel_offset)) &
        (df_nc_red["nursingchartoffset"] >= (df_nc_red["individ_offset"] + lower_rel_offset))
        ].copy()

    lst_all_ams_lsts = []

    # get the AMS patients from the physicalExam dataframe
    # na=False: rows without a path can never match, stated explicitly instead of relying on NaN coercion
    df_pe_gcs = df_pe_red[
        df_pe_red.physicalexampath.str.contains("Neurologic/GCS/Verbal Score", regex=False, case=False,
                                                na=False)].copy()
    df_pe_gcs.physicalexamvalue = pd.to_numeric(df_pe_gcs.physicalexamvalue)
    lst_pe_gcs_ams = list(df_pe_gcs[df_pe_gcs.physicalexamvalue <= 4].patientunitstayid.unique())
    lst_all_ams_lsts.append(lst_pe_gcs_ams)

    # get the AMS patients from the nurseCharting dataframe
    df_nc_gcs = df_nc_red[(df_nc_red.nursingchartcelltypevallabel == "Glasgow coma score") & (
            df_nc_red.nursingchartcelltypevalname == "Verbal")].copy()
    df_nc_gcs.nursingchartvalue = pd.to_numeric(df_nc_gcs.nursingchartvalue)
    lst_ns_gcs_ams = list(df_nc_gcs[df_nc_gcs.nursingchartvalue <= 4].patientunitstayid.unique())
    lst_all_ams_lsts.append(lst_ns_gcs_ams)

    # "Best Verbal Response" is charted as text or number; any score 1-4 or a "confused"/"intubated" entry is AMS
    df_nc_other_ams = df_nc_red.loc[((df_nc_red.nursingchartcelltypevallabel == "Best Verbal Response") & (
        df_nc_red.nursingchartvalue.str.contains("4|3|2|1|confused|intubated", regex=True, case=False,
                                                 na=False))), :].copy()
    lst_nc_other_ams = list(df_nc_other_ams.patientunitstayid.unique())
    lst_all_ams_lsts.append(lst_nc_other_ams)

    # combine these lists to a binary column; a set gives constant-time membership checks
    set_pat_w_ams = {pat_id for sublist_ams in lst_all_ams_lsts for pat_id in sublist_ams}
    lst_pat_w_ams = list(set_pat_w_ams)
    lst_pat_wo_ams = [i for i in cohort_ids if i not in set_pat_w_ams]

    ams_clm_name = f"ams_individ_start_{lower_rel_offset}to{upper_rel_offset}"

    df_pat_w_ams = pd.DataFrame(lst_pat_w_ams, columns=['patientunitstayid'])
    df_pat_w_ams[ams_clm_name] = 1

    df_pat_without_ams = pd.DataFrame(lst_pat_wo_ams, columns=['patientunitstayid'])
    df_pat_without_ams[ams_clm_name] = 0

    df_final = pd.concat([df_pat_w_ams, df_pat_without_ams], axis=0)

    return df_final


In [18]:
def fast_clean_data(array_vitals):
    """
    Keep only the values that lie within two interquartile ranges of the median,
    i.e. discard everything > median + 2 IQR or < median - 2 IQR.

    :param array_vitals: Array - numpy array of values (eg. mean BPs from a certain timeframe of a patient)

    :return: Array - numpy array of cleaned values

    """

    # the quartiles are taken from the passed values themselves, so outliers are judged against this sample only
    lower_quartile, median, upper_quartile = np.percentile(array_vitals, [25, 50, 75])
    tolerance = 2 * (upper_quartile - lower_quartile)

    is_inlier = (array_vitals >= (median - tolerance)) & (array_vitals <= (median + tolerance))

    return array_vitals[is_inlier].copy()


def groupby_vitals_per_hour(x, agg_timeunit):
    """
    Aggregate the values of one time bin to a single number after removing outliers with fast_clean_data.

    :param x: Series - values of one group (one patient, one time bin)

    :param agg_timeunit: String, picklist - "median", "max" or "min"

    :return: aggregated value of the cleaned data

    :raises ValueError: if agg_timeunit is not one of the supported options (previously this surfaced as an
            UnboundLocalError on the return statement)
    """
    dict_aggregators = {"median": np.median, "max": np.max, "min": np.min}

    if agg_timeunit not in dict_aggregators:
        raise ValueError(
            "agg_timeunit must be one of 'median', 'max' or 'min', got {!r}".format(agg_timeunit))

    cleaned_hourly = fast_clean_data(x.to_numpy())

    return dict_aggregators[agg_timeunit](cleaned_hourly)


def groupby_vitals_total(x, agg_total):
    """
    Aggregate the per-bin values of one patient to the final single value (no outlier removal at this stage).

    :param x: Series - values of one group (all bin values of one patient)

    :param agg_total: String, picklist - "median", "max" or "min"

    :return: aggregated value

    :raises ValueError: if agg_total is not one of the supported options (previously this surfaced as an
            UnboundLocalError on the return statement)
    """
    dict_aggregators = {"median": np.median, "max": np.max, "min": np.min}

    if agg_total not in dict_aggregators:
        raise ValueError(
            "agg_total must be one of 'median', 'max' or 'min', got {!r}".format(agg_total))

    return dict_aggregators[agg_total](x.to_numpy())


def fast_vitals_periodic_individual_offset(df_periodic, dict_ids, vital_name, realistic_bounds, relative_offset,
                                           agg_timeunit="median", agg_total="median", timeunit=60, temp_nurse_bool=False,
                                           temp_nursechart=None):
    """
    Takes the vitalPeriodic table of eICU, target patientunitstayids and their individual offset as a dictionary, offset bounds, and realistic bounds and a vitalname and
    returns the aggregated and cleaned value for that offset timeframe. The timeframe of a patient is
    [individual offset + lower relative offset, individual offset + upper relative offset] (both edges inclusive). It is split into bins
    of `timeunit` minutes, every bin is cleaned of outliers and aggregated with `agg_timeunit`, and the bin values are then aggregated
    with `agg_total`. If looking at the temperature, there is the option to include all temperature measurements from the nurses charting
    (Fahrenheit values are converted to Celsius). Pulsepressure is calculated from the respective BP_sys - BP_dia and always uses
    20/300 (systolic) and 10/200 (diastolic) as bounds; realistic_bounds is not applied to it.
    Accepted vitalnames: temperature, sao2, heartrate, respiration, cvp, etco2, systemicsystolic, systemicdiastolic, systemicmean, pasystolic,
    padiastolic, pamean, icp, pulsepressure

    :param df_periodic: Dataframe - abbreviated (! unless a lot of free RAM) Dataframe of the vitalPeriodic table of eICU

    :param dict_ids: Dictionary - keys are the patientunitstayids, values the individual offset

    :param vital_name: String - Column name of the vital

    :param realistic_bounds: Tuple - realistic values of the vital in the form of (lower bound, upper bound) eg. (20,100) for fio2

    :param relative_offset: Tuple - (lower relative time offest, upper relative time offset), lower must be smaller than upper

    :param agg_timeunit: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values per timeunit (if the
            offset duration is longer than the timeunit)

    :param agg_total: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values to the final value

    :param timeunit: Int, preset 60 - Number of minutes for the timeunit

    :param temp_nurse_bool: Boolean, preset False - Whether to use the nurse charting for the temperature

    :param temp_nursechart: Dataframe - NurseCharting Dataframe of the eICU if the vital_name is temperature and temp_nurse_bool is True

    :return: Dataframe - one row per key of dict_ids with the columns "patientunitstayid" and
            "{vital_name}_{agg_total}_individ_offset_rel_{lower}to{upper}_u{timeunit}" (NaN if no usable value was found)

    :raises ValueError: if the relative offsets are not ordered or the nurse charting is requested but not passed
    """

    # open the tuples of the bounds and validate the arguments before doing any heavy work
    lower_realistic_bound, upper_realistic_bound = realistic_bounds
    lower_rel_offset, upper_rel_offset = relative_offset

    if upper_rel_offset <= lower_rel_offset:
        raise ValueError("relative_offset must be (lower, upper) with lower < upper, got {!r}".format(relative_offset))

    use_nursechart = vital_name == "temperature" and temp_nurse_bool
    if use_nursechart and temp_nursechart is None:
        raise ValueError("temp_nursechart must be passed if temp_nurse_bool is True")

    lst_cohort_ids = list(dict_ids.keys())

    # reducing the general df to reduce the computational load
    df_reduced = df_periodic[df_periodic["patientunitstayid"].isin(lst_cohort_ids)].copy()

    # calculate the pulsepressure if this is selected; via systolic BP - diastolic BP
    if vital_name == "pulsepressure":
        df_temp_initial = df_reduced.loc[
            (df_reduced["systemicsystolic"] >= 20) & (df_reduced["systemicsystolic"] <= 300) &
            (df_reduced["systemicdiastolic"] >= 10) & (df_reduced["systemicdiastolic"] <= 200),
            ["patientunitstayid", "observationoffset", "systemicsystolic", "systemicdiastolic"]].copy()

        df_temp = (df_temp_initial.assign(
            pulsepressure=df_temp_initial["systemicsystolic"] - df_temp_initial["systemicdiastolic"])
                   .drop(columns=["systemicsystolic", "systemicdiastolic"])
                   .dropna()
                   )
    else:
        # next reduction, incorporating the realistic bounds
        df_temp = df_reduced.loc[
            (df_reduced[vital_name] >= lower_realistic_bound) & (df_reduced[vital_name] <= upper_realistic_bound),
            ["patientunitstayid", "observationoffset", vital_name]].copy().dropna()

    # add the temperature information from the nurse chart if temperature is selected and this feature is used
    if use_nursechart:
        # reduce the nurse charting to lessen the computational load
        df_nursechart = temp_nursechart[temp_nursechart["patientunitstayid"].isin(lst_cohort_ids)].copy()
        df_n_char_red = df_nursechart.loc[(df_nursechart.nursingchartcelltypevallabel == "Temperature") &
                                          (df_nursechart.nursingchartcelltypevalname.isin(
                                              ['Temperature (C)', 'Temperature (F)'])),
                                          ["patientunitstayid", "nursingchartoffset", "nursingchartcelltypevalname",
                                           "nursingchartvalue"]].copy()

        # adjust for the fact that some values are taken as °F and some as °C (vectorised, also safe on empty frames)
        df_n_char_red.nursingchartvalue = df_n_char_red.nursingchartvalue.astype(float)
        is_fahrenheit = df_n_char_red.nursingchartcelltypevalname == 'Temperature (F)'
        df_n_char_red["temperature"] = np.where(is_fahrenheit,
                                                (df_n_char_red.nursingchartvalue - 32) * (5 / 9),
                                                df_n_char_red.nursingchartvalue)
        df_n_char_red = df_n_char_red.rename(columns={"nursingchartoffset": "observationoffset"})
        df_n_char_final = df_n_char_red.loc[(df_n_char_red[vital_name] >= lower_realistic_bound) & (
                df_n_char_red[vital_name] <= upper_realistic_bound),
                                            ["patientunitstayid", "observationoffset",
                                             vital_name]].copy().reset_index(drop=True)

        df_temp = pd.concat([df_temp, df_n_char_final], ignore_index=True)

    # add the individual offsets
    df_temp["individ_offset"] = df_temp.patientunitstayid.map(dict_ids)

    # filter for only data in the individual timeframes
    df_temp = df_temp[
        (df_temp["observationoffset"] <= (df_temp["individ_offset"] + upper_rel_offset)) &
        (df_temp["observationoffset"] >= (df_temp["individ_offset"] + lower_rel_offset))
        ].copy()

    # Express every observation as minutes since the START of the individual window, so that the values span
    # 0..window_length and line up with the bins below. Shifting by the individual offset alone would leave the
    # observations before the individual offset negative; pd.cut maps those to NaN and the groupby would silently
    # drop them.
    df_temp["observationoffset"] = df_temp["observationoffset"] - (df_temp["individ_offset"] + lower_rel_offset)
    df_temp = df_temp.drop(columns=["individ_offset"])

    # create bins for the timeunit in the given offset interval and then cut the column to these bins
    window_length = upper_rel_offset - lower_rel_offset

    lst_bins = list(range(0, window_length + 1, timeunit))

    # a last, shorter bin covers the remainder if the window is not a multiple of the timeunit
    if window_length % timeunit != 0:
        lst_bins.append(window_length)

    df_temp.observationoffset = pd.cut(df_temp.observationoffset, bins=lst_bins, right=True, include_lowest=True)

    # group first by hour (cleaning the data of outliers) and then over the total offset span
    # observed=True: only patient/bin combinations that actually hold data are aggregated
    df_grouped_hours = (df_temp
                        .groupby(["patientunitstayid", "observationoffset"], observed=True)
                        .agg(lambda x: groupby_vitals_per_hour(x, agg_timeunit=agg_timeunit))
                        .reset_index()
                        .drop(columns=["observationoffset"])
                        .dropna()
                        .groupby(["patientunitstayid"])
                        .agg(lambda x: groupby_vitals_total(x, agg_total=agg_total))
                        .reset_index()
                        )

    # make a dataframe with a single column containing the ids and then merge the results to that
    df_pat = pd.DataFrame({'patientunitstayid': lst_cohort_ids})
    clm_name = "{}_{}_individ_offset_rel_{}to{}_u{}".format(vital_name, agg_total, lower_rel_offset, upper_rel_offset,
                                                            timeunit)
    df_pat_final = df_pat.merge(df_grouped_hours, on="patientunitstayid", how="left").rename(
        columns={vital_name: clm_name})

    return df_pat_final


In [19]:
def fast_vitals_combined_individual_offset(df_periodic, df_aperiodic, dict_ids, vital_name, realistic_bounds, relative_offset,
                         agg_timeunit="median", agg_total="median", timeunit=60):
    """
    Takes the both the vitalPeriodic and Aperiodic tables of eICU, target patientunitstayids and their individual offset as a dictionary,
    offset bounds, and realistic bounds and a vitalname and returns the aggregated and cleaned value for that offset timeframe.
    The timeframe of a patient is [individual offset + lower relative offset, individual offset + upper relative offset] (both edges
    inclusive). The invasive (periodic) and non-invasive (aperiodic) measurements are pooled, the timeframe is split into bins of
    `timeunit` minutes, every bin is cleaned of outliers and aggregated with `agg_timeunit`, and the bin values are then aggregated
    with `agg_total`. Pulsepressure is calculated as systolic - diastolic and always uses 20/300 (systolic) and 10/200 (diastolic) as
    bounds; realistic_bounds is not applied to it.
    Possible vitalnames: systolic, diastolic, mean_bp, pulsepressure

    :param df_periodic: Dataframe - vitalPeriodic Dataframe of the eICU Database (abbreviated unless a lot of free RAM)

    :param df_aperiodic: Dataframe - vitalAperiodic Dataframe (abbreviated unless a lot of free RAM)

    :param dict_ids: Dictionary - keys are the patientunitstayids, values the individual offset

    :param vital_name: String, picklist - options "systolic", "diastolic", "mean_bp", "pulsepressure"

    :param realistic_bounds: Tuple - realistic values of the vital in the form of (lower bound, upper bound) eg. (20, 250) for systolic

    :param relative_offset: Tuple - (lower relative time offest, upper relative time offset), lower must be smaller than upper

    :param agg_timeunit: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values per timeunit
    (if the offset-duration is longer than the timeunit)

    :param agg_total: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values to the final value

    :param timeunit: Int, preset 60 - Number of minutes for the timeunit

    :return: Dataframe - one row per key of dict_ids with the columns "patientunitstayid" and
            "{vital_name}_{agg_total}_individ_offset_rel_{lower}to{upper}_u{timeunit}" (NaN if no usable value was found)

    :raises ValueError: if the relative offsets are not ordered
    :raises KeyError: if vital_name is not one of the accepted options
    """

    # dictionary that sorts the vital_names to the respective names of the columns in the periodic and Aperiodic dfs
    dict_vitalnames = {
        "systolic": ("systemicsystolic", "noninvasivesystolic"),
        "diastolic": ("systemicdiastolic", "noninvasivediastolic"),
        "mean_bp": ("systemicmean", "noninvasivemean")
    }

    # open the tuples
    lower_realistic_bound, upper_realistic_bound = realistic_bounds
    lower_rel_offset, upper_rel_offset = relative_offset

    if upper_rel_offset <= lower_rel_offset:
        raise ValueError("relative_offset must be (lower, upper) with lower < upper, got {!r}".format(relative_offset))

    lst_cohort_ids = list(dict_ids.keys())

    # reducing the general dfs to reduce the computational load
    df_reduced_periodic = df_periodic[df_periodic["patientunitstayid"].isin(lst_cohort_ids)].copy()
    df_reduced_aperiodic = df_aperiodic[df_aperiodic["patientunitstayid"].isin(lst_cohort_ids)].copy()

    # next reduction, incorporating the realistic bounds into each dataframe and then adding both
    # dataframes to have one united dataframe with all the vitals values
    # calculate the pulsepressure if this is selected; via systolic BP - diastolic BP
    if vital_name == "pulsepressure":
        df_temp_initial_periodic = df_reduced_periodic.loc[
            (df_reduced_periodic["systemicsystolic"] >= 20) & (df_reduced_periodic["systemicsystolic"] <= 300) &
            (df_reduced_periodic["systemicdiastolic"] >= 10) & (df_reduced_periodic["systemicdiastolic"] <= 200),
            ["patientunitstayid", "observationoffset", "systemicsystolic", "systemicdiastolic"]].copy()

        df_temp_periodic = (df_temp_initial_periodic.assign(
            pulsepressure=df_temp_initial_periodic["systemicsystolic"] - df_temp_initial_periodic["systemicdiastolic"])
                            .drop(columns=["systemicsystolic", "systemicdiastolic"])
                            .dropna()
                            )

        df_temp_initial_aperiodic = df_reduced_aperiodic.loc[
            (df_reduced_aperiodic["noninvasivesystolic"] >= 20) & (df_reduced_aperiodic["noninvasivesystolic"] <= 300) &
            (df_reduced_aperiodic["noninvasivediastolic"] >= 10) & (
                        df_reduced_aperiodic["noninvasivediastolic"] <= 200),
            ["patientunitstayid", "observationoffset", "noninvasivesystolic", "noninvasivediastolic"]].copy()

        df_temp_aperiodic = (df_temp_initial_aperiodic.assign(
            pulsepressure=df_temp_initial_aperiodic["noninvasivesystolic"] - df_temp_initial_aperiodic[
                "noninvasivediastolic"])
                             .drop(columns=["noninvasivesystolic", "noninvasivediastolic"])
                             .dropna()
                             )

    # path for systolic, diastolic and mean_bp
    else:
        key_periodic, key_aperiodic = dict_vitalnames[vital_name]
        df_temp_periodic = df_reduced_periodic.loc[
            (df_reduced_periodic[key_periodic] >= lower_realistic_bound) & (
                        df_reduced_periodic[key_periodic] <= upper_realistic_bound),
            ["patientunitstayid", "observationoffset", key_periodic]].copy().dropna().rename(
            columns={key_periodic: vital_name})
        df_temp_aperiodic = df_reduced_aperiodic.loc[
            (df_reduced_aperiodic[key_aperiodic] >= lower_realistic_bound) & (
                    df_reduced_aperiodic[key_aperiodic] <= upper_realistic_bound),
            ["patientunitstayid", "observationoffset", key_aperiodic]].copy().dropna().rename(
            columns={key_aperiodic: vital_name})

    df_temp_whole = pd.concat([df_temp_periodic, df_temp_aperiodic], ignore_index=True)

    # add the individual offsets
    df_temp_whole["individ_offset"] = df_temp_whole.patientunitstayid.map(dict_ids)

    # filter for only data in the individual timeframes
    df_temp_whole = df_temp_whole[
        (df_temp_whole["observationoffset"] <= (df_temp_whole["individ_offset"] + upper_rel_offset)) &
        (df_temp_whole["observationoffset"] >= (df_temp_whole["individ_offset"] + lower_rel_offset))
        ].copy()

    # Express every observation as minutes since the START of the individual window, so that the values span
    # 0..window_length and line up with the bins below. Shifting by the individual offset alone would leave the
    # observations before the individual offset negative; pd.cut maps those to NaN and the groupby would silently
    # drop them.
    df_temp_whole["observationoffset"] = df_temp_whole["observationoffset"] - (
            df_temp_whole["individ_offset"] + lower_rel_offset)
    df_temp_whole = df_temp_whole.drop(columns=["individ_offset"])

    # create bins for the timeunit in the given offset interval and then cut the column to these bins
    window_length = upper_rel_offset - lower_rel_offset

    lst_bins = list(range(0, window_length + 1, timeunit))

    # a last, shorter bin covers the remainder if the window is not a multiple of the timeunit
    if window_length % timeunit != 0:
        lst_bins.append(window_length)

    df_temp_whole.observationoffset = pd.cut(df_temp_whole.observationoffset, bins=lst_bins, right=True,
                                             include_lowest=True)

    # group first by hour (cleaning the data of outliers) and then over the total offset span
    # observed=True: only patient/bin combinations that actually hold data are aggregated
    df_grouped_all = (df_temp_whole
                      .groupby(["patientunitstayid", "observationoffset"], observed=True)
                      .agg(lambda x: groupby_vitals_per_hour(x, agg_timeunit=agg_timeunit))
                      .reset_index()
                      .drop(columns=["observationoffset"])
                      .dropna()
                      .groupby(["patientunitstayid"])
                      .agg(lambda x: groupby_vitals_total(x, agg_total=agg_total))
                      .reset_index()
                      )

    # make a dataframe with a single column containing the ids and then merge the results to that
    df_pat = pd.DataFrame({'patientunitstayid': lst_cohort_ids})
    clm_name = "{}_{}_individ_offset_rel_{}to{}_u{}".format(vital_name, agg_total, lower_rel_offset, upper_rel_offset, timeunit)
    df_pat_final = df_pat.merge(df_grouped_all, on="patientunitstayid", how="left").rename(
        columns={vital_name: clm_name})

    return df_pat_final

In [20]:
def apply_time_to_death_from_unit_admit(x):
    """
    Row-wise helper: time from ICU admission to death for one patient row.

    :param x: row with the fields hospitaldischargestatus, unitdischargestatus, actualiculos, unabridgedhosplos and
              hospitaladmitoffset

    :return: np.nan if the patient left the hospital alive or the status is neither "Alive" nor "Expired";
             actualiculos if the patient died on the ICU; otherwise the hospital length of stay reduced by the time
             spent in hospital before the ICU admission. Unit presumably days (actualiculos and unabridgedhosplos are
             taken as days, hospitaladmitoffset as minutes) - verify against the eICU schema.
    """
    # if the patient did not die, return NaN
    if x.hospitaldischargestatus == "Alive":
        return np.nan

    # if the patient died in the ICU then the time on ICU is the time-to-death
    if x.unitdischargestatus == "Expired":
        return x.actualiculos

    # if the patient died in the hospital calculate the time to death by subtracting the time to ICU admission
    if x.hospitaldischargestatus == "Expired":
        # hospitaladmitoffset is measured FROM the unit admission TO the hospital admission, so it is negative
        # when the hospital admission came first. The pre-ICU time is therefore -hospitaladmitoffset and
        # subtracting it means ADDING the (negative) offset; the former minus sign lengthened the interval instead.
        return x.unabridgedhosplos + (x.hospitaladmitoffset / (24 * 60))

    return np.nan
    

def time_to_death_from_unit_admission(df_pat, df_apache_res, lst_ids):
    """
    Determine, for every patient of the cohort, how long after the ICU-admission the patient died.
    Patients who did not die and rows with irregularities get np.nan.

    :param df_pat: Dataframe - patient dataframe of the eICU or abbreviated

    :param df_apache_res: Dataframe - apachePatientResult dataframe of the eICU or abbreviated

    :param lst_ids: List - List of patientunitstayids

    return: Dataframe with 2 columns (patientunitstayids and time_to_death_unitadmit)
    """
    lst_patient_clms = ["patientunitstayid", "hospitaldischargestatus", "unitdischargestatus", "hospitaladmitoffset"]
    lst_apache_clms = ["patientunitstayid", 'actualiculos', 'unabridgedhosplos']

    # keep only the cohort rows and the columns needed by the row-wise helper
    df_status = df_pat.loc[df_pat.patientunitstayid.isin(lst_ids), lst_patient_clms].copy()
    df_los = df_apache_res.loc[df_apache_res.patientunitstayid.isin(lst_ids), lst_apache_clms].copy()

    # one table holding everything the helper reads; identical rows are collapsed
    df_rows = df_status.merge(df_los, how="left", on="patientunitstayid")
    df_rows = df_rows.drop_duplicates()

    # evaluate the helper once per row
    df_rows["time_to_death_unitadmit"] = df_rows.apply(apply_time_to_death_from_unit_admit, axis=1)

    return df_rows[["patientunitstayid", "time_to_death_unitadmit"]].copy()

In [21]:
def _admdx_single_column(df_admdx_cohort, lst_ids, clm_name, key_phrases):
    """Build one output column: the admitdxname of all rows whose admitdxpath contains every key phrase."""
    admit_paths = df_admdx_cohort["admitdxpath"]

    # the phrases are plain text (one of them contains dots), so they are matched literally, not as regex
    mask = admit_paths.str.contains(key_phrases[0], na=False, regex=False)
    for key_phrase in key_phrases[1:]:
        mask = mask & admit_paths.str.contains(key_phrase, na=False, regex=False)

    df_new_clm = (df_admdx_cohort.loc[mask, ["patientunitstayid", "admitdxname"]]
                  .rename(columns={"admitdxname": clm_name}))

    # patients without a matching row still get a row, holding NaN
    set_pat_w_data = set(df_new_clm.patientunitstayid.unique().tolist())
    lst_pat_no_data = [x for x in lst_ids if x not in set_pat_w_data]
    df_pat_without_data = pd.DataFrame(lst_pat_no_data, columns=['patientunitstayid'])
    df_pat_without_data[clm_name] = np.nan

    return pd.concat([df_new_clm, df_pat_without_data]).set_index("patientunitstayid")


def get_admissionDx_basicinfo_longformat(df_admDx, lst_ids):
    """
    Takes the AdmissionDx Dataframe and gives out basic information (columns: admdx_admitted_from_OR,
    admdx_non_op_organsystem, admdx_op_organsystem, admdx_non_op_dx, admdx_op_dx) as a longformat dataframe.
    (patientunitstayids as the index)

    :param df_admDx: Dataframe - AdmissionDx Dataframe from eICU or abbreviated

    :param lst_ids: List - List of patientunitstayids of the target population

    :return: Dataframe - indexed by patientunitstayid, NaN where a patient has no entry for a column
    """

    # reduce the big dataframe
    df_temp = df_admDx[df_admDx["patientunitstayid"].isin(lst_ids)].copy()
    # unify the spelling so that "Nonoperative" and "Operative" can be told apart by a case-sensitive search
    df_temp["admitdxpath"] = df_temp["admitdxpath"].str.replace("Non-operative", "Nonoperative", regex=False)

    # output column -> phrases that all have to occur in the admitdxpath of a row
    dict_columns_new = {
        "admdx_admitted_from_OR": ("Was the patient admitted from the O.R.",),
        "admdx_non_op_organsystem": ("Nonoperative Organ Systems",),
        "admdx_op_organsystem": ("Operative Organ Systems",),
        "admdx_non_op_dx": ("All Diagnosis", "Nonoperative"),
        "admdx_op_dx": ("All Diagnosis", "Operative"),
    }

    # NOTE(review): pd.concat(axis=1) needs a unique index per column when the indexes differ; this presumably
    # holds because a stay has at most one entry per category - verify against the data.
    lst_columns = [
        _admdx_single_column(df_temp, lst_ids, clm_name, key_phrases)
        for clm_name, key_phrases in tqdm(dict_columns_new.items())
    ]

    df_final = pd.concat(lst_columns, axis=1)

    return df_final

## Demographic Data

In [22]:
# Read the patient CSV from the sec_PE_data folder (presumably the eICU patient table restricted to the
# secondary-PE cohort - the extract itself is created outside of this cell)
df_pat = pd.read_csv("sec_PE_data/patient_sec_PE.csv", low_memory=False)

In [23]:
# Basic patient information for the secondary-PE stays; the following cell reads the columns
# hospitaldischargestatus, unitdischargestatus, age and gender from the result
df_patinfo_PE = get_basic_patient_info(df_pat, lst_secondary_pe)

In [24]:
# encode both discharge status columns as integers: 1 = "Expired", 0 = "Alive" (any other value becomes NaN)
for status_clm in ("hospitaldischargestatus", "unitdischargestatus"):
    df_patinfo_PE[status_clm] = df_patinfo_PE[status_clm].map({"Expired": 1, "Alive": 0})

# demographic components of the PESI: the age itself and the gender (1 = "Male", 0 = "Female")
df_patinfo_PE["PESI_age"] = df_patinfo_PE["age"]
df_patinfo_PE["PESI_gender"] = df_patinfo_PE["gender"].map({"Male": 1, "Female":0})

# age component of the sPESI: 1 if older than 80 years, otherwise 0 (a missing age also yields 0)
df_patinfo_PE["sPESI_age"] = df_patinfo_PE["PESI_age"].gt(80).astype("int64")

## Comorbidities 

In [25]:
# Read the pastHistory CSV from the sec_PE_data folder (presumably the eICU pastHistory table restricted to the
# secondary-PE cohort)
df_pmh = pd.read_csv("sec_PE_data/pastHistory_sec_PE.csv", low_memory=False)

In [26]:
# Past medical history features for the secondary-PE stays (the pmh_* columns used further below are
# presumably created here - get_pastHistory is defined earlier in the notebook)
df_pmhinfo = get_pastHistory(df_pmh, lst_secondary_pe)

100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [00:03<00:00, 18.19it/s]


In [27]:
# Start the final dataframe: demographic data on the left, the comorbidities joined to it per ICU-stay
df_pe = df_patinfo_PE.merge(df_pmhinfo, on="patientunitstayid", how="left")

## APACHE IVa and associated variables

In [28]:
# Read the apachePredVar and apachePatientResult CSVs from the sec_PE_data folder (presumably restricted to the
# secondary-PE cohort)
df_predVar = pd.read_csv("sec_PE_data/apachePredVar_sec_PE.csv", low_memory=False)
df_apacheresult = pd.read_csv("sec_PE_data/apachePatientResult_sec_PE.csv", low_memory=False)

In [29]:
# Cleaned APACHE variables and APACHE results for the secondary-PE stays (both helpers are defined earlier in
# the notebook; the pred_* columns used further below presumably come from the first one)
df_predvarinfo = get_cleaned_apachePredVar_basics(df_predVar, lst_secondary_pe)
df_apacheresult_info = get_cleaned_apachePatientResult_basics(df_apacheresult, lst_secondary_pe)

In [30]:
# Attach the APACHE variables and then the APACHE results to the growing final dataframe (left joins per ICU-stay)
df_pe = (
    df_pe
    .merge(df_predvarinfo, on="patientunitstayid", how="left")
    .merge(df_apacheresult_info, on="patientunitstayid", how="left")
)

In [31]:
### the PESI pmh components:
def _known_nonzero(series):
    """Boolean mask: True where the value is documented (not NaN) and different from 0."""
    # `NaN != 0` evaluates to True, so without the notna() check a MISSING past-history value would be
    # counted as a PRESENT comorbidity
    return series.notna() & (series != 0)


# chronic pulmonary disease: if the patient has COPD or asthma or restrictive lung disease or uses home-O2 or is s/p LuTx
lst_pulm_clms = ['pmh_COPD', 'pmh_asthma', 'pmh_home_o2', 'pmh_restrictive_lung_disease', 'pmh_s_p_lungTx']
pulm_present = (_known_nonzero(df_pe['pmh_COPD']) | _known_nonzero(df_pe['pmh_asthma']) |
                _known_nonzero(df_pe['pmh_home_o2']) | _known_nonzero(df_pe['pmh_restrictive_lung_disease']) |
                _known_nonzero(df_pe['pmh_s_p_lungTx']))
df_pe["PESI_pulm"] = 0
df_pe.loc[pulm_present, "PESI_pulm"] = 1
# unknown (NaN) only if none of the components is documented
df_pe.loc[df_pe[lst_pulm_clms].isna().all(axis=1), "PESI_pulm"] = np.nan

# heart failure (unknown if the CHF history is not documented, in line with the other PESI components)
df_pe["PESI_hf"] = 0
df_pe.loc[_known_nonzero(df_pe["pmh_CHF"]), "PESI_hf"] = 1
df_pe.loc[df_pe["pmh_CHF"].isna(), "PESI_hf"] = np.nan

# cancer: if the patient has any type of cancer or has received cancer therapy in the past
# (the pred_* columns are compared with > 0, so NaN and negative values never count as present)
lst_cancer_clms = ["pmh_cancer_therapy", "pmh_cancer", "pred_lymphoma", "pred_metastaticcancer", "pred_leukemia"]
cancer_present = (_known_nonzero(df_pe["pmh_cancer_therapy"]) |
                  (df_pe["pred_metastaticcancer"] > 0) |
                  _known_nonzero(df_pe["pmh_cancer"]) |
                  (df_pe["pred_lymphoma"] > 0) |
                  (df_pe["pred_leukemia"] > 0))
df_pe["PESI_cancer"] = 0
df_pe.loc[cancer_present, "PESI_cancer"] = 1
# unknown (NaN) only if none of the components is documented
df_pe.loc[df_pe[lst_cancer_clms].isna().all(axis=1), "PESI_cancer"] = np.nan

### the sPESI pmh component:
# chronic cardiopulmonary disease: if the patient had either chronic pulmonary disease or a range of cardiac PMH
# NOTE(review): this column stays free of NaN as before; patients without any documented history now get 0
# instead of 1 - decide whether NaN would be the better encoding for them.
cardiopulm_present = ((df_pe["PESI_pulm"] > 0) |
                      (df_pe["PESI_hf"] > 0) |
                      _known_nonzero(df_pe["pmh_MI"]) |
                      _known_nonzero(df_pe["pmh_pacemaker"]) |
                      _known_nonzero(df_pe["pmh_AICD"]) |
                      (df_pe["pred_midur"] > 0) |
                      _known_nonzero(df_pe["pmh_CA_bypass"]))
df_pe["sPESI_cardiopulm"] = cardiopulm_present.astype("int64")

## Infusions

In [32]:
# Read the infusionDrug CSV from the sec_PE_data folder (presumably the eICU infusionDrug table restricted to the
# secondary-PE cohort)
df_infusions = pd.read_csv("sec_PE_data/infusionDrug_sec_PE.csv", low_memory=False)

In [33]:
# Infusion feature in the window from 720 min before to 720 min after the individual offset of every patient;
# the result column is renamed to the name used for the score component
df_inf_result_clm = (
    get_infusion_drugs_individual_offset(
        df_infusion=df_infusions,
        dict_ids=dict_sec_pe_adjusted,
        relative_offset=(-720, 720),
    )
    .rename(columns={"infusion_vaso_ino": "ICU_sPESI_infusion_vaso_ino"})
)

In [34]:
# Attach the infusion column to the growing final dataframe (left join per ICU-stay)
df_pe = pd.merge(df_pe, df_inf_result_clm, on="patientunitstayid", how="left")

## AMS

In [35]:
# Read the nurseCharting and physicalExam CSVs from the sec_PE_data folder (presumably restricted to the
# secondary-PE cohort); both frames are reused by later cells
df_nurseCharting = pd.read_csv("sec_PE_data/nurseCharting_sec_PE.csv", low_memory=False)
df_physical = pd.read_csv("sec_PE_data/physicalExam_sec_PE.csv", low_memory=False)

In [36]:
# AMS is defined as a GCS verbal score < 5 (less than the full score); it is searched for in the window from
# 720 min before to 720 min after the individual offset and stored as the PESI component "PESI_ams"
df_ams_res_clm = (
    get_AMS_from_individual_offset_GCS(
        df_physical=df_physical,
        df_nurse_chart=df_nurseCharting,
        dict_ids=dict_sec_pe_adjusted,
        relative_offset=(-720, 720),
    )
    .rename(columns={"ams_individ_start_-720to720": "PESI_ams"})
)

In [37]:
# Attach the AMS column to the growing final dataframe (left join per ICU-stay)
df_pe = pd.merge(df_pe, df_ams_res_clm, on="patientunitstayid", how="left")

## Intubation

In [38]:
# Read the respiratoryCharting and respiratoryCare CSVs from the sec_PE_data folder (presumably restricted to the
# secondary-PE cohort)
df_resp_chart = pd.read_csv("sec_PE_data/respiratoryCharting_sec_PE.csv", low_memory=False)
df_resp_care = pd.read_csv("sec_PE_data/respiratoryCare_sec_PE.csv", low_memory=False)

In [39]:
# Mechanical ventilation in the window from 720 min before to 720 min after the individual offset of every
# patient; the result column is renamed to the name used for the score component
df_intub_res_clm = (
    get_mechanically_ventilated_individual_offset(
        df_resp_chart=df_resp_chart,
        df_resp_care=df_resp_care,
        df_physical=df_physical,
        df_nurse_chart=df_nurseCharting,
        dict_ids=dict_sec_pe_adjusted,
        relative_offset=(-720, 720),
    )
    .rename(columns={"mech_vent_individ_start_-720to720": "ICU_sPESI_intub"})
)

In [40]:
# Attach the intubation column to the growing final dataframe (left join per ICU-stay)
df_pe = pd.merge(df_pe, df_intub_res_clm, on="patientunitstayid", how="left")

## Vitals

In [41]:
# Read the vitalPeriodic and vitalAperiodic CSVs from the sec_PE_data folder (presumably restricted to the
# secondary-PE cohort)
df_periodic = pd.read_csv("sec_PE_data/vitalPeriodic_sec_PE.csv", low_memory=False)
df_a_periodic = pd.read_csv("sec_PE_data/vitalAperiodic_sec_PE.csv", low_memory=False)

In [42]:
### "worst" vitals for the score calculation
# Per vital: [(lower, upper) limits handed over as realistic_bounds,
#             aggregation handed over as agg_total ("max" or "min"),
#             cut-off used for the 0/1 PESI flag]
dict_vitals = {
    "heartrate": [(20, 200), "max", 109],
    "temperature": [(32, 43), "min", 36],
    "respiration": [(3, 80), "max", 29],
    "sao2": [(50, 100), "min", 90]
}

dict_vitals_combined = {
    "systolic": [(20, 250), "min", 100]
}


def _pesi_flag_frame(df_vital, key, direction, thresh):
    """Rename the aggregated vital column to PESI_<key> and replace it by a 0/1 flag.

    For direction "max" the flag is 1 when the value is above thresh, otherwise it is 1 when the
    value is below thresh. Values for which the comparison is False (including NaN) become 0.
    """
    clm_name = "PESI_{}".format(key)
    internal_clm_name = "{}_{}_individ_offset_rel_-720to720_u30".format(key, direction)
    df_clm = df_vital.rename(columns={internal_clm_name: clm_name})

    if direction == "max":
        beyond_cutoff = lambda value: value > thresh
    else:
        beyond_cutoff = lambda value: value < thresh

    df_clm[clm_name] = df_clm[clm_name].map(lambda value: 1 if beyond_cutoff(value) else 0)
    return df_clm


lst_clms_pesi_vitals = []

# for the "periodic" vitals
for key, (realistic, direction, thresh) in tqdm(dict_vitals.items()):
    # temperature is the only vital that additionally receives the nurse charting table
    extra_kwargs = {}
    if key == "temperature":
        extra_kwargs = {"temp_nurse_bool": True, "temp_nursechart": df_nurseCharting}

    df_temp = fast_vitals_periodic_individual_offset(
        df_periodic=df_periodic,
        dict_ids=dict_sec_pe_adjusted,
        vital_name=key,
        realistic_bounds=realistic,
        relative_offset=(-720, 720),
        agg_total=direction,
        timeunit=30,
        **extra_kwargs)

    lst_clms_pesi_vitals.append(_pesi_flag_frame(df_temp, key, direction, thresh))

# for vital "systolic" that uses both the periodic and aperiodic datafiles
for key, (realistic, direction, thresh) in tqdm(dict_vitals_combined.items()):
    df_temp = fast_vitals_combined_individual_offset(
        df_periodic=df_periodic,
        df_aperiodic=df_a_periodic,
        dict_ids=dict_sec_pe_adjusted,
        vital_name=key,
        realistic_bounds=realistic,
        relative_offset=(-720, 720),
        agg_total=direction,
        timeunit=30)

    lst_clms_pesi_vitals.append(_pesi_flag_frame(df_temp, key, direction, thresh))

# put all flag columns side by side, aligned on the ICU stay id
lst_clms_pesi_vitals_indexed = [df_flag.set_index("patientunitstayid") for df_flag in lst_clms_pesi_vitals]
df_final_vitals_pesi = pd.concat(lst_clms_pesi_vitals_indexed, axis=1)


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:12<00:00,  3.24s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.69s/it]


In [43]:
# Left-join the vital-sign flags onto the growing final dataframe (every row of df_pe is retained)
df_pe = pd.merge(df_pe, df_final_vitals_pesi, on="patientunitstayid", how="left")

## Time-to-death

In [44]:
# Time-to-death table for the stays listed in lst_secondary_pe
ttd_inputs = {
    "df_pat": df_pat,
    "df_apache_res": df_apacheresult,
    "lst_ids": lst_secondary_pe,
}
df_ttd_pe = time_to_death_from_unit_admission(**ttd_inputs)

In [45]:
# Left-join the time-to-death columns onto the growing final dataframe (every row of df_pe is retained)
df_pe = pd.merge(df_pe, df_ttd_pe, on="patientunitstayid", how="left")

## Time to PE

In [46]:
# Offset of the first PE diagnosis per stay; negative offsets are set to 0, missing values stay missing
df_pe["time_to_PE_diag"] = df_pe["patientunitstayid"].map(dict_sec_pe_times).clip(lower=0)

## Admission Diagnosis / from OR

In [47]:
# Load the admission-diagnosis extract, bring it into long format for the cohort and attach it to df_pe
df_admdx = pd.read_csv("sec_PE_data/admissiondx_sec_PE.csv", low_memory=False)
df_admdx_long = get_admissionDx_basicinfo_longformat(df_admdx, lst_secondary_pe)

df_pe = pd.merge(df_pe, df_admdx_long, on="patientunitstayid", how="left")

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 34.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 36.36it/s]


# PESI score calculation

In [48]:
## PESI score and PESI classes
# Points added on top of the age for every PESI item that is present (item column holds 0/1).
# The order of the list is the order in which the terms are summed.
pesi_item_points = [
    ("PESI_gender", 10),
    ("PESI_cancer", 30),
    ("PESI_hf", 10),
    ("PESI_pulm", 10),
    ("PESI_heartrate", 20),
    ("PESI_systolic", 30),
    ("PESI_respiration", 20),
    ("PESI_temperature", 20),
    ("PESI_ams", 60),
    ("PESI_sao2", 20),
]


def _pesi_total(row):
    """Return the age plus the weighted PESI items of one row of df_pe."""
    total = row["age"]
    for item_clm, points in pesi_item_points:
        total = total + row[item_clm] * points
    return total


# calculate the PESI scores
df_pe["PESI_score"] = df_pe.apply(_pesi_total, axis=1)

# calculate the PESI classes (scores outside every range, e.g. missing scores, keep NaN)
df_pe["PESI_class"] = np.nan
pesi_score = df_pe["PESI_score"]
pesi_class_masks = [
    (5, pesi_score > 125),
    (4, (pesi_score >= 106) & (pesi_score <= 125)),
    (3, (pesi_score >= 86) & (pesi_score <= 105)),
    (2, (pesi_score >= 66) & (pesi_score <= 85)),
    (1, pesi_score < 66),
]
for pesi_class, in_class in pesi_class_masks:
    df_pe.loc[in_class, "PESI_class"] = pesi_class

# sPESI score calculation

In [49]:
# sPESI scores: one point per item, summed in the listed order (a missing item makes the score missing)
spesi_items = ["sPESI_age", "PESI_cancer", "sPESI_cardiopulm", "PESI_heartrate", "PESI_systolic", "PESI_sao2"]
spesi_total = df_pe[spesi_items[0]]
for spesi_item in spesi_items[1:]:
    spesi_total = spesi_total + df_pe[spesi_item]
df_pe["sPESI_score"] = spesi_total

# ICU-sPESI calculation

In [50]:
# ICU-sPESI scores: sPESI plus one point each for AMS, vasopressor/inotrope infusion and intubation
icu_spesi_total = df_pe["sPESI_score"]
for icu_item in ["PESI_ams", "ICU_sPESI_infusion_vaso_ino", "ICU_sPESI_intub"]:
    icu_spesi_total = icu_spesi_total + df_pe[icu_item]
df_pe["ICU_sPESI_score"] = icu_spesi_total

# Patient exclusion

In [51]:
df_pe.shape

(1276, 72)

&rarr; 1276 patients total

In [52]:
# keep only stays whose PE diagnosis offset is at most 48 h (offsets are in minutes); missing offsets are dropped
max_minutes_to_pe_diag = 48 * 60
df_pe = df_pe.loc[df_pe["time_to_PE_diag"].le(max_minutes_to_pe_diag)].copy()

In [53]:
df_pe.shape

(1121, 72)

&rarr; 1121 patients with a secondary PE diagnosis within the first 48 h after ICU admission

In [54]:
# excluding patients with missing gender, APACHE score, hospital discharge status or APACHE admission diagnosis
required_clms = ["gender", "apachescore", "hospitaldischargestatus", "apacheadmissiondx"]
df_pe = df_pe.dropna(subset=required_clms)

In [55]:
df_pe.shape

(905, 72)

&rarr; 216 patients excluded due to missing data

In [56]:
# excluding patients with an age < 18 (rows with a missing age are removed as well)
df_pe = df_pe.loc[df_pe["age"].ge(18)].copy()

In [57]:
df_pe.shape

(905, 72)

&rarr; 0 patients excluded due to age < 18 years old

In [58]:
# excluding patients with admission diagnoses indicative of a primary PE diagnosis
# NOTE(review): the list contains an empty string; it only matches rows whose admission diagnosis
# is literally "" - confirm whether it is intentional.
lst_dx_to_drop = [
    "Thrombus, arterial",
    "Thrombectomy (with general anesthesia)",
    "Embolectomy (with general anesthesia)",
    "",
    "Thrombosis, vascular (deep vein)",
    "Hemorrhage/hemoptysis, pulmonary",
    "Shock, cardiogenic",
    "Chest pain, atypical (noncardiac chest pain)",
    "Chest pain, respiratory",
    "Chest pain, unknown origin",
    "Vena cava filter insertion",
]
has_excluded_dx = df_pe["apacheadmissiondx"].isin(lst_dx_to_drop)
df_pe = df_pe.loc[~has_excluded_dx].copy()

In [59]:
df_pe.shape

(812, 72)

&rarr; 93 patients excluded due to admission diagnoses indicative of primary PE

&rarr; Final cohort of 812 patients with secondary PE

# Further data processing

## Functions


In [60]:
def map_pmh_cancer(x):
    """Translate a raw cancer history entry into a cancer group.

    x is either 0 (no cancer, returned unchanged as 0) or a "|"-separated string. When exactly
    one part contains "Cancer-Primary Site/", the text after the first "/" of that part is the
    site; several such parts give the site "multiple"; no such part gives the site "other".
    The site is then looked up in the site -> group table; an unlisted site raises KeyError.
    """
    if x == 0:
        return 0

    site_marker = "Cancer-Primary Site/"

    # determine the cancer site
    if site_marker not in x:
        cancer_site = "other"
    else:
        site_entries = [entry for entry in x.split("|") if site_marker in entry]
        if len(site_entries) > 1:
            cancer_site = "multiple"
        else:
            cancer_site = site_entries[0].split("/")[1]

    # site -> group; the keys represent all unique values for this column for this cohort
    dict_cancer_sites = {
        0: "No_cancer",
        'none': "No_cancer",
        'other': "other",
        'breast': "Breast",
        'lung': "Respiratory",
        'colon': "GI",
        'prostate': "Genitourinary",
        'uterus': "Genitourinary",
        'bladder': "Genitourinary",
        'pancreas - adenocarcinoma': "GI",
        'melanoma': "other",
        'brain': "other",
        'kidney': "Genitourinary",
        'ovary': "Genitourinary",
        'esophagus': "GI",
        'bile duct': "GI",
        'liver': "GI",
        'multiple': "other",
        'bone': "other",
        'head and neck': "other",
        'sarcoma': "other",
        'unknown': "other",
        'testes': "Genitourinary",
        'stomach': "GI"
    }

    return dict_cancer_sites[cancer_site]

## Data processing

In [61]:
# encode gender numerically; any value other than "Female"/"Male" becomes NaN
gender_codes = {"Female": 0, "Male": 1}
df_pe["gender"] = df_pe["gender"].map(gender_codes)

In [62]:
# Binarize certain PMH columns
lst_clms_binary = ["pmh_HT_with_treatment", "pmh_MI", "pmh_angina", "pmh_strokes", "pmh_periph_vasc_disease", "pmh_CA_bypass",
                  "pmh_PCI", "pmh_pacemaker", "pmh_AICD", "pmh_venous_thrombosis", "pmh_asthma",
                  "pmh_CHF", "pmh_restrictive_lung_disease", "pmh_card_valvular", 'pmh_home_o2', 'pmh_seizures', 'pmh_dementia', 'pmh_neuromusk_disease',
                  'pmh_intracranial_mass', 'pmh_liver_cirrhosis']

# every recorded (non-zero, non-missing) entry becomes 1; zeros and missing values are left untouched
for i in lst_clms_binary:
    df_pe.loc[(df_pe[i] != 0) & (df_pe[i].notna()), i] = 1


# original column -> name of the additional 0/1 indicator column (the original column is kept)
dict_clms_to_binary = {
    "pmh_cancer": "pmh_cancer_binary",
    "pmh_insulin_dep_DM" : "pmh_diabetes_binary",
    "pmh_COPD": "pmh_COPD_binary",
    "pmh_arrhythmias": "pmh_arrhythmias_binary",
    "pmh_renal_insuff": "pmh_renal_insuff_binary",
    "pmh_renal_failure": "pmh_renal_failure_binary"
}

# NaN never compares equal to anything, so the former check `x != np.nan` was always True and
# missing values were flagged as 1. pd.notna() performs the intended missing-value test:
# 1 = entry recorded, 0 = no entry or missing.
for orig_clm, new_clm in dict_clms_to_binary.items():
    df_pe[new_clm] = df_pe[orig_clm].map(lambda x: 1 if (pd.notna(x) and x != 0) else 0)


In [63]:
# group the cancer column depending on the site of the cancer (for details see function above)
df_pe["pmh_cancer_grouped"] = df_pe["pmh_cancer"].map(map_pmh_cancer)

In [64]:
# grouping the diabetes column depending on the type of diabetes (insulin dependent, only medication dependent and without any medication)
def map_pmh_diabetes(x):
    """Collapse a raw pmh_insulin_dep_DM entry into a treatment category.

    Checked in this order: 0 is returned unchanged; text containing "non-medication" gives
    "dm_without_treatment"; text equal to "medication dependent" gives "medication_only";
    text containing "insulin" gives "including_Insulin"; any other text gives None.
    """
    if x == 0:
        return 0

    if "non-medication" in x:
        category = "dm_without_treatment"
    elif x == "medication dependent":
        category = "medication_only"
    elif "insulin" in x:
        category = "including_Insulin"
    else:
        category = None

    return category

# derive the diabetes treatment category from the raw diabetes column
df_pe["pmh_diabetes"] = df_pe["pmh_insulin_dep_DM"].map(map_pmh_diabetes)

In [65]:
# grouping the COPD column depending on the severity of the COPD
# the keys of the dictionary represent all unique values for this column for this cohort;
# any value that is not a key of the dictionary becomes NaN
dict_pmhCOPD = {
    0: 0,
    "COPD  - moderate": "COPD_moderate",
    "COPD  - no limitations": "COPD_mild",
    "COPD  - severe": "COPD_severe",
    "COPD  - moderate|COPD  - severe": "COPD_severe",
    "COPD  - no limitations|COPD  - severe": "COPD_severe",
    # previously mapped to "COPD_ severe" (stray space), which created a separate category
    "COPD severe": "COPD_severe"
}

df_pe["pmh_COPD"] = df_pe["pmh_COPD"].map(dict_pmhCOPD)

In [66]:
# grouping the cardiac arrhythmias column depending on whether the recorded arrhythmias included atrial fibrillation or not
def map_pmh_arrhythmias(x):
    """Return 0 for 0, "Afib_orwith" if the text mentions "atrial fibrillation", else "other_arrhythmia"."""
    if x == 0:
        return 0

    return "Afib_orwith" if "atrial fibrillation" in x else "other_arrhythmia"


# replace the raw arrhythmia entries by their group
df_pe["pmh_arrhythmias"] = df_pe["pmh_arrhythmias"].map(map_pmh_arrhythmias)

In [67]:
# grouping the renal failure column depending on whether the patients were on dialysis or not
# the keys of the dictionary represent all unique values for this column for this cohort;
# any value that is not a key of the dictionary becomes NaN
label_on_dialysis = "renal_fail_w_dialysis"
label_not_dialyzed = "renal_fail_no_dialysis"

dict_renal_failure = {
    0: 0,
    "renal failure - hemodialysis": label_on_dialysis,
    "renal failure- not currently dialyzed": label_not_dialyzed,
    "renal failure - peritoneal dialysis": label_on_dialysis,
}

df_pe["pmh_renal_failure"] = df_pe["pmh_renal_failure"].map(dict_renal_failure)

In [68]:
# grouping the previous PE column depending on whether it was a single or multiple previous PE
def map_pmh_PE(x):
    """Return 0 for 0, "multiple_PE" if the text contains "multiple", else "single_PE"."""
    if x == 0:
        return 0

    return "multiple_PE" if "multiple" in x else "single_PE"

# replace the raw previous-PE entries by their group
df_pe["pmh_PE"] = df_pe["pmh_PE"].map(map_pmh_PE)

In [69]:
### grouping other comorbidities
# A grouped comorbidity is 1 only when at least one component is positively recorded.
# Missing components count as "not recorded": the former `!= np.nan` checks were always True and the
# former `!= 0` checks were True for NaN, so missing values were counted as present.
def _any_flag_set(lst_clms):
    """Return a 0/1 Series that is 1 where at least one of the given df_pe columns equals 1."""
    return df_pe[lst_clms].eq(1).any(axis=1).astype("int64")


def _is_recorded(clm):
    """Return a boolean Series that is True where the df_pe column is neither missing nor 0."""
    return df_pe[clm].notna() & df_pe[clm].ne(0)


# Coronary artery disease and other large vessel disease: myocardial infarction, angina, strokes, peripheral vascular disease,
# coronary artery bypass, percutaneous coronary intervention
df_pe["pmh_CAD_and_other_large_vessel"] = _any_flag_set(
    ["pmh_MI", "pmh_angina", "pmh_strokes", "pmh_periph_vasc_disease", "pmh_CA_bypass", "pmh_PCI"])

# pacemaker: either a normal pacemaker or an AICD
df_pe["pmh_any_pacemaker"] = _any_flag_set(["pmh_pacemaker", "pmh_AICD"])

# Venous thromboses & PE (pmh_PE holds 0 or a category label at this point)
df_pe["pmh_venous_thromb_and_PE"] = (df_pe["pmh_venous_thrombosis"].eq(1) | _is_recorded("pmh_PE")).astype("int64")

# obstructive lung disease: COPD and asthma (pmh_COPD holds 0 or a severity label at this point)
df_pe["pmh_obstructive_LD"] = (_is_recorded("pmh_COPD") | df_pe["pmh_asthma"].eq(1)).astype("int64")

# arrhythmias & pacemakers
df_pe["pmh_cardiac_arrythmias_incl_pacemakers"] = _any_flag_set(["pmh_arrhythmias_binary", "pmh_any_pacemaker"])

# obstructive + restrictive LD
df_pe["pmh_obs_restr_LD"] = _any_flag_set(["pmh_obstructive_LD", "pmh_home_o2", "pmh_restrictive_lung_disease"])

# All neuro
df_pe["pmh_neuro_grouped"] = _any_flag_set(
    ["pmh_seizures", "pmh_dementia", "pmh_intracranial_mass", "pmh_neuromusk_disease"])


In [70]:
## grouping the admission diagnoses
# the surgical admission diagnoses
# start with an all-missing object column so that text labels can be written into it
df_pe["admdx_group_descriptive"] = pd.Series(np.nan, index=df_pe.index, dtype=object)

# every stay admitted from the OR starts as "surgery_other" ...
df_pe.loc[df_pe["admdx_admitted_from_OR"] == "Yes", "admdx_group_descriptive"] = "surgery_other"

# ... and is overwritten with a specific label when the operative organ system is one of these
dict_surgery_labels = {
    "Gastrointestinal": "surgery_GI",
    "Neurologic": "surgery_neuro",
    "Cardiovascular": "surgery_card_vasc",
    "Respiratory": "surgery_respiratory",
}
for organ_system, surgery_label in dict_surgery_labels.items():
    df_pe.loc[df_pe["admdx_op_organsystem"] == organ_system, "admdx_group_descriptive"] = surgery_label

In [71]:
# non-surgical admission diagnoses
# The assignments below run top to bottom and each one overwrites the label of every row its mask
# selects, so the order of the statements matters.
# Sepsis: any admission diagnosis containing "sepsis" (case-insensitive)
df_pe.loc[df_pe.apacheadmissiondx.str.contains("sepsis", regex=False, case=False), "admdx_group_descriptive"] = "non_surg_sepsis"

# Whole organ systems: one label per non-operative organ system (musculoskeletal/skin and trauma share a label)
df_pe.loc[df_pe.admdx_non_op_organsystem == "Genitourinary", "admdx_group_descriptive"] = "non_surg_genitourinary_all"
df_pe.loc[df_pe.admdx_non_op_organsystem == "Hematology", "admdx_group_descriptive"] = "non_surg_hematology_all"
df_pe.loc[df_pe.admdx_non_op_organsystem == "Metabolic/Endocrine", "admdx_group_descriptive"] = "non_surg_endocrine_all"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Musculoskeletal/Skin") | (df_pe.admdx_non_op_organsystem == "Trauma"), "admdx_group_descriptive"] = "non_surg_msk_trauma_all"

# GI: bleeding/hemorrhage diagnoses first; every other GI row (whatever its current label) becomes "other"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Gastrointestinal") & 
          (df_pe.apacheadmissiondx.str.contains("bleeding", regex=False, case=False) | (df_pe.apacheadmissiondx.str.contains("hemorrhage", regex=False, case=False))), "admdx_group_descriptive"] = "non_surg_GI_bleed"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Gastrointestinal") & (df_pe.admdx_group_descriptive != "non_surg_GI_bleed"), "admdx_group_descriptive"] = "non_surg_GI_other"

# Neuro: stroke / intracranial bleed diagnoses first; every other neuro row (whatever its current label) becomes "other"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Neurologic") & 
          df_pe.apacheadmissiondx.isin(["CVA, cerebrovascular accident/stroke", "Hemorrhage/hematoma, intracranial", "Hematoma, subdural"]), "admdx_group_descriptive"] = "non_surg_neuro_stroke_bleed"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Neurologic") & (df_pe.admdx_group_descriptive != "non_surg_neuro_stroke_bleed"), "admdx_group_descriptive"] = "non_surg_neuro_other"

# Respiratory: specific diagnoses first; "other" is only written to rows that still have no label at all
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Respiratory") & 
          df_pe.apacheadmissiondx.str.contains("pneumonia", regex=False, case=False), "admdx_group_descriptive"] = "non_surg_resp_pneumonia"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Respiratory") & (df_pe.apacheadmissiondx == "Emphysema/bronchitis"), "admdx_group_descriptive"] = "non_surg_resp_emphys_bronchitis"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Respiratory") & (df_pe.apacheadmissiondx == "Arrest, respiratory (without cardiac arrest)"), "admdx_group_descriptive"] = "non_surg_resp_resparrest"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Respiratory") & (df_pe.apacheadmissiondx == "ARDS-adult respiratory distress syndrome, non-cardiogenic pulmonary edema"), "admdx_group_descriptive"] = "non_surg_resp_ards"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Respiratory") & (df_pe.admdx_group_descriptive.isna()), "admdx_group_descriptive"] = "non_surg_resp_other"

# Cardiovascular: specific diagnoses first; "other" is only written to rows that still have no label at all,
# so cardiovascular rows already labelled above (e.g. as sepsis) keep that label
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Cardiovascular") & 
          df_pe.apacheadmissiondx.isin(["Cardiac arrest (with or without respiratory arrest; for respiratory arrest see Respiratory System)", 
                                        "Infarction, acute myocardial (MI)", "Angina, unstable (angina interferes w/quality of life or meds are tolerated poorly)", 
                                        "MI admitted > 24 hrs after onset of ischemia", "Angina, stable (asymp or stable pattern of symptoms w/meds)"]), 
          "admdx_group_descriptive"] = "non_surg_cardio_mi_arrest_angina"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Cardiovascular") & (df_pe.apacheadmissiondx == "CHF, congestive heart failure"), "admdx_group_descriptive"] = "non_surg_cardio_chf"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Cardiovascular") & 
          df_pe.apacheadmissiondx.str.contains("Rhythm disturbance", case=False, regex=False), "admdx_group_descriptive"] = "non_surg_cardio_arrythmia"
df_pe.loc[(df_pe.admdx_non_op_organsystem == "Cardiovascular") & (df_pe.admdx_group_descriptive.isna()), "admdx_group_descriptive"] = "non_surg_cardio_other"

In [72]:
# extract organ system
def extract_organ_system(row):
    """Derive the coarse organ-system label from an admdx_group_descriptive value.

    Parameters
    ----------
    row : str or missing value
        A label such as "non_surg_GI_bleed" or "surgery_neuro". Despite the name this is a
        single cell value, not a dataframe row.

    Returns
    -------
    str or None
        The third "_"-separated part for labels starting with "non" (e.g. "GI"), the text
        "surgery" for labels starting with "surgery", and None for everything else. Missing
        or non-text values (e.g. NaN for stays without a group) and "non" labels with fewer
        than three parts also give None instead of raising.
    """
    # stays without an assigned group hold NaN (a float), which has no .split()
    if not isinstance(row, str):
        return None

    parts = row.split('_')

    if parts[0] == 'non':
        return parts[2] if len(parts) > 2 else None
    if parts[0] == 'surgery':
        return "surgery"
    return None

# Derive the organ-system label of every stay from its admission-diagnosis group
df_pe['admdx_organ_descriptive'] = df_pe['admdx_group_descriptive'].map(extract_organ_system)

# Export 


In [73]:
# write the final secondary-PE cohort table to an Excel file (the row index is not written)
export_path = "sec_PE_data/secondary_PE_final.xlsx"
df_pe.to_excel(export_path, index=False)