In [2]:
# Import the necessary python modules
import pandas as pd # v.1.4.3
import numpy as np # v.1.4.3

from tqdm import tqdm # v.4.65.0

import math # python v.3.9.0

# Identifying the patient population

To identify the patients with community-acquired pneumonia (CAP), we selected patients with pneumonia or pulmonary sepsis as their main admission diagnosis, an ICU admission less than 24 hours after hospital admission, and a hospital admission that was either direct or via the emergency department, the floor/acute care, or the chest pain center. Stays with a missing hospital admission source are kept as well.

In [3]:
# Load the complete eICU patient table; every later cohort filter starts from this frame.
df_pat_all = pd.read_csv("../eICU_data/patient.csv", low_memory=False)

In [4]:
# Collect every distinct APACHE admission diagnosis label that contains "Pneumonia".
# value_counts() drops missing values by default, so the substring test only sees strings.
lst_pneumonias = [i for i in df_pat_all.apacheadmissiondx.value_counts().index.to_numpy() if "Pneumonia" in i]
lst_pneumonias

['Pneumonia, bacterial',
 'Pneumonia, aspiration',
 'Pneumonia, other',
 'Pneumonia, viral',
 'Pneumonia, fungal',
 'Pneumonia, parasitic (i.e., Pneumocystic pneumonia)']

In [5]:
# Display-only cell: list the sepsis admission diagnoses to find the pulmonary one.
[i for i in df_pat_all.apacheadmissiondx.value_counts().index.to_numpy() if "Sepsis" in i]

['Sepsis, pulmonary',
 'Sepsis, renal/UTI (including bladder)',
 'Sepsis, GI',
 'Sepsis, unknown',
 'Sepsis, cutaneous/soft tissue',
 'Sepsis, other',
 'Sepsis, gynecologic']

In [6]:
# Pulmonary sepsis counts as pneumonia for the cohort definition.
# Guarded so that re-running this cell does not append the label a second time.
if 'Sepsis, pulmonary' not in lst_pneumonias:
    lst_pneumonias.append('Sepsis, pulmonary')

In [7]:
# Build the CAP cohort. A row is kept when all three conditions hold:
#   1. the admission diagnosis is one of the pneumonia labels or pulmonary sepsis,
#   2. hospitaladmitoffset > -1440, i.e. the hospital admission lies less than 24 h before the
#      ICU admission (the offset is negative, counted in minutes from the ICU admission),
#   3. the hospital admission source is one of the listed community sources OR is missing.
pat_pneumonia = df_pat_all.loc[df_pat_all.apacheadmissiondx.isin(lst_pneumonias)
                               & (df_pat_all.hospitaladmitoffset > (-60 * 24))
                               & (df_pat_all.hospitaladmitsource.isin(["Emergency Department", "Direct Admit", "Floor", "Acute Care/Floor", "Chest Pain Center"]) | df_pat_all.hospitaladmitsource.isna())
                               , :].copy()


In [8]:
# Unique ICU stay ids of the cohort; the displayed length is the cohort size.
lst_pat_pneumonia = list(pat_pneumonia.patientunitstayid.unique())
len(lst_pat_pneumonia)

11317

&rarr; 11317 ICU admissions with CAP

In [9]:
# Initialise the result frame with one row per cohort ICU stay; all extracted
# information is merged onto this frame later.
df_info = pd.DataFrame({"patientunitstayid": lst_pat_pneumonia})

# Reducing the file-size

## Function

In [10]:
def make_reduced_files(import_path, export_path, notation, lst_ids):
    """
    Write cohort-only copies of the eICU tables to disk.

    Every table listed below is read from ``import_path`` in chunks, reduced to the rows whose
    patientunitstayid is in ``lst_ids`` and saved as ``<export_path><table>_<notation>.csv``.
    The hospital table is left out because it has no patientunitstayid column.

    :param import_path: String - Folder holding the eICU csv files; it is concatenated directly
        with the file name, so it has to end with "/".

    :param export_path: String - Target folder (no file name); has to end with "/".

    :param notation: String - Short tag appended to each file name for later identification.

    :param lst_ids: List - patientunitstayids of the target population.

    :return: None - the reduced tables are written to ``export_path``.
    """

    # All eICU tables that carry a patientunitstayid
    lst_tables = ["admissionDrug", "admissionDx", "allergy", "apacheApsVar", "apachePatientResult", "apachePredVar",
                  "carePlanCareProvider", "carePlanEOL", "carePlanGeneral", "carePlanGoal", "carePlanInfectiousDisease", "customLab",
                  "diagnosis", "infusionDrug", "intakeOutput", "lab", "medication", "microLab", "note",
                  "nurseAssessment", "nurseCare", "nurseCharting", "pastHistory", "patient", "physicalExam",
                  "respiratoryCare", "respiratoryCharting", "treatment", "vitalAperiodic", "vitalPeriodic"]

    for table_name in tqdm(lst_tables):
        source_file = f"{import_path}{table_name}.csv"
        target_file = f"{export_path}{table_name}_{notation}.csv"

        # read in chunks so that the large tables never have to fit into memory as a whole
        chunk_reader = pd.read_csv(source_file, chunksize=100000, low_memory=False)
        cohort_parts = [part[part["patientunitstayid"].isin(lst_ids)] for part in chunk_reader]

        pd.concat(cohort_parts).to_csv(target_file, index=False)

## Implementation

In [11]:
#make_reduced_files(
#    import_path = "../eICU_data/",
#    export_path = "data_pneumonia_doac/",
#    notation = "pn_doac",
#    lst_ids = lst_pat_pneumonia
#)

To make this code work in your environment, the complete unpacked eICU data has to be located at the relative path "../eICU_data/" in your project.

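Once `make_reduced_files` has been run (the call above is commented out; uncomment it for the first run), all of the data for the CAP patient cohort is stored in the folder at the relative path "data_pneumonia_doac/" and can be accessed from there.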


# Data extraction

## Functions

In [12]:
def get_basic_patient_info(df_pat, lst_ids):
    """
    Extract the basic demographic and admission/discharge information per ICU stay.

    Gender values "Unknown" and "Other" are set to missing and the age column is made
    numeric, with the censored category "> 89" encoded as 90.

    :param df_pat: DataFrame - Patient dataframe of eICU (or abbreviated). All columns listed in
        ``required_columns`` below have to be present.

    :param lst_ids: List - List of the population patientunitstayids.

    :return: DataFrame with one row per selected stay and the columns listed in ``output_columns``.
    """

    required_columns = ["patientunitstayid", "uniquepid", "gender", "age", "hospitalid", "wardid",
                        "unittype", "apacheadmissiondx", "hospitaldischargestatus",
                        'hospitaladmittime24', "hospitaladmitsource", "hospitaldischargelocation", 'unitadmittime24',
                        'unitadmitsource', 'unitstaytype', 'dischargeweight', 'unitdischargelocation', 'unitdischargestatus', "unitvisitnumber"]

    output_columns = ["patientunitstayid", "gender", "age", "apacheadmissiondx", "hospitaldischargestatus", "unitdischargestatus",
                      "hospitaladmitsource", "unitadmitsource", "unitdischargelocation"]

    # restrict to the cohort first so that the cleaning below only touches the needed rows
    in_cohort = df_pat["patientunitstayid"].isin(lst_ids)
    cohort = df_pat.loc[in_cohort, required_columns].copy()

    # uninformative gender categories become missing values
    cohort["gender"] = cohort["gender"].replace({"Unknown": np.nan, "Other": np.nan})

    # "> 89" is the only non-numeric age label that is handled; it is mapped to 90
    cohort["age"] = pd.to_numeric(cohort["age"].replace("> 89", "90"))

    return cohort[output_columns].copy()

In [13]:
def apply_icu_free_days(x, df_pat_ref):
    """
    Row-wise helper (for ``DataFrame.apply(axis=1)``) that computes the ICU-free days within
    30 days of ONE ICU admission.

    All offsets are in minutes relative to the current ICU admission; ``hospitaladmitoffset``
    is negative because the hospital admission precedes the ICU admission.

    Rules:
      * death during the current ICU stay -> 0
      * otherwise: (end of observation - minutes in the current ICU stay - minutes in later ICU
        stays of the same hospital stay) / 1440, never below 0, where the end of observation is
        the hospital discharge if the patient died in hospital before day 30 and day 30 otherwise
      * a later ICU stay that runs past day 30 is only counted up to day 30
      * unknown hospital discharge status (and no later stay running past day 30) -> NaN

    :param x: Row (Series) of one ICU stay with the attributes uniquepid, unitvisitnumber, hospitalid,
        hospitaladmittime24, hospitaladmitsource, hospitaldischargeyear, hospitaldischargetime24,
        hospitaladmitoffset, unitdischargeoffset, hospitaldischargeoffset and the death flags
        unitdischargestatus / hospitaldischargestatus already encoded as 0 (alive) / 1 (expired).

    :param df_pat_ref: DataFrame - patient table containing (at least) all ICU stays of the same patients.

    :return: ICU-free days as a number (0 .. 30) or NaN.
    """

    minutes_d = 60 * 24
    window_minutes = 30 * minutes_d

    # death in the current ICU stay: no ICU-free days, and no need to look for later stays
    if x.unitdischargestatus == 1:
        return 0

    hospital_death = x.hospitaldischargestatus
    hosp_discharge = x.hospitaldischargeoffset
    orig_unit_discharge = x.unitdischargeoffset

    def _same_hospital_stay_value(column):
        # NaN never compares equal to NaN. The cohort explicitly keeps stays with a missing
        # hospitaladmitsource, so a plain `==` would hide every later ICU stay of those patients.
        # A missing value therefore matches rows where the same field is missing as well.
        value = getattr(x, column)
        if pd.isna(value):
            return df_pat_ref[column].isna()
        return df_pat_ref[column] == value

    # hospitaladmitoffset that a stay starting exactly at the current ICU discharge would have
    dischargeoffset_from_hosp_admission = x.hospitaladmitoffset - orig_unit_discharge

    # hospitaladmitoffset that a stay starting exactly 30 days after the current ICU admission would have
    maximum_hospoffset = x.hospitaladmitoffset - window_minutes

    # Later ICU stays of the same patient within the same hospital stay (identified by the
    # combination of hospital-level fields) that start after the current ICU discharge and before
    # day 30. Sorted descending: the offsets are negative, so the latest stay comes last.
    later_stays = df_pat_ref.loc[
        (df_pat_ref.uniquepid == x.uniquepid)
        & (df_pat_ref.hospitalid == x.hospitalid)
        & _same_hospital_stay_value("hospitaladmittime24")
        & _same_hospital_stay_value("hospitaladmitsource")
        & _same_hospital_stay_value("hospitaldischargeyear")
        & _same_hospital_stay_value("hospitaldischargetime24")
        & (df_pat_ref.unitvisitnumber > x.unitvisitnumber)
        & (df_pat_ref.hospitaladmitoffset < dischargeoffset_from_hosp_admission)
        & (df_pat_ref.hospitaladmitoffset > maximum_hospoffset),
        ["hospitaladmitoffset", "unitdischargeoffset"],
    ].sort_values("hospitaladmitoffset", ascending=False)

    duration_all_other_stays = 0
    last_stay_passes_30d = False

    if len(later_stays) > 0:
        lst_other_stays = list(zip(later_stays["hospitaladmitoffset"], later_stays["unitdischargeoffset"]))

        # end of the last later stay, in minutes from the current ICU admission
        last_stay_offset, last_stay_duration = lst_other_stays[-1]
        end_last_stay = -1 * last_stay_offset + last_stay_duration + x.hospitaladmitoffset
        last_stay_passes_30d = end_last_stay > window_minutes

        if last_stay_passes_30d:
            # only later stays starting before day 30 were selected, so at most the last one can
            # run past day 30; count it only up to the 30-day mark
            duration_until_30d = (window_minutes - x.hospitaladmitoffset) - (-1 * last_stay_offset)
            lst_other_stays[-1] = (last_stay_offset, duration_until_30d)

        duration_all_other_stays = sum(duration for _, duration in lst_other_stays)

    # end of the observation period, in minutes from the current ICU admission
    if last_stay_passes_30d or hospital_death == 0:
        observation_end = window_minutes
    elif hospital_death == 1:
        observation_end = hosp_discharge if hosp_discharge < window_minutes else window_minutes
    else:
        # hospital outcome unknown
        return np.nan

    icu_free_days = (observation_end - orig_unit_discharge - duration_all_other_stays) / minutes_d

    # written as a comparison (not max()) so that a NaN result is passed through unchanged
    if icu_free_days < 0:
        return 0
    return icu_free_days


def get_icu_free_days(df_pat, lst_ids):
    """
    Compute the ICU-free days (out of 30 days) for every requested ICU stay.

    :param df_pat: Dataframe - Patient dataframe of eICU (or abbreviated)

    :param lst_ids: List - patientunitstayids of the target population

    :return: Dataframe with the columns "patientunitstayid" and "ICU_free_days"

    """

    needed_columns = ["patientunitstayid", "hospitaldischargestatus",
                      "unitdischargestatus", "uniquepid", "unitvisitnumber", "hospitalid", "hospitaladmittime24",
                      "hospitaladmitsource", "hospitaldischargetime24", "hospitaldischargeyear", "hospitaladmitoffset",
                      "unitdischargeoffset", "hospitaldischargeoffset"]

    # only the cohort stays and only the columns used by the row-wise calculation
    cohort_stays = df_pat.loc[(df_pat.patientunitstayid.isin(lst_ids)), needed_columns].copy()

    # the row-wise calculation expects the death flags as 0 / 1; other labels become NaN
    status_codes = {"Alive": 0, "Expired": 1}
    cohort_stays.unitdischargestatus = cohort_stays.unitdischargestatus.map(status_codes)
    cohort_stays.hospitaldischargestatus = cohort_stays.hospitaldischargestatus.map(status_codes)

    # Reference frame: ALL ICU stays of the cohort patients (matched by uniquepid, not by
    # patientunitstayid), so that later stays outside the cohort are visible as well
    cohort_patient_ids = list(cohort_stays.uniquepid.unique())
    all_stays_of_cohort_patients = df_pat[df_pat.uniquepid.isin(cohort_patient_ids)].copy()

    cohort_stays["ICU_free_days"] = cohort_stays.apply(
        apply_icu_free_days, axis=1, args=(all_stays_of_cohort_patients,))

    return cohort_stays[["patientunitstayid", "ICU_free_days"]].copy()

In [14]:
def group_pmh_subcat(x, df, clm_name):
    """
    Join all distinct values of one column for a single ICU stay into one "|"-separated string.

    :param x: patientunitstayid to look up.

    :param df: DataFrame with a "patientunitstayid" column and the column ``clm_name``.

    :param clm_name: String - column whose values are collected.

    :return: String - distinct values in order of first appearance, ``None`` entries skipped;
        an empty string if nothing is left.
    """
    values_of_stay = df.loc[df.patientunitstayid == x, clm_name].unique().tolist()
    return "|".join(str(entry) for entry in values_of_stay if entry is not None)

def apply_split_and_rejoin_for_output(str_pmh):
    """
    Strip the first six "/"-separated segments from a pasthistorypath and return the remainder.

    :param str_pmh: String - full pasthistorypath.

    :return: String with the remaining segments re-joined by "/", or ``None`` if the path has
        no more than six segments.
    """
    detail_segments = str_pmh.split("/")[6:]

    if not detail_segments:
        return None

    return "/".join(detail_segments)

def get_pastHistory(df_pmh, lst_ids):
    """
    Receives the pastHistory Dataframe from eICU or abbreviated and a list of target patientunitstayids and returns
    one row per stay with one column per past-medical-history (PMH) category.

    A pasthistorypath belongs to a category when it contains the organ-system prefix of that
    category and the category name. The category name must not be the tail of a longer word, so
    that "Insulin Dependent Diabetes" does not also pick up "Non-Insulin Dependent Diabetes".

    :param df_pmh: DataFrame - pastHistory dataframe from eICU.

    :param lst_ids: List - list of patientunitstayids from the target population.

    :return: DataFrame with the column "patientunitstayid" and one column per PMH category. A cell
        holds the "|"-joined detail paths of the stay for that category, or 0 if the stay has no
        entry in that category.
    """
    import re  # local import: only needed to build the category patterns below

    # Dictionary of prospective column names (key) and the string keys to the PMH-string-path of each disease (values)
    # NOTE(review): the key "pmh_mmunosuppression_last_6m" is missing its leading "i"; it is left
    # unchanged here because it is the name of the output column.
    dict_subcat_clms = {
        'pmh_HT_with_treatment': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Hypertension Requiring Treatment'],
        'pmh_cancer': ['notes/Progress Notes/Past History/Organ Systems/Hematology-Oncology', 'Cancer'],
        'pmh_non_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Non-Insulin Dependent Diabetes'],
        'pmh_COPD': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'COPD'],
        'pmh_CHF': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Congestive Heart Failure'],
        'pmh_insulin_dep_DM': ['notes/Progress Notes/Past History/Organ Systems/Endocrine', 'Insulin Dependent Diabetes'],
        'pmh_arrhythmias': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Arrhythmias'],
        'pmh_MI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Myocardial Infarction'],
        'pmh_strokes': ['notes/Progress Notes/Past History/Organ Systems/Neurologic', 'Strokes'],
        'pmh_renal_insuff': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Insufficiency'],
        'pmh_PCI': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Procedural Coronary Intervention'],
        'pmh_card_valvular': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Valve disease'],
        'pmh_asthma': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Asthma'],
        'pmh_renal_failure': ['notes/Progress Notes/Past History/Organ Systems/Renal', 'Renal Failure'],
        'pmh_CA_bypass': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Coronary Artery Bypass'],
        'pmh_home_o2': ['notes/Progress Notes/Past History/Organ Systems/Pulmonary', 'Home Oxygen'],
        'pmh_venous_thrombosis': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Venous Thrombosis'],
        'pmh_angina': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Angina'],
        'pmh_PE': ['notes/Progress Notes/Past History/Organ Systems/Cardiovascular', 'Pulmonary Embolism'],
        'pmh_mmunosuppression_last_6m': ['notes/Progress Notes/Past History/Organ Systems/Infectious Disease', 'Immunosuppression within past 6 months'],
    }

    # reducing the general pastHistory df to reduce the computational load
    df_temp = df_pmh.loc[df_pmh["patientunitstayid"].isin(lst_ids), ["patientunitstayid", "pasthistorypath", "pasthistoryvalue"]].copy()

    # "Hematology/Oncology" contains the path separator and would shift the segment positions used
    # when the path is split later on, so it is rewritten with a hyphen (plain text replacement)
    df_temp["pasthistorypath"] = df_temp["pasthistorypath"].str.replace("Hematology/Oncology", "Hematology-Oncology",
                                                                        regex=False)
    # list for the future columns
    lst_columns = []

    # Iterate over the individual diseases and check whether a patient has this PMH or not then saving this as a column to the list
    for clm_name, key_phrases in tqdm(dict_subcat_clms.items()):
        key1, key2 = key_phrases

        # The category name may follow "/" or a space but not a letter, digit or hyphen;
        # this keeps "Non-Insulin Dependent Diabetes" out of the "Insulin Dependent Diabetes" column.
        key2_pattern = r"(?<![\w-])" + re.escape(key2)

        df_new_clm_raw = df_temp.loc[(df_temp['pasthistorypath'].str.contains(key1, regex=False, na=False)) &
                                     (df_temp['pasthistorypath'].str.contains(key2_pattern, regex=True, na=False)),
                                     ["patientunitstayid", "pasthistorypath"]].copy().drop_duplicates()

        # keep only the detail part of the path (everything after the first six segments)
        df_new_clm_raw["output_pmh_path"] = df_new_clm_raw["pasthistorypath"].apply(apply_split_and_rejoin_for_output)

        df_new_clm_reference = df_new_clm_raw.copy()

        # one joined string per stay; the same string is written to every row of that stay ...
        df_new_clm_raw[clm_name] = df_new_clm_raw["patientunitstayid"].apply(lambda x: group_pmh_subcat(x, df_new_clm_reference, "output_pmh_path"))

        # ... so dropping duplicates leaves exactly one row per stay
        df_pat_w_data = df_new_clm_raw.loc[:, ["patientunitstayid", clm_name]].copy().drop_duplicates()

        # stays without any entry in this category get a 0 (set lookup instead of a list scan per id)
        set_pat_w_data = set(df_new_clm_raw.patientunitstayid.unique())
        lst_pat_no_data = [x for x in lst_ids if x not in set_pat_w_data]

        df_pat_without_data = pd.DataFrame(lst_pat_no_data, columns=['patientunitstayid'])
        df_pat_without_data[clm_name] = 0

        df_clm_final = pd.concat([df_pat_w_data, df_pat_without_data])
        df_clm_final = df_clm_final.set_index("patientunitstayid")

        lst_columns.append(df_clm_final)

    # Concatenate all the columns (aligned on the stay id) and return the ids as a regular column
    df_final = pd.concat(lst_columns, axis=1)
    df_final = df_final.reset_index()

    return df_final

In [15]:
def get_and_initial_clean_apacheApsVar(df_apsVar, lst_ids):
    """
    Receives the apacheApsVar Dataframe of the eICU database or abbreviated and a list of target patientunitstayids and
    returns a dataframe with initially "cleaned" data: the value -1 in the columns eyes, motor, verbal and meds
    (used for "no data was entered") is set to NaN. Additionally, a GCS column (eyes + motor + verbal) is added.

    :param df_apsVar: DataFrame - apacheApsVar Dataframe from eICU or abbreviated.

    :param lst_ids: List - list of target patientunitstayids.

    :return: DataFrame with the columns patientunitstayid, aps_eyes, aps_motor, aps_verbal, aps_meds and aps_GCS.
        Stays from ``lst_ids`` without any apacheApsVar row are appended with NaN in all data columns.
        The patientunitstayid is a regular column, not the index.
    """

    lst_clms = ['eyes', 'motor', 'verbal', 'meds']

    # reducing the general df to reduce the computational load
    df_temp = df_apsVar.loc[df_apsVar["patientunitstayid"].isin(lst_ids), ['patientunitstayid'] + lst_clms].copy()

    # Set the missing data to NaN. mask() builds a new column, so an integer column simply becomes
    # float where needed instead of NaN being written into an int column in place.
    for clm in lst_clms:
        df_temp[clm] = df_temp[clm].mask(df_temp[clm] == -1)

    # create the GCS column (NaN as soon as one component is missing)
    df_temp["GCS"] = df_temp.eyes + df_temp.motor + df_temp.verbal

    # add the patients that did not have any data at all (set lookup instead of a list scan per id)
    ids_with_data = set(df_temp.patientunitstayid.unique())
    lst_pat_without_data = [x for x in lst_ids if x not in ids_with_data]

    if lst_pat_without_data:
        # only the id column is supplied; concat fills every other column with NaN
        df_pat_without_data = pd.DataFrame({'patientunitstayid': lst_pat_without_data})
        df_final = pd.concat([df_temp, df_pat_without_data])
    else:
        df_final = df_temp

    # Rename columns
    rename_clms = ['eyes', 'motor', 'verbal', 'meds', "GCS"]
    df_final = df_final.rename(columns={clm: "aps_{}".format(clm) for clm in rename_clms})

    return df_final

In [16]:
def get_cleaned_apachePredVar_basics(df_predvar, lst_ids):
    """
    This function takes the apachePredVar Dataframe (or abbreviated) and a list of patientunitstayids and returns
    the 'oobintubday1' information for those unitstays.

    :param df_predvar: DataFrame - apachePredVar Dataframe or abbreviated.

    :param lst_ids: List - List of patientunitstayids of the target population.

    :return: DataFrame with the columns patientunitstayid and pred_oobintubday1. Stays from ``lst_ids``
        without any apachePredVar row are appended with NaN. The patientunitstayid is a regular
        column, not the index.
    """

    # reducing the general df to reduce the computational load
    df_temp = df_predvar.loc[df_predvar["patientunitstayid"].isin(lst_ids), ['patientunitstayid', 'oobintubday1']].copy()

    # add the patients that did not have any data at all (set lookup instead of a list scan per id)
    ids_with_data = set(df_temp.patientunitstayid.unique())
    lst_pat_without_data = [x for x in lst_ids if x not in ids_with_data]

    if lst_pat_without_data:
        # only the id column is supplied; concat fills the data column with NaN
        df_pat_without_data = pd.DataFrame({'patientunitstayid': lst_pat_without_data})
        df_final = pd.concat([df_temp, df_pat_without_data])
    else:
        df_final = df_temp

    # Rename columns
    rename_clms = ['oobintubday1']
    df_final = df_final.rename(columns={clm: "pred_{}".format(clm) for clm in rename_clms})

    return df_final

In [17]:
def get_cleaned_apachePatientResult_basics(df_apacheresult, lst_ids):
    """
    This function takes the apachePatientResult Dataframe (or abbreviated) and a list of patientunitstayids and returns
    the APACHE IVa results for those unitstays. Only rows with apacheversion "IVa" are used and an
    apachescore of -1 is set to NaN.

    :param df_apacheresult: DataFrame - apachePatientResult Dataframe or abbreviated.

    :param lst_ids: List - List of patientunitstayids of the target population.

    :return: DataFrame with the columns patientunitstayid, apachescore, actualiculos, unabridgedhosplos and
        unabridgedactualventdays. Stays from ``lst_ids`` without an "IVa" row are appended with NaN in all
        data columns. The patientunitstayid is a regular column, not the index.
    """

    # reducing the general df to reduce the computational load
    df_temp = df_apacheresult.loc[(df_apacheresult["apacheversion"] == "IVa") & (df_apacheresult["patientunitstayid"].isin(lst_ids)),
                                  ['patientunitstayid', 'apachescore', 'actualiculos', 'unabridgedhosplos', "unabridgedactualventdays"]].copy()

    # Set the missing data to NaN. mask() builds a new column, so an integer column simply becomes
    # float where needed instead of NaN being written into an int column in place.
    lst_minus_ones = ['apachescore']
    for clm in lst_minus_ones:
        df_temp[clm] = df_temp[clm].mask(df_temp[clm] == -1)

    # add the patients that did not have any data at all (set lookup instead of a list scan per id)
    ids_with_data = set(df_temp.patientunitstayid.unique())
    lst_pat_without_data = [x for x in lst_ids if x not in ids_with_data]

    if lst_pat_without_data:
        # only the id column is supplied; concat fills every other column with NaN
        df_pat_without_data = pd.DataFrame({'patientunitstayid': lst_pat_without_data})
        df_final = pd.concat([df_temp, df_pat_without_data])
    else:
        df_final = df_temp

    return df_final


In [18]:
def get_infusion_drugs(df_infusion, lst_ids, additional_dict=None, timeframe=(0, 1440)):
    """
    This function returns a dataframe indicating whether patients received certain medication classes
    as an infusion within a specified timeframe.

    Drug names are lower-cased before matching, so all patterns have to be written in lower case.
    Rows with a missing drug name never match.

    :param df_infusion: DataFrame - infusionDrug Dataframe from eICU.

    :param lst_ids: List - List of patientunitstayids of the target population.

    :param additional_dict: Dict - Additional/custom dictionary with the "clm_name": "drugname|drugname|drugname" format
        (the value is used as a regular expression). Entries are added to, or override, the built-in drug groups.

    :param timeframe: Tuple, default is (0, 1440) - (lower offset, upper offset), inclusive bounds for infusionoffset.

    :return: DataFrame indexed by patientunitstayid with one 0/1 column per drug group.
    """

    lower_offset, upper_offset = timeframe

    # reducing the general df to reduce the computational load
    in_cohort = df_infusion["patientunitstayid"].isin(lst_ids)
    in_timeframe = df_infusion["infusionoffset"].between(lower_offset, upper_offset)
    df_reduced = df_infusion.loc[in_cohort & in_timeframe, ["patientunitstayid", "drugname"]].copy()

    df_reduced["drugname"] = df_reduced["drugname"].str.lower()

    # "isoprotenerol" is a misspelling that can never match the real drug name; it is kept for
    # backward compatibility and the correct spelling "isoproterenol" is added next to it.
    dict_drugs = {
        "infusion_vaso_ino": 'epinephrine|adrenaline|norepinephrine|levophed|dobutamine|dobutrex|vasopressin|isoprotenerol|isoproterenol|isuprel|phenylephrine|neo-synephrine|dopamine|milrinone'
    }

    if additional_dict is not None:
        dict_drugs.update(additional_dict)

    lst_columns = []

    # loop over the the drug groups and look at whether patients received this as an infusion or not
    for clm_name, drug_str in dict_drugs.items():

        # na=False: a missing drug name counts as "no match" instead of breaking the boolean mask
        received_drug = df_reduced["drugname"].str.contains(drug_str, na=False)
        lst_pat_drug = df_reduced.loc[received_drug, "patientunitstayid"].unique().tolist()

        # set lookup instead of a list scan per id
        ids_with_drug = set(lst_pat_drug)
        lst_pat_wo_drug = [i for i in lst_ids if i not in ids_with_drug]

        df_pat_w_data = pd.DataFrame(lst_pat_drug, columns=['patientunitstayid'])
        df_pat_w_data[clm_name] = 1

        df_pat_without_data = pd.DataFrame(lst_pat_wo_drug, columns=['patientunitstayid'])
        df_pat_without_data[clm_name] = 0

        df_clm_final = pd.concat([df_pat_w_data, df_pat_without_data])
        df_clm_final = df_clm_final.set_index("patientunitstayid")

        lst_columns.append(df_clm_final)

    # align the drug-group columns on the stay id
    df_final = pd.concat(lst_columns, axis=1)

    return df_final

In [19]:
def fast_clean_data(array_labs):
    """
    Remove outliers: keep only the values inside [median - 2 * IQR, median + 2 * IQR].

    :param array_labs: Array - numpy array of values (eg. all results of one lab for one patient)

    :return: Array - new numpy array with the values inside the range, in their original order

    """

    lower_quartile, median, upper_quartile = np.percentile(array_labs, [25, 50, 75])
    iqr = upper_quartile - lower_quartile

    lowest_accepted = median - 2 * iqr
    highest_accepted = median + 2 * iqr

    inside_range = (array_labs >= lowest_accepted) & (array_labs <= highest_accepted)

    return array_labs[inside_range].copy()


def group_labs_whole_timeframe(x, agg_type, realistic_bounds):
    """
    Aggregate the values of one lab for one patient into a single number.

    Up to five values are filtered by the realistic bounds; more than five values are instead
    cleaned with ``fast_clean_data`` (median +/- 2 IQR) and the realistic bounds are not applied.

    :param x: Series - the lab values of one group.

    :param agg_type: String - "first", "median", "max" or "min"; any other value yields ``None``
        (unless exactly one value is left after the bounds filter, which is returned as is).

    :param realistic_bounds: Tuple - (lower_realistic, upper_realistic), inclusive.

    :return: The aggregated value, or NaN if no value lies inside the realistic bounds.
    """
    values = x.to_numpy()
    realistic_min, realistic_max = realistic_bounds

    if len(values) > 5:
        # enough values for a distribution-based outlier removal
        candidates = fast_clean_data(values)
    else:
        within_bounds = (values >= realistic_min) & (values <= realistic_max)
        candidates = values[within_bounds].copy()

        if len(candidates) == 0:
            return np.nan

        # a single remaining value is returned unchanged, whatever the aggregation
        if len(candidates) == 1:
            return candidates[0]

    if agg_type == "first":
        return candidates[0]

    for name, reducer in (("median", np.median), ("max", np.max), ("min", np.min)):
        if agg_type == name:
            return reducer(candidates)

    return None


def fast_clean_labs_general_offset(df_labs, lst_ids, offsets, dict_labs_info):
    """
    Clean and aggregate laboratory values per patientunitstayid for a target cohort and time window.

    :param df_labs: Dataframe - dataframe containing the labs (labs dataframe from eICU or abbreviated)

    :param lst_ids: List - list with the patientunitstayids of the target cohort

    :param offsets: Tuple, (earlier offset, later offset) - target time period (both inclusive)

    :param dict_labs_info: Dictionary - in the format of
        "labname": [(lower_bound, upper_bound), (lower_realistic, upper_realistic), aggregation], where:
        the labname is the string name of the lab value (as contained in the column "labname")
        the first (lower bound, upper bound) is the range of possible values (eg. for AST 0-1000000)
        the second (lower bound, upper bound) is the range of realistic values (eg. for AST 1-10000)
        the aggregation is a string picklist "first", "median", "max", "min"

    :return: Dataframe with the column "patientunitstayid" and one column per lab named
        "<labname>_<lower offset>to<upper offset>_<aggregation>"

    """

    lower_offset, upper_offset = offsets

    # shrink the labs table once (cohort, requested labs, time window) to lessen the computational load
    is_relevant = (df_labs.patientunitstayid.isin(lst_ids)
                   & df_labs.labname.isin(list(dict_labs_info.keys()))
                   & df_labs.labresultoffset.between(lower_offset, upper_offset))
    df_lab = df_labs.loc[is_relevant, :].copy()

    lst_lab_columns = []

    for lab_name, lab_settings in tqdm(dict_labs_info.items()):
        lower_bound, upper_bound = lab_settings[0]
        realistic_bounds = lab_settings[1]
        aggregation = lab_settings[2]

        # only the current lab, restricted to the possible values
        is_current_lab = (df_lab.labname == lab_name) & df_lab.labresult.between(lower_bound, upper_bound)

        # chronological order per patient so that the option "first" picks the earliest result
        df_temp = (df_lab
                   .loc[is_current_lab, :]
                   .sort_values(by=["patientunitstayid", "labresultoffset"])
                   [["patientunitstayid", "labresult"]]
                   .dropna())
        df_temp["labresult"] = df_temp["labresult"].astype("float")

        # one aggregated value per patient; patients without a remaining value are dropped
        df_grouped_lab = (df_temp
                          .groupby(["patientunitstayid"])
                          .agg(lambda lab_values: group_labs_whole_timeframe(lab_values,
                                                                             agg_type=aggregation,
                                                                             realistic_bounds=realistic_bounds))
                          .reset_index()
                          .dropna()
                          )

        # left-merge onto the full id list so that every patient of the cohort keeps a row
        clm_name = f"{lab_name}_{lower_offset}to{upper_offset}_{aggregation}"
        df_lab_column = (pd.DataFrame({'patientunitstayid': lst_ids})
                         .merge(df_grouped_lab, on="patientunitstayid", how="left")
                         .rename(columns={"labresult": clm_name})
                         .set_index("patientunitstayid"))

        lst_lab_columns.append(df_lab_column)

    # the per-lab columns share the same index, so they can be placed side by side
    return pd.concat(lst_lab_columns, axis=1).reset_index()

In [20]:
def fast_clean_data(array_vitals):
    """
    Remove outliers relative to the values' own distribution: only values inside
    [median - 2 * IQR, median + 2 * IQR] are kept.

    Note: a function with the same name and the same logic is already defined in the labs cell further
    up in this notebook; running this cell rebinds the name.

    :param array_vitals: Array - numpy array of values (eg. mean BPs from a certain timeframe of a patient)

    :return: Array - numpy array (a copy) with the values inside the accepted range

    """

    # the patient's own values serve as the baseline for the outlier detection
    lower_quartile, median, upper_quartile = np.percentile(array_vitals, [25, 50, 75])
    accepted_spread = 2 * (upper_quartile - lower_quartile)

    lowest_accepted = median - accepted_spread
    highest_accepted = median + accepted_spread

    return array_vitals[(array_vitals >= lowest_accepted) & (array_vitals <= highest_accepted)].copy()


def groupby_vitals_per_hour(x, agg_timeunit):
    """
    Discard outliers (fast_clean_data, median +/- 2 IQR) and then summarize the values of one patient in
    one timeunit as a single value.

    :param x: Series - values of one vital of one patient in one timeunit

    :param agg_timeunit: String, picklist - "median", "max" or "min"

    :return: the aggregated value of the cleaned data

    :raises ValueError: if agg_timeunit is not one of the supported options (previously this surfaced
        as an UnboundLocalError at the return statement)
    """
    aggregators = {"median": np.median, "max": np.max, "min": np.min}

    # validate before doing any work so that a typo fails with a clear message
    if agg_timeunit not in aggregators:
        raise ValueError(f"agg_timeunit must be one of {sorted(aggregators)}, got {agg_timeunit!r}")

    cleaned_hourly = fast_clean_data(x.to_numpy())

    return aggregators[agg_timeunit](cleaned_hourly)


def groupby_vitals_total(x, agg_total):
    """
    Summarize the (already aggregated) values of one patient as a single value. No outlier removal is
    done here.

    :param x: Series - values of one vital of one patient

    :param agg_total: String, picklist - "median", "max" or "min"

    :return: the aggregated value

    :raises ValueError: if agg_total is not one of the supported options (previously this surfaced as
        an UnboundLocalError at the return statement)
    """
    aggregators = {"median": np.median, "max": np.max, "min": np.min}

    if agg_total not in aggregators:
        raise ValueError(f"agg_total must be one of {sorted(aggregators)}, got {agg_total!r}")

    return aggregators[agg_total](x.to_numpy())


def fast_vitals_periodic(df_periodic, lst_ids, vital_name, realistic_bounds, offset, agg_timeunit="median",
                         agg_total="median", timeunit=60):
    """
        Aggregate one vital sign of the vitalPeriodic table to a single cleaned value per patient.

        Values outside the realistic bounds or outside the offset window are discarded. The window is cut
        into bins of `timeunit` minutes; within every bin outliers are removed and the values are aggregated
        (agg_timeunit). The per-bin values of a patient are then aggregated to the final value (agg_total).
        Only the column `vital_name` is read, eg. temperature, sao2, heartrate, respiration, cvp, etco2,
        systemicsystolic, systemicdiastolic, systemicmean, pasystolic, padiastolic, pamean, icp

        :param df_periodic: Dataframe - abbreviated (! unless a lot of free RAM) Dataframe of the vitalPeriodic table of eICU

        :param lst_ids: List - list of patientunitstayids of the target population

        :param vital_name: String - column name of the vital

        :param realistic_bounds: Tuple - realistic values of the vital in the form of (lower bound, upper bound), both inclusive

        :param offset: Tuple - (lower time offset, upper time offset), both inclusive

        :param agg_timeunit: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values per timeunit

        :param agg_total: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values to the final value

        :param timeunit: Int, preset 60 - number of minutes for the timeunit

        :return: Dataframe with the column "patientunitstayid" (one row per entry of lst_ids) and the column
            "<vital_name>_<agg_total>_<lower offset>to<upper offset>_u<timeunit>"
        """

    # open the tuples of the bounds
    lower_realistic_bound, upper_realistic_bound = realistic_bounds
    lower_offset, upper_offset = offset

    # restrict to the cohort first to reduce the computational load
    df_cohort = df_periodic[df_periodic["patientunitstayid"].isin(lst_ids)].copy()

    # keep only realistic values inside the offset window
    is_realistic = df_cohort[vital_name].between(lower_realistic_bound, upper_realistic_bound)
    is_in_window = df_cohort["observationoffset"].between(lower_offset, upper_offset)
    df_window = (df_cohort
                 .loc[is_realistic & is_in_window, ["patientunitstayid", "observationoffset", vital_name]]
                 .copy()
                 .dropna())

    # bin edges every `timeunit` minutes; a shorter last bin is closed with the upper offset
    bin_edges = list(range(lower_offset, upper_offset + 1, timeunit))
    if (upper_offset - lower_offset) % timeunit != 0:
        bin_edges.append(upper_offset)

    df_window["observationoffset"] = pd.cut(df_window["observationoffset"], bins=bin_edges, right=True,
                                            include_lowest=True)

    # first step: one cleaned value per patient and bin (bins without data are dropped)
    df_per_timeunit = (df_window
                       .groupby(["patientunitstayid", "observationoffset"])
                       .agg(lambda values: groupby_vitals_per_hour(values, agg_timeunit=agg_timeunit))
                       .reset_index()
                       .drop(columns=["observationoffset"])
                       .dropna()
                       )

    # second step: one value per patient over the whole offset span
    df_per_patient = (df_per_timeunit
                      .groupby(["patientunitstayid"])
                      .agg(lambda values: groupby_vitals_total(values, agg_total=agg_total))
                      .reset_index()
                      )

    # left-merge onto the full id list so that every patient of the cohort keeps a row
    clm_name = "{}_{}_{}to{}_u{}".format(vital_name, agg_total, lower_offset, upper_offset, timeunit)

    return (pd.DataFrame({'patientunitstayid': lst_ids})
            .merge(df_per_patient, on="patientunitstayid", how="left")
            .rename(columns={vital_name: clm_name}))

In [21]:
def fast_vitals_combined(df_periodic, df_aperiodic, lst_ids, vital_name, realistic_bounds, offset, agg_timeunit="median",
                         agg_total="median", timeunit=60):
    """
    Aggregate one blood pressure component to a single cleaned value per patient, pooling the invasive
    measurements of the vitalPeriodic table with the non-invasive measurements of the vitalAperiodic table.

    Values outside the realistic bounds or outside the offset window are discarded. The window is cut into
    bins of `timeunit` minutes; within every bin outliers are removed and the values are aggregated
    (agg_timeunit). The per-bin values of a patient are then aggregated to the final value (agg_total).

    :param df_periodic: Dataframe - vitalPeriodic Dataframe of the eICU Database (abbreviated unless a lot of free RAM)

    :param df_aperiodic: Dataframe - vitalAperiodic Dataframe (abbreviated unless a lot of free RAM)

    :param lst_ids: List - list of patientunitstayids of the target population

    :param vital_name: String, picklist - options "systolic", "diastolic", "mean_bp" (any other value raises a KeyError)

    :param realistic_bounds: Tuple - realistic values of the vital in the form of (lower bound, upper bound) eg. (20, 250) for systolic

    :param offset: Tuple - (lower time offset, upper time offset), both inclusive

    :param agg_timeunit: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values per timeunit

    :param agg_total: String, picklist, preset "median" - whether to use "median", "max" or "min" to aggregate the values to the final value

    :param timeunit: Int, preset 60 - number of minutes for the timeunit

    :return: Dataframe with the column "patientunitstayid" (one row per entry of lst_ids) and the column
        "<vital_name>_<agg_total>_<lower offset>to<upper offset>_u<timeunit>"
    """

    # vital_name -> (column in vitalPeriodic, column in vitalAperiodic)
    dict_vitalnames = {
        "systolic": ("systemicsystolic", "noninvasivesystolic"),
        "diastolic": ("systemicdiastolic", "noninvasivediastolic"),
        "mean_bp": ("systemicmean", "noninvasivemean")
    }

    # open the tuples
    lower_realistic_bound, upper_realistic_bound = realistic_bounds
    lower_offset, upper_offset = offset

    def _realistic_values_in_window(df_source, source_column):
        # cohort rows of one source table with a realistic value inside the offset window,
        # with the value column renamed to the common vital_name
        df_cohort = df_source[df_source["patientunitstayid"].isin(lst_ids)].copy()
        keep = (df_cohort[source_column].between(lower_realistic_bound, upper_realistic_bound)
                & df_cohort["observationoffset"].between(lower_offset, upper_offset))
        return (df_cohort
                .loc[keep, ["patientunitstayid", "observationoffset", source_column]]
                .copy()
                .dropna()
                .rename(columns={source_column: vital_name}))

    key_periodic, key_aperiodic = dict_vitalnames[vital_name]
    df_temp_whole = pd.concat([_realistic_values_in_window(df_periodic, key_periodic),
                               _realistic_values_in_window(df_aperiodic, key_aperiodic)])

    # bin edges every `timeunit` minutes; a shorter last bin is closed with the upper offset
    bin_edges = list(range(lower_offset, upper_offset + 1, timeunit))
    if (upper_offset - lower_offset) % timeunit != 0:
        bin_edges.append(upper_offset)

    df_temp_whole["observationoffset"] = pd.cut(df_temp_whole["observationoffset"], bins=bin_edges, right=True,
                                                include_lowest=True)

    # first step: one cleaned value per patient and bin (bins without data are dropped)
    df_per_timeunit = (df_temp_whole
                       .groupby(["patientunitstayid", "observationoffset"])
                       .agg(lambda values: groupby_vitals_per_hour(values, agg_timeunit=agg_timeunit))
                       .reset_index()
                       .drop(columns=["observationoffset"])
                       .dropna()
                       )

    # second step: one value per patient over the whole offset span
    df_per_patient = (df_per_timeunit
                      .groupby(["patientunitstayid"])
                      .agg(lambda values: groupby_vitals_total(values, agg_total=agg_total))
                      .reset_index()
                      )

    # left-merge onto the full id list so that every patient of the cohort keeps a row
    clm_name = "{}_{}_{}to{}_u{}".format(vital_name, agg_total, lower_offset, upper_offset, timeunit)

    return (pd.DataFrame({'patientunitstayid': lst_ids})
            .merge(df_per_patient, on="patientunitstayid", how="left")
            .rename(columns={vital_name: clm_name}))

In [22]:
def apply_time_to_death_from_unit_admit(x):
    """
    Row-wise helper: time from ICU (unit) admission to death in days.

    :param x: Series - one row with the fields hospitaldischargestatus, unitdischargestatus, actualiculos,
        unabridgedhosplos and hospitaladmitoffset

    :return: Float - days from unit admission to death, np.nan if the patient was discharged alive from
        the hospital or if the hospital discharge status is neither "Alive" nor "Expired"
    """
    # if the patient did not die, return NaN
    if x.hospitaldischargestatus == "Alive":
        return np.nan

    # if the patient died in the ICU then the time on ICU is the time-to-death
    if x.unitdischargestatus == "Expired":
        return x.actualiculos

    # if the patient died in the hospital (outside of the ICU): remove the time between hospital admission
    # and ICU admission from the hospital length of stay.
    # hospitaladmitoffset is given in minutes relative to the unit admission and is NEGATIVE when the
    # hospital admission came first (the cohort filter uses hospitaladmitoffset > -60 * 24), so the
    # pre-ICU time in days is -(hospitaladmitoffset / 1440) and removing it means ADDING the signed
    # offset. The previous version subtracted the offset and thereby added the pre-ICU time.
    # assumes unabridgedhosplos is counted in days from the hospital admission - TODO confirm
    if x.hospitaldischargestatus == "Expired":
        return x.unabridgedhosplos + (x.hospitaladmitoffset / (24 * 60))

    return np.nan


def time_to_death_from_unit_admission(df_pat, df_apache_res, lst_ids):
    """
    Compute the time to death (counted from the ICU-admission) for a cohort of interest. The value is np.nan
    for patients who did not die and for irregular discharge status entries.

    :param df_pat: Dataframe - patient dataframe of the eICU or abbreviated

    :param df_apache_res: Dataframe - apachePatientResult dataframe of the eICU or abbreviated

    :param lst_ids: List - list of patientunitstayids

    :return: Dataframe with 2 columns (patientunitstayid and time_to_death_unitadmit)
    """

    patient_columns = ["patientunitstayid", "hospitaldischargestatus", "unitdischargestatus",
                       "hospitaladmitoffset"]
    apache_columns = ["patientunitstayid", 'actualiculos', 'unabridgedhosplos']

    # only the cohort and only the columns needed for the calculation
    df_pat_red = df_pat.loc[df_pat.patientunitstayid.isin(lst_ids), patient_columns].copy()
    df_apache_res_red = df_apache_res.loc[df_apache_res.patientunitstayid.isin(lst_ids), apache_columns].copy()

    # one frame with all inputs of the row-wise helper; identical rows are collapsed
    df_combined = (df_pat_red
                   .merge(df_apache_res_red, how="left", on="patientunitstayid")
                   .drop_duplicates())

    # calculate the time-to-death row by row
    df_combined["time_to_death_unitadmit"] = df_combined.apply(apply_time_to_death_from_unit_admit, axis=1)

    return df_combined[["patientunitstayid", "time_to_death_unitadmit"]].copy()

In [23]:
def map_AKI_crea_rise_by_offset(y, df_ref):
    """
    Check for one creatinine measurement of one patient whether it fulfills the AKI criterion
    "increase in creatinine by >= 0.3 mg/dL within 48 hours".

    The check looks backwards: the measurement at row y is compared to the lowest creatinine measured in
    the 48 hours up to (and including) its own offset. This way the offset of a flagged row is the time
    at which the AKI became apparent.

    :param y: int - index label of the row in df_ref (the index is used instead of the offset to
        circumvent the problem of duplicate entries with the same offset)

    :param df_ref: Dataframe - creatinine values of ONE patient with a unique index and the columns
        "labresultoffset" (minutes) and "labresult"

    :return: Int - 1 if the creatinine at row y is >= 0.3 higher than a value of the preceding 48h, else 0

    """

    offset = df_ref.at[y, "labresultoffset"]
    current_crea = df_ref.at[y, "labresult"]

    # 2880 minutes are 48 hours; the window contains at least the current row itself
    in_window = df_ref.labresultoffset.between((offset - 2880), offset)
    lowest_crea_in_window = df_ref.loc[in_window, "labresult"].min()

    # The previous version tested "previous value >= current value + 0.3", which flags a FALL of the
    # creatinine. A rise means that the current value exceeds a previous one.
    # The small tolerance keeps exact 0.3 rises (eg. 1.1 -> 1.4) from being lost to floating point error.
    rise = current_crea - lowest_crea_in_window

    return 1 if rise >= (0.3 - 1e-9) else 0

    
def map_AKI_crea_rise_offsetclm(x, df_ref):
    """
    Mapping function to collect the offsets at which one patient fulfills the AKI criterion
    "increase in creatinine" (basis for the time-to-AKI). Only measurements with an offset >= 0 are used.

    :param x: int - patientunitstayid

    :param df_ref: Dataframe - reduced labs Dataframe (creatinine values, may contain all pat-ids)

    :return: String - in the format "offset|offset|...." (ascending) for all the AKIs identified, an
        empty string if no measurement is flagged, np.nan if the patient has no measurement with offset >= 0
    """
    is_patient_after_admit = (df_ref.patientunitstayid == x) & (df_ref.labresultoffset >= 0)
    patient_labs = df_ref.loc[is_patient_after_admit, :]

    if patient_labs.empty:
        return np.nan

    # fresh 0..n-1 index: the row-wise check addresses the measurements by their index label
    patient_labs = patient_labs.reset_index(drop=True)

    aki_flags = [map_AKI_crea_rise_by_offset(row_label, patient_labs) for row_label in patient_labs.index]

    flagged_offsets = sorted(
        measured_at
        for measured_at, flag in zip(patient_labs["labresultoffset"].tolist(), aki_flags)
        if flag == 1
    )

    return "|".join(str(measured_at) for measured_at in flagged_offsets)


def map_AKI_crea_rise_all_pat(x, df_ref):
    """
    Mapping function over all patients: did the patient ever fulfill the AKI criterion
    "increase in creatinine"? Only measurements with an offset >= 0 are used.

    :param x: int - patientunitstayid

    :param df_ref: Dataframe - reduced labs Dataframe (but with all pat-ids)

    :return: Int - binary; np.nan if the patient has no measurement with offset >= 0
    """
    is_patient_after_admit = (df_ref.patientunitstayid == x) & (df_ref.labresultoffset >= 0)
    patient_labs = df_ref.loc[is_patient_after_admit, :]

    if patient_labs.empty:
        return np.nan

    # fresh 0..n-1 index: the row-wise check addresses the measurements by their index label
    patient_labs = patient_labs.reset_index(drop=True)

    aki_flags = [map_AKI_crea_rise_by_offset(row_label, patient_labs) for row_label in patient_labs.index]

    return 1 if 1 in aki_flags else 0

def map_AKI_diagnosis_offsets(x, df_diagnosis_ref):
    """
    Mapping function to collect the offsets of all diagnosis entries of one patient in the given
    (already filtered) diagnosis Dataframe.

    :param x: int - patientunitstayid

    :param df_diagnosis_ref: Dataframe - reduced diagnosis Dataframe with the columns "patientunitstayid"
        and "diagnosisoffset"

    :return: String - in the format "offset|offset|...." (ascending) for all entries of the patient,
        np.nan if the patient has no entry
    """

    patient_offsets = df_diagnosis_ref.loc[df_diagnosis_ref.patientunitstayid == x, "diagnosisoffset"]

    if patient_offsets.empty:
        return np.nan

    return "|".join(map(str, sorted(patient_offsets.tolist())))



def map_AKI_crea_baseline(x, df_ref):
    """
    Mapping function over all patients: did the patient fulfill the AKI criterion
    "increase in creatinine to >= 1.5 times baseline"?

    The baseline is the lowest creatinine measured up to the ICU admission (offset <= 0) or, if there is
    none, the earliest measurement of the patient. All measurements up to the end of the first week in
    the ICU (offset <= 10080 minutes, measurements before the admission included) are compared to it.

    :param x: int - patientunitstayid

    :param df_ref: Dataframe - reduced labs Dataframe (but with all pat-ids)

    :return: Int - binary; np.nan if the patient has no measurement at all
    """

    patient_labs = df_ref[df_ref.patientunitstayid == x]

    if patient_labs.empty:
        return np.nan

    # chronological order, so that the first row is the earliest measurement
    patient_labs = patient_labs.sort_values(by=["labresultoffset"]).reset_index(drop=True)

    pre_icu_values = patient_labs.loc[patient_labs.labresultoffset <= 0, "labresult"]
    if pre_icu_values.empty:
        crea_baseline = patient_labs["labresult"].iloc[0]
    else:
        crea_baseline = pre_icu_values.min()

    # 10080 minutes are 7 days
    first_week_values = patient_labs.loc[patient_labs.labresultoffset <= 10080, "labresult"]
    baseline_exceeded = (first_week_values >= (crea_baseline * 1.5)).any()

    return 1 if baseline_exceeded else 0


def find_AKI_pat(df_labs, df_diagnosis, lst_ids):
    """
    Take a population (list of patientunitstayids), the labs table (for creatinine values), and the diagnosis table and return whether the patients
    had an AKI event during their hospital stay

    KDIGO:
    Increase in serum creatinine by ≥0.3 mg/dL (≥26.5 micromol/L) within 48 hours

    Increase in serum creatinine to ≥1.5 times baseline, which is known or presumed to have occurred within the prior seven days

    Urine volume <0.5 mL/kg/hour for six hours -> do not use -> pretty bad reliability https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8486770/, therefore not implemented

    + “acute renal failure” in the diagnosis chart of the eICU database


    :param df_labs: Dataframe - labs dataframe

    :param df_diagnosis: Dataframe - diagnosis Dataframe

    :param lst_ids: list - list of target patientunitstayids

    :return: Dataframe - one row per entry of lst_ids with the columns
        AKI_crea_rise (1/0, np.nan without creatinine at offset >= 0),
        AKI_crea_rise_offsets ("offset|offset|...", "" if none, np.nan without creatinine at offset >= 0),
        AKI_crea_baseline (1/0, np.nan without any creatinine),
        AKI_diagnosis_ever (1/0), AKI_diagnosis_onadmission (1/0, diagnosis offset <= 1440 minutes),
        AKI_diagnosis_offsets ("offset|offset|...", np.nan without a diagnosis entry)

    """

    # reduce the labs table to the creatinine values of the cohort
    df_ref_big = (df_labs
                  .loc[(df_labs.patientunitstayid.isin(lst_ids)) & (df_labs.labname == "creatinine"),
                       ["patientunitstayid", "labresultoffset", "labresult"]]
                  .dropna()
                  .copy())

    df_pat = pd.DataFrame({"patientunitstayid": lst_ids})

    ## for the acute crea increase
    # The per-patient scan is quadratic in the number of measurements, so it is done only once: the binary
    # flag is derived from the offsets string (np.nan -> np.nan, "" -> 0, "offset|..." -> 1), which gives
    # the same result as map_AKI_crea_rise_all_pat.
    crea_rise_offsets = df_pat["patientunitstayid"].map(lambda x: map_AKI_crea_rise_offsetclm(x, df_ref_big))

    # whether they got an AKI from that
    df_pat["AKI_crea_rise"] = crea_rise_offsets.map(
        lambda offsets: np.nan if pd.isna(offsets) else int(offsets != ""))

    # offsets for the acute crea increase
    df_pat["AKI_crea_rise_offsets"] = crea_rise_offsets

    ## for the increase from baseline
    # whether they got an AKI from that
    df_pat["AKI_crea_baseline"] = df_pat["patientunitstayid"].map(lambda x: map_AKI_crea_baseline(x, df_ref_big))

    ## via the diagnosis table
    # na=False: rows without a diagnosis string can never match and must not break the boolean mask
    is_arf_in_cohort = (df_diagnosis.patientunitstayid.isin(lst_ids)
                        & df_diagnosis["diagnosisstring"].str.contains("acute renal failure", na=False))
    df_arf = df_diagnosis.loc[is_arf_in_cohort, ["patientunitstayid", "diagnosisoffset"]].dropna()

    ## if they ever had an entry to the diagnosis list
    # isin() is used instead of a list lookup per patient, which was quadratic in the cohort size
    df_pat["AKI_diagnosis_ever"] = (df_pat["patientunitstayid"]
                                    .isin(df_arf.patientunitstayid.unique())
                                    .astype("int64"))

    ## whether they already had an AKI on admission (<24h) to the ICU in the diagnosis chart (for further differentiation/exclusion)
    ids_arf_on_admission = df_arf.loc[df_arf.diagnosisoffset <= 1440, "patientunitstayid"].unique()
    df_pat["AKI_diagnosis_onadmission"] = (df_pat["patientunitstayid"]
                                           .isin(ids_arf_on_admission)
                                           .astype("int64"))

    ## offsets for the AKI by diagnosis chart
    df_pat["AKI_diagnosis_offsets"] = df_pat["patientunitstayid"].map(lambda x: map_AKI_diagnosis_offsets(x, df_arf))

    return df_pat

In [24]:
def get_dialysis_dialysisdays(df_intout, df_treat, df_pmh, lst_ids):
    """
    Function that checks if patients got dialysis at some point during the stay and several other variables.
    The returned dataframe contains the columns:

    pmh_dialysis (1/0) - the pmh variable renal failure (pmh_renal_failure from get_pastHistory) mentions "dialysis"
    any_dialysis_ever (1/0) - any dialysis entry in the treatment table or the intakeOutput table
    treatment_chronic_dialysis (1/0) - dialysis "for chronic renal failure" listed in the treatment table
    treatment_peritoneal_dialysis (1/0) - peritoneal dialysis listed in the treatment table
    dialysis_days - number of distinct 24h-periods (offset // 1440, counted from the unit admission) with at
        least one dialysis entry at an offset >= 0; np.nan if the patient has no such entry

    :param df_intout: Dataframe - IntakeOutput table from eICU

    :param df_treat: Dataframe - treatment table from eICU

    :param df_pmh: Dataframe - pasthistory table from eICU

    :param lst_ids: List - list of patientunitstayids

    :return: Dataframe - rows as returned by get_pastHistory (defined elsewhere in this notebook;
        presumably one row per entry of lst_ids - verify there)
    """

    lst_dialysis_treatments = [
        'renal|dialysis|hemodialysis|for chronic renal failure',
        'renal|dialysis|hemodialysis',
        'renal|dialysis|hemodialysis|for acute renal failure',
        'renal|dialysis|hemodialysis|emergent',
        'renal|dialysis|C V V H D',
        'renal|dialysis|insertion of venous catheter for hemodialysis',
        'renal|dialysis|peritoneal dialysis',
        'renal|dialysis|SLED',
        'renal|dialysis|C V V H',
        'renal|dialysis|ultrafiltration (fluid removal only)',
        'renal|dialysis|peritoneal dialysis|emergent',
        'renal|dialysis|peritoneal dialysis|for chronic renal failure',
        'renal|electrolyte correction|treatment of hyperkalemia|dialysis',
        'toxicology|drug overdose|drug removal measures|hemodialysis',
        'renal|dialysis|ultrafiltration (fluid removal only)|for acute renal failure',
        'renal|electrolyte correction|treatment of hyperphosphatemia|dialysis'
    ]  # for the column treatmentstring in df_treatment

    lst_peritoneal_dialysis_treatments = [
        'renal|dialysis|peritoneal dialysis',
        'renal|dialysis|peritoneal dialysis|emergent',
        'renal|dialysis|peritoneal dialysis|for chronic renal failure',
    ]  # for the column treatmentstring in df_treatment

    lst_chronic_dialysis_treatments = [
        'renal|dialysis|hemodialysis|for chronic renal failure',
        'renal|dialysis|peritoneal dialysis|for chronic renal failure',
    ]  # for the column treatmentstring in df_treatment

    lst_dialysis_intout = [
        'flowsheet|Flowsheet Cell Labels|I&O|Dialysis (ml)|In',
        'flowsheet|Flowsheet Cell Labels|I&O|Dialysis (ml)|Out',
        'flowsheet|Flowsheet Cell Labels|I&O|Output (ml)|CRRT - UF removed',
        'flowsheet|Flowsheet Cell Labels|I&O|Intake (ml)|Generic Intake (ml)|Hemodialysis Intake (mL)',
        'flowsheet|Flowsheet Cell Labels|I&O|Output (ml)|CRRT Actual Pt Fluid Removed ("C")',
        'flowsheet|Flowsheet Cell Labels|I&O|Output (ml)|Ultrafiltration',
        'flowsheet|Flowsheet Cell Labels|I&O|Output (ml)|CRRT Out',
        'flowsheet|Flowsheet Cell Labels|I&O|Output (ml)|HemodialysisOut'
    ]  # for the column cellpath in df_intakeOutput

    # reduce all dfs to the cohort
    df_pmh_red = df_pmh[df_pmh["patientunitstayid"].isin(lst_ids)].copy()
    df_treat_red = df_treat[df_treat["patientunitstayid"].isin(lst_ids)].copy()
    df_intout_red = df_intout[df_intout["patientunitstayid"].isin(lst_ids)].copy()

    # get whether patients had a history of renal failure with dialysis
    df_pmh_result = get_pastHistory(df_pmh_red, lst_ids)
    df_pmh_result["pmh_dialysis"] = df_pmh_result.pmh_renal_failure.map(lambda x: 1 if "dialysis" in str(x) else 0)

    # initialize the then final dataframe
    df_final_res = df_pmh_result[["patientunitstayid", "pmh_dialysis"]].copy()

    def _flag_patients(ids_with_entry):
        # 1/0 column marking the patients contained in ids_with_entry; isin() replaces the former
        # Python-list lookup per patient, which was quadratic in the cohort size
        return df_final_res.patientunitstayid.isin(ids_with_entry).astype("int64")

    # dialysis entries of the cohort in both source tables
    df_treat_dialysis = df_treat_red[df_treat_red.treatmentstring.isin(lst_dialysis_treatments)].copy()
    df_intout_dialysis = df_intout_red[df_intout_red.cellpath.isin(lst_dialysis_intout)].copy()

    # calculate whether patients got dialysis at all (either table)
    ids_any_dialysis = (set(df_treat_dialysis.patientunitstayid.unique())
                        | set(df_intout_dialysis.patientunitstayid.unique()))
    df_final_res["any_dialysis_ever"] = _flag_patients(ids_any_dialysis)

    # calculate who was charted as chronic renal failure in the treatments df
    is_chronic = df_treat_red.treatmentstring.isin(lst_chronic_dialysis_treatments)
    df_final_res["treatment_chronic_dialysis"] = _flag_patients(df_treat_red.loc[is_chronic, "patientunitstayid"].unique())

    # add peritoneal dialysis in the treatment df as an extra column
    is_peritoneal = df_treat_red.treatmentstring.isin(lst_peritoneal_dialysis_treatments)
    df_final_res["treatment_peritoneal_dialysis"] = _flag_patients(df_treat_red.loc[is_peritoneal, "patientunitstayid"].unique())

    # calculate on how many days the patients actually got dialysis (entries from the unit admission onwards)
    df_treat_days = (df_treat_dialysis
                     .loc[df_treat_dialysis.treatmentoffset >= 0, ["patientunitstayid", "treatmentoffset"]]
                     .rename(columns={"treatmentoffset": "offset"}))
    df_intout_days = (df_intout_dialysis
                      .loc[df_intout_dialysis.intakeoutputoffset >= 0, ["patientunitstayid", "intakeoutputoffset"]]
                      .rename(columns={"intakeoutputoffset": "offset"}))

    df_days = pd.concat([df_treat_days, df_intout_days], axis=0)

    # 1440 minutes are one day: the offset becomes the index of the 24h-period since the unit admission
    df_days["offset"] = df_days["offset"] // 1440

    df_days_grouped = (df_days
                       .groupby('patientunitstayid')['offset']
                       .nunique()
                       .reset_index()
                       .rename(columns={"offset": "dialysis_days"}))

    # merge the dialysis days column to the final dataframe (np.nan for patients without dialysis entries)
    df_final_res = df_final_res.merge(right=df_days_grouped, how="left", on="patientunitstayid")

    return df_final_res

## Identifying the patients on DOAC

In [25]:
# Load the admission-drug and the medication tables used to identify the patients on DOACs.
# NOTE(review): judging by the file names these are extracts of the eICU admissionDrug and medication
# tables that were reduced beforehand - confirm where the files are written.
df_meds_admission_pneumonia = pd.read_csv("data_pneumonia_doac/admissionDrug_pn_doac.csv", low_memory=False) 
df_active_medication = pd.read_csv("data_pneumonia_doac/medication_pn_doac.csv", low_memory=False)

In [26]:
# DOAC names to search for in the drug tables: every generic name followed by its brand name(s)
lst_doacs = [
    "dabigatran", "pradaxa", "rivaroxaban", "xarelto", "apixaban", "eliquis",
    "edoxaban", "savaysa", "lixiana", "betrixaban", "bevyxxa",
]

# lookup from any DOAC name to its generic name: the brand names come first ...
dict_doacs = {
    "pradaxa": "dabigatran",
    "xarelto": "rivaroxaban",
    "eliquis": "apixaban",
    "savaysa": "edoxaban",
    "lixiana": "edoxaban",
    "bevyxxa": "betrixaban",
}
# ... followed by the generic names, which map onto themselves
dict_doacs.update({generic: generic
                   for generic in ("dabigatran", "rivaroxaban", "apixaban", "edoxaban", "betrixaban")})

# regular expression (alternation) matching any of the names in lst_doacs
lst_doacs_regex = "|".join(lst_doacs)

In [27]:
def map_doac_generic_name(x, dict_doacs):
    """Map a free-text drug name to the generic name of the DOAC it mentions.

    Parameters
    ----------
    x : str
        Drug name as recorded in the drug table (any casing, may contain dose/route text).
        Non-string values (e.g. NaN for a missing name) are accepted and yield NaN.
    dict_doacs : dict
        Lower-case brand and generic names as keys, generic names as values.

    Returns
    -------
    str or float
        The generic name belonging to the first key of ``dict_doacs`` (in dictionary order)
        that occurs in ``x``; ``np.nan`` when no key occurs or ``x`` is not a string.
    """
    # A missing drug name cannot mention a DOAC (and has no .strip())
    if not isinstance(x, str):
        return np.nan

    drug_str = x.strip().lower()

    # Substring matching keeps this function consistent with the str.contains regex filter that
    # selects the rows, and tolerates punctuation glued to the name ("ELIQUIS(apixaban)", "xarelto,").
    # Comparing whole whitespace-separated tokens returned NaN for such rows.
    for doac_name, generic_name in dict_doacs.items():
        if doac_name in drug_str:
            return generic_name

    return np.nan


In [28]:
# Filtering for patients who were reported to receive DOAC prior to their admission to the ICU
# (entries recorded with an offset of less than 48h after ICU admission)
# na=False: rows without a drug name are explicitly treated as "no DOAC" instead of relying on
# pandas' implicit NaN handling when the masks are combined with `&`
mask_admdrug_is_doac = df_meds_admission_pneumonia.drugname.str.contains(lst_doacs_regex, regex=True, case=False, na=False)
mask_admdrug_before_48h = df_meds_admission_pneumonia.drugoffset < (48*60)

df_doacs_admissiondrug_pneumonia = df_meds_admission_pneumonia[mask_admdrug_is_doac & mask_admdrug_before_48h].copy()
df_doacs_admissiondrug_pneumonia["generic_name"] = df_doacs_admissiondrug_pneumonia.drugname.map(lambda x: map_doac_generic_name(x, dict_doacs))


In [29]:
# Unique ICU-stay IDs with a DOAC entry in the admission-drug table (in order of first appearance)
lst_doac_admissiondrug_patstays = list(pd.unique(df_doacs_admissiondrug_pneumonia["patientunitstayid"]))
len(lst_doac_admissiondrug_patstays)

125

125 Patients on DOACs identified from the table admission-drugs

In [30]:
# Extract the HICL-Seq No associated with the DOACs identified in the admissionDrug table for later use and save them as list/dictionary
# (value_counts drops rows with a missing HICL number or generic name and sorts by descending frequency)
df_hicl = df_doacs_admissiondrug_pneumonia[["drughiclseqno", "generic_name"]].value_counts().reset_index()

# If one HICL number were paired with more than one generic name, dict(zip(...)) would keep the LAST
# (least frequent) pairing; keep only the most frequent pairing per HICL number instead
df_hicl = df_hicl.drop_duplicates(subset="drughiclseqno", keep="first")

dict_hicl_doac = dict(zip(df_hicl.drughiclseqno,df_hicl.generic_name))
lst_doac_hicl = list(df_hicl.drughiclseqno.unique())
dict_hicl_doac

{35915: 'rivaroxaban', 37792: 'apixaban', 35604: 'dabigatran'}

In [31]:
# One row per ICU stay (patientunitstayid) with the generic name of its DOAC as recorded in the
# admission drug table; "first" takes the first non-missing generic name of each stay
df_doac_admdrug_for_merge = (
    df_doacs_admissiondrug_pneumonia
    .groupby("patientunitstayid")["generic_name"]
    .agg("first")
    .reset_index()
    .copy()
)

In [32]:
# Filtering for patients who were reported to receive DOAC on admission to the ICU
# (orders with an offset of less than 48h after ICU admission)
mask_med_hicl_is_doac = df_active_medication.drughiclseqno.isin(lst_doac_hicl)
# na=False: an order without a drug name is explicitly "no match by name" instead of relying on
# pandas' implicit NaN handling when the masks are combined with `|` and `&`
mask_med_name_is_doac = df_active_medication.drugname.str.contains(lst_doacs_regex, regex=True, case=False, na=False)
mask_med_before_48h = df_active_medication.drugorderoffset < (60*48)

df_active_medication_doacs = df_active_medication.loc[(mask_med_hicl_is_doac | mask_med_name_is_doac) & mask_med_before_48h, :].copy()

In [33]:
# Unique ICU-stay IDs with a DOAC order in the medication table (in order of first appearance)
lst_doac_active_meds_patstays = list(pd.unique(df_active_medication_doacs["patientunitstayid"]))
len(lst_doac_active_meds_patstays)

186

186 Patients on DOACs identified from the table active medications

In [34]:
# prepare a dataframe with 2 columns patientunitstayid and generic name of the specific DOAC to get/preserve the information which patient received which specific DOAC 
# (recorded on the active medication table)

# Cast the HICL numbers to int only when none is missing: rows selected by drug name alone may have
# no HICL number, and astype(int) raises on NaN
if df_active_medication_doacs["drughiclseqno"].notna().all():
    df_active_medication_doacs["drughiclseqno"] = df_active_medication_doacs["drughiclseqno"].astype(int)

# 1) generic name via the HICL number (NaN when the number is missing or not in the dictionary)
generic_from_hicl = df_active_medication_doacs["drughiclseqno"].map(
    lambda hicl: dict_hicl_doac.get(int(hicl), np.nan) if pd.notna(hicl) else np.nan
)
# 2) fallback via the drug name, so that orders matched by name only also get a generic name
generic_from_name = df_active_medication_doacs["drugname"].map(
    lambda name: map_doac_generic_name(name, dict_doacs) if isinstance(name, str) else np.nan
)
df_active_medication_doacs["generic_name"] = generic_from_hicl.fillna(generic_from_name)

# first non-missing generic name per ICU stay
df_doac_active_meds_for_merge = df_active_medication_doacs.groupby("patientunitstayid")["generic_name"].first().reset_index().copy()

In [35]:
# Union of the ICU-stay IDs found in either table; the set removes IDs present in both
set_doac_admissiondrug_ids = set(lst_doac_admissiondrug_patstays)
set_doac_active_meds_ids = set(lst_doac_active_meds_patstays)
lst_all_doac_patids = list(set_doac_admissiondrug_ids | set_doac_active_meds_ids)
len(lst_all_doac_patids)

273

In total 273 Patients on DOACs identified (who also had CAP)

In [36]:
# combine the information on the individual DOACs taken by the patients from both the admission drug and active medications table
df_individ_doac = (pd.concat([df_doac_active_meds_for_merge, df_doac_admdrug_for_merge])).drop_duplicates()

# drop_duplicates only removes rows that agree in BOTH columns. A stay whose two sources disagree
# (different DOAC, or a DOAC in one table and a missing value in the other) keeps two rows and would
# be duplicated by a later merge on patientunitstayid. The number of unique IDs cannot reveal this,
# so require exactly one row per ID.
assert df_individ_doac.patientunitstayid.is_unique, "conflicting DOAC information for at least one patientunitstayid"

len(df_individ_doac.patientunitstayid.unique())

273

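Same number of unique IDs (273) as in the combined ID list. Note that conflicting information between the two tables would not change this number; it would show up as more than one row for the same ID in `df_individ_doac`.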

In [37]:
# Add the DOAC information to the growing dataframe of extracted data:
# a 0/1 flag for "any DOAC" and the generic name of the specific DOAC
df_info["doac_binary"] = df_info["patientunitstayid"].isin(lst_all_doac_patids).astype("int64")

df_info = pd.merge(df_info, df_individ_doac, how="left", on="patientunitstayid")
df_info = df_info.rename(columns={"generic_name":"specific_doac"})


In [38]:
# Number of ICU admissions per specific DOAC (missing values are not counted)
df_info["specific_doac"].value_counts()

rivaroxaban    160
apixaban        74
dabigatran      39
Name: specific_doac, dtype: int64

## Basic information

In [39]:
# Load the patient extract from the data_pneumonia_doac directory
df_patient_doac = pd.read_csv("data_pneumonia_doac/patient_pn_doac.csv", low_memory=False)

In [40]:
# Extract the basic patient information for the pneumonia stays
# (get_basic_patient_info is defined outside this section; the returned columns are not visible here)
df_pat_res = get_basic_patient_info(df_patient_doac, lst_pat_pneumonia)

In [41]:
# Encode the discharge status and gender columns as 0/1; values outside a mapping become NaN
binary_encodings = {
    "hospitaldischargestatus": {"Expired": 1, "Alive": 0},
    "unitdischargestatus": {"Expired": 1, "Alive": 0},
    "gender": {"Male":1, "Female":0},
}
for column_name, encoding in binary_encodings.items():
    df_pat_res[column_name] = df_pat_res[column_name].map(encoding)

In [42]:
# Left-join the basic patient information onto the growing final dataframe (all rows of df_info are kept)
df_info = pd.merge(df_info, df_pat_res, how="left", on="patientunitstayid")

## Comorbidities


In [43]:
# Load the past-history extract from the data_pneumonia_doac directory
df_pmh_doac = pd.read_csv("data_pneumonia_doac/pastHistory_pn_doac.csv", low_memory=False)

In [44]:
# Extract the past-medical-history variables (pmh_* columns) for the pneumonia stays
# (get_pastHistory is defined outside this section)
df_pmh_result = get_pastHistory(df_pmh_doac, lst_pat_pneumonia)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:24<00:00,  1.25s/it]


In [45]:
# Processing the PMH variables (aggregating/binarizing)

def _pmh_present(value):
    """Return True when a past-history entry is recorded, i.e. it is neither 0 nor the string "0".

    NOTE(review): a missing value (NaN) is neither 0 nor "0" and therefore counts as recorded -
    confirm that get_pastHistory never returns NaN in the pmh_* columns.
    """
    return not (value == 0 or value == "0")


def _pmh_any_present(df, columns):
    """Row-wise 0/1 indicator (int64): 1 when at least one of `columns` holds a recorded entry."""
    return df[columns].apply(lambda column: column.map(_pmh_present)).any(axis=1).astype('int64')


# Every indicator uses the same definition of "absent" (0 or "0"). Previously some columns were
# binarized with `x != 0`, which counts the string "0" as a recorded entry.
df_pmh_result['pmh_PE_binary'] = _pmh_any_present(df_pmh_result, ['pmh_PE'])
df_pmh_result['pmh_venous_thrombosis_binary'] = _pmh_any_present(df_pmh_result, ['pmh_venous_thrombosis'])
df_pmh_result['pmh_strokes_binary'] = _pmh_any_present(df_pmh_result, ['pmh_strokes'])
df_pmh_result['pmh_mmunosuppression_last_6m_binary'] = _pmh_any_present(df_pmh_result, ['pmh_mmunosuppression_last_6m'])
df_pmh_result['pmh_CHF_binary'] = _pmh_any_present(df_pmh_result, ['pmh_CHF'])
df_pmh_result['pmh_card_valvular_binary'] = _pmh_any_present(df_pmh_result, ['pmh_card_valvular'])
# atrial fibrillation is one of possibly several arrhythmia entries -> substring search
df_pmh_result["pmh_afib"] = df_pmh_result.pmh_arrhythmias.map(lambda x: 1 if "atrial fibrillation" in str(x) else 0).astype('int64')
df_pmh_result["pmh_HT_binary"] = _pmh_any_present(df_pmh_result, ['pmh_HT_with_treatment'])
# coronary artery disease: any of MI, PCI, bypass surgery or angina
df_pmh_result["pmh_coronary_artery_disease"] = _pmh_any_present(df_pmh_result, ['pmh_MI', 'pmh_PCI', 'pmh_CA_bypass', 'pmh_angina'])
df_pmh_result['pmh_home_o2_binary'] = _pmh_any_present(df_pmh_result, ['pmh_home_o2'])
df_pmh_result["pmh_copd_binary"] = _pmh_any_present(df_pmh_result, ['pmh_COPD'])
df_pmh_result["pmh_asthma_binary"] = _pmh_any_present(df_pmh_result, ['pmh_asthma'])
# obstructive lung disease: COPD or asthma
df_pmh_result["pmh_obstructive_LD"] = _pmh_any_present(df_pmh_result, ['pmh_copd_binary', 'pmh_asthma_binary'])
df_pmh_result['pmh_cancer_binary'] = _pmh_any_present(df_pmh_result, ['pmh_cancer'])
df_pmh_result["pmh_diabetes_binary"] = _pmh_any_present(df_pmh_result, ['pmh_non_insulin_dep_DM', 'pmh_insulin_dep_DM'])
# history of hemodialysis or peritoneal dialysis (used as an exclusion criterion later)
df_pmh_result["exclude_pmh_dialysis"] = df_pmh_result.pmh_renal_failure.map(lambda x: 1 if any(s in str(x) for s in ["hemodialysis", "peritoneal dialysis"]) else 0).astype('int64')
# prior renal insufficiency: renal failure without dialysis (exact match of the entry) or any renal insufficiency entry
df_pmh_result["pmh_prior_renal_insufficiency"] = (
    (df_pmh_result.pmh_renal_failure == "renal failure- not currently dialyzed")
    | df_pmh_result.pmh_renal_insuff.map(_pmh_present)
).astype('int64')

In [46]:
# Left-join the past-medical-history variables onto the growing final dataframe
df_info = pd.merge(df_info, df_pmh_result, how="left", on="patientunitstayid")

## APACHE variables and outcomes

In [47]:
# Load the three APACHE extracts (patient result, APS variables, prediction variables)
# from the data_pneumonia_doac directory
df_apache_res_doac = pd.read_csv("data_pneumonia_doac/apachePatientResult_pn_doac.csv", low_memory=False)
df_apache_aps_doac = pd.read_csv("data_pneumonia_doac/apacheApsVar_pn_doac.csv", low_memory=False)
df_apache_pred_doac = pd.read_csv("data_pneumonia_doac/apachePredVar_pn_doac.csv", low_memory=False)

In [48]:
# Clean/extract the APACHE variables for the pneumonia stays, one result frame per APACHE table
# (the three helper functions are defined outside this section)
df_apache_res_res = get_cleaned_apachePatientResult_basics(df_apache_res_doac, lst_pat_pneumonia)
df_apache_aps_res = get_and_initial_clean_apacheApsVar(df_apache_aps_doac, lst_pat_pneumonia)
df_apache_pred_res = get_cleaned_apachePredVar_basics(df_apache_pred_doac, lst_pat_pneumonia)

In [49]:
# Left-join the three APACHE result frames, one after the other, onto the growing final dataframe
df_info = pd.merge(df_info, df_apache_res_res, how="left", on="patientunitstayid")
df_info = pd.merge(df_info, df_apache_aps_res, how="left", on="patientunitstayid")
df_info = pd.merge(df_info, df_apache_pred_res, how="left", on="patientunitstayid")

In [50]:
# Ventilation days only for stays with hospitaldischargestatus == 0 (alive); NaN for everyone else
survived_hospital = df_info["hospitaldischargestatus"] == 0
df_info["vent_days_alive"] = df_info["unabridgedactualventdays"].where(survived_hospital, np.nan)

## BUN

In [51]:
# Load the lab extract from the data_pneumonia_doac directory
df_labs = pd.read_csv("data_pneumonia_doac/lab_pn_doac.csv", low_memory=False)

In [52]:
# Per lab: [possible borders, "realistic" borders, aggregation method]
# -> for BUN the maximum, i.e. the worst value on admission
dict_labs = dict(BUN=[(0, 1000), (1, 200), "max"])

# Labs with an offset between -1440 and 1440 minutes (24h before to 24h after ICU admission)
df_labs_results = fast_clean_labs_general_offset(
    df_labs=df_labs,
    lst_ids=lst_pat_pneumonia,
    dict_labs_info=dict_labs,
    offsets=(-1440, 1440),
)


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.70it/s]


In [53]:
# Left-join the lab results onto the growing final dataframe
df_info = pd.merge(df_info, df_labs_results, how="left", on="patientunitstayid")

## Vital signs

In [54]:
# Load the periodic and aperiodic vital-sign extracts from the data_pneumonia_doac directory
df_periodic = pd.read_csv("data_pneumonia_doac/vitalPeriodic_pn_doac.csv", low_memory=False)
df_a_periodic = pd.read_csv("data_pneumonia_doac/vitalAperiodic_pn_doac.csv", low_memory=False)

In [55]:
# Per vital sign: [bounds outside of which values are excluded as outliers, aggregation method]
dict_vitals_periodic = {
    "heartrate": [(20, 220), "max"],
    "respiration": [(3, 80), "max"],
    "sao2": [(60, 100), "min"]
}

dict_vitals_combined = {
    "systolic": [(20, 250), "min"],
    "diastolic": [(5, 180), "min"],
}

# "periodic" vitals: one result frame per vital sign
lst_clms_doac_vitals = [
    fast_vitals_periodic(df_periodic=df_periodic,
                         lst_ids=lst_pat_pneumonia,
                         vital_name=vital_name,
                         realistic_bounds=bounds,
                         offset=(0, 1440),
                         agg_total=aggregation,
                         timeunit=30)
    for vital_name, (bounds, aggregation) in tqdm(dict_vitals_periodic.items())
]

# "systolic" and "diastolic" use both the periodic and the aperiodic datafiles
lst_clms_doac_vitals.extend(
    fast_vitals_combined(df_periodic=df_periodic,
                         df_aperiodic=df_a_periodic,
                         lst_ids=lst_pat_pneumonia,
                         vital_name=vital_name,
                         realistic_bounds=bounds,
                         offset=(0, 1440),
                         agg_total=aggregation,
                         timeunit=30)
    for vital_name, (bounds, aggregation) in tqdm(dict_vitals_combined.items())
)

# align all result frames on the stay ID and put them side by side
lst_clms_doac_vitals_indexed = [df_vital.set_index("patientunitstayid") for df_vital in lst_clms_doac_vitals]
df_final_vitals_doac = pd.concat(lst_clms_doac_vitals_indexed, axis=1).reset_index()

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:37<00:00, 72.38s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:59<00:00, 59.71s/it]


In [56]:
# Left-join the vital-sign results onto the growing final dataframe
df_info = pd.merge(df_info, df_final_vitals_doac, how="left", on="patientunitstayid")

## ICU-free-days

In [57]:
# Compute the ICU-free days for the pneumonia stays from the patient table
# (get_icu_free_days is defined outside this section)
df_icu_free_days = get_icu_free_days(df_patient_doac, lst_pat_pneumonia)

In [58]:
# Left-join the ICU-free days onto the growing final dataframe
df_info = pd.merge(df_info, df_icu_free_days, how="left", on="patientunitstayid")

## Vasopressor/Inotrope infusions

In [59]:
# Load the infusion-drug extract from the data_pneumonia_doac directory
df_infusion_doac = pd.read_csv("data_pneumonia_doac/infusionDrug_pn_doac.csv", low_memory=False)

In [60]:
# Extract the vasopressor/inotrope infusions and move the index back into a regular column
df_vaso_ino_d1 = get_infusion_drugs(df_infusion_doac, lst_pat_pneumonia).reset_index()

In [61]:
# Left-join the vasopressor/inotrope information onto the growing final dataframe
df_info = pd.merge(df_info, df_vaso_ino_d1, how="left", on="patientunitstayid")

## Time-to-death

In [62]:
# Compute the time to death from unit admission using the patient table and the APACHE patient results
# (time_to_death_from_unit_admission is defined outside this section)
df_time_to_death = time_to_death_from_unit_admission(df_pat=df_patient_doac, df_apache_res=df_apache_res_doac, lst_ids=lst_pat_pneumonia)

In [63]:
# Left-join the time-to-death information onto the growing final dataframe
df_info = pd.merge(df_info, df_time_to_death, how="left", on="patientunitstayid")

## AKI

In [64]:
# Load the diagnosis extract from the data_pneumonia_doac directory
df_diagnosis_doac = pd.read_csv("data_pneumonia_doac/diagnosis_pn_doac.csv", low_memory=False)

In [65]:
# Identify AKI from labs and diagnoses; empty strings in the result are turned into missing values
df_aki_result = find_AKI_pat(df_labs, df_diagnosis_doac, lst_pat_pneumonia).replace('', np.nan)

In [66]:
# processing to see who had an AKI on admission/on their first day and who developed an AKI during the ICU-/hospital stay
def map_aki_crea_rise_firstday(x):
    """Flag whether a creatinine-rise AKI occurred no later than 24h (1440 minutes) after ICU admission.

    x is a "|"-separated string of offsets in minutes, or a missing value.
    Returns 1 if at least one offset is <= 1440, 0 if none is, and NaN if x is missing.
    """
    if pd.isna(x):
        return np.nan

    # convert every offset up front, so a malformed entry raises regardless of its position
    offsets = [int(token) for token in x.split("|")]

    return 1 if any(offset <= 1440 for offset in offsets) else 0


# AKI defined by a rise in creatinine on the first day on ICU (NaN when no offsets are recorded)
df_aki_result["crea_rise_first_day"] = df_aki_result["AKI_crea_rise_offsets"].map(map_aki_crea_rise_firstday)

# 1 when the AKI was already diagnosed on admission or the creatinine rise happened on the first day
mask_early_aki = (df_aki_result["AKI_diagnosis_onadmission"] == 1) | (df_aki_result["crea_rise_first_day"] == 1)
df_aki_result["fist_day_or_admission_aki"] = mask_early_aki.astype("int64")

# 1 when there is no information regarding AKI at all (no creatinine levels)
mask_no_crea_info = df_aki_result["AKI_crea_rise"].isna() & df_aki_result["AKI_crea_baseline"].isna()
df_aki_result["NA_in_crea_rise_and_baseline"] = mask_no_crea_info.astype("int64")

# final AKI column: 0 = no AKI, 1 = any type of AKI,
# NaN = AKI on the first day/on admission or no data available
mask_any_aki = (
    (df_aki_result["AKI_crea_rise"] == 1)
    | (df_aki_result["AKI_crea_baseline"] == 1)
    | (df_aki_result["AKI_diagnosis_ever"] == 1)
)
mask_not_evaluable = (df_aki_result["NA_in_crea_rise_and_baseline"] == 1) | (df_aki_result["fist_day_or_admission_aki"] == 1)

df_aki_result["aki_wo_firstday_or_admission"] = 0
df_aki_result.loc[mask_any_aki, "aki_wo_firstday_or_admission"] = 1
df_aki_result.loc[mask_not_evaluable, "aki_wo_firstday_or_admission"] = np.nan


In [67]:
# Left-join the AKI results onto the growing final dataframe
df_info = pd.merge(df_info, df_aki_result, how="left", on="patientunitstayid")

## Intubated on admission

In [68]:
def apply_intub_in_hosp_or_ICU(x):
    """Classify the intubation status of a stay during the first 24h after admission.

    Returns NaN when pred_oobintubday1 is missing, 0 when it is 0, and for a value of 1
    either 1 (unitadmitsource is one of the in-house sources listed below) or -1 (any other
    source, e.g. other hospital/other ICU: intubated before reaching this unit/hospital).
    Any other value of pred_oobintubday1 yields None.
    """
    intub_day1 = x.pred_oobintubday1

    if pd.isna(intub_day1):
        return np.nan
    if intub_day1 == 0:
        return 0
    if intub_day1 != 1:
        return None

    in_house_sources = (
        'Emergency Department', 'Floor', 'Direct Admit', 'Step-Down Unit (SDU)',
        'Acute Care/Floor', 'Recovery Room', 'PACU',
    )
    return 1 if x.unitadmitsource in in_house_sources else -1

# Row-wise classification of the intubation status (NaN / 0 / 1 / -1)
df_info["fresh_intub_on_admission"] = df_info.apply(apply_intub_in_hosp_or_ICU, axis=1)

## RRT

In [69]:
# Load the treatment and intake/output extracts from the data_pneumonia_doac directory
df_treatment_doac = pd.read_csv("data_pneumonia_doac/treatment_pn_doac.csv", low_memory=False)
df_intout_doac = pd.read_csv("data_pneumonia_doac/intakeOutput_pn_doac.csv", low_memory=False)

In [70]:
# Derive the dialysis variables (incl. the number of dialysis days) from the intake/output,
# treatment and past-history tables (get_dialysis_dialysisdays is defined outside this section)
df_dialysis_res = get_dialysis_dialysisdays(df_intout=df_intout_doac, df_treat=df_treatment_doac, df_pmh=df_pmh_doac, lst_ids=lst_pat_pneumonia)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:24<00:00,  1.24s/it]


In [71]:
def apply_only_new_dialysis(x):
    """Classify whether a stay started new RRT on the ICU or was on chronic dialysis.

    Returns 0 when any_dialysis_ever is 0; for any_dialysis_ever == 1 returns 1 when
    pmh_dialysis, treatment_chronic_dialysis and treatment_peritoneal_dialysis are all 0
    and -1 otherwise; returns NaN for any other value of any_dialysis_ever.
    """
    if x.any_dialysis_ever == 0:
        return 0
    if x.any_dialysis_ever != 1:
        return np.nan

    chronic_dialysis_flags = ("pmh_dialysis", "treatment_chronic_dialysis", "treatment_peritoneal_dialysis")
    # evaluated lazily and in this order, stopping at the first flag that is not 0
    no_chronic_dialysis = all(getattr(x, flag) == 0 for flag in chronic_dialysis_flags)
    return 1 if no_chronic_dialysis else -1

# Row-wise classification: 0 = no dialysis, 1 = new dialysis, -1 = chronic dialysis, NaN = unknown
df_dialysis_res["new_acute_dialysis"] = df_dialysis_res.apply(apply_only_new_dialysis, axis=1)

In [72]:
# Left-join the dialysis results onto the growing final dataframe
df_info = pd.merge(df_info, df_dialysis_res, how="left", on="patientunitstayid")

# Patient exclusion

In [73]:
# Number of ICU stays before any exclusion
df_info.shape[0]

11317

In [74]:
# Exclude any non-adult patients (a missing age does not satisfy `>= 18` and is dropped as well)
mask_adult = df_info["age"].ge(18)
df_info_excl_age = df_info.loc[mask_adult].copy()

len(df_info_excl_age)

11305

In [75]:
# Exclude any patients who were intubated before arriving at this hospital/ICU or who had a history of
# dialysis (chronic or peritoneal); stays with a missing value in one of these columns are kept
mask_intubated_before_arrival = df_info_excl_age["fresh_intub_on_admission"] == (-1)
mask_pmh_dialysis = df_info_excl_age["exclude_pmh_dialysis"] == 1
mask_chronic_dialysis = df_info_excl_age["treatment_chronic_dialysis"] == 1
mask_peritoneal_dialysis = df_info_excl_age["treatment_peritoneal_dialysis"] == 1

mask_excluded = mask_intubated_before_arrival | mask_pmh_dialysis | mask_chronic_dialysis | mask_peritoneal_dialysis
df_info_excl_outcomes = df_info_excl_age.loc[~mask_excluded, :].copy()

len(df_info_excl_outcomes)

10765

In [76]:
# Exclude any patients with missing data in the survival outcomes and the propensity score matching
# variables (PSM requires complete data). The required columns are grouped by topic.
clms_required_outcome_and_admission = [
    'hospitaldischargestatus', 'unitdischargestatus', 'apacheadmissiondx', 'gender',
]
clms_required_pmh = [
    'pmh_coronary_artery_disease', 'pmh_CHF_binary', 'pmh_afib', 'pmh_obstructive_LD',
    'pmh_home_o2_binary', 'pmh_diabetes_binary', 'pmh_prior_renal_insufficiency',
    'pmh_cancer_binary', 'pmh_HT_binary', 'pmh_card_valvular_binary', 'pmh_PE_binary',
    'pmh_venous_thrombosis_binary', 'pmh_strokes_binary', 'pmh_mmunosuppression_last_6m_binary',
]
clms_required_severity = [
    'age', 'apachescore', 'BUN_-1440to1440_max',
    'respiration_max_0to1440_u30', 'systolic_min_0to1440_u30', 'heartrate_max_0to1440_u30',
    'sao2_min_0to1440_u30', 'diastolic_min_0to1440_u30',
    'aps_GCS',
]
clms_na_drop = [*clms_required_outcome_and_admission, *clms_required_pmh, *clms_required_severity]

# keep only the stays that are complete in every required column
df_info_excl_missingdata = df_info_excl_outcomes.dropna(subset=clms_na_drop)

len(df_info_excl_missingdata)

8118

# Data export

In [77]:
# Write the final cohort (after all exclusions) to CSV without the dataframe index
df_info_excl_missingdata.to_csv("data_pneumonia_doac/doac_data_extraction_final.csv", index=False)