In [93]:
# import libraries
import os
import pandas as pd
import glob
import warnings

# define file paths
year = 2024
DATA_PATH = f"./presumed_infection/{year}/"

# assumption: each lab data has ./csv folder (or ./csv_file for bili_plt), bili_plt split into bili and plt

In [94]:
def merge_demographic(year): # merge demographic data for a specific year
    OUTPUT_PATH = DATA_PATH + "demographic/"
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    pattern = "*_presumedInfection_Demographic_"
    print(f"Processing year: {year}")
    files = glob.glob(OUTPUT_PATH + f"{pattern}{year}*.csv")
    if not files:
        print(f"No files found for year {year} with pattern {pattern}")
    print(f"Found {len(files)} files for year {year}")
    dfs = []
    for file in files:
        try:
            df = pd.read_csv(file)
            if 'Year' not in df.columns: # add year column for identification
                df['Year'] = year

            if '28_day_mortality' not in df.columns: # calculate 28-day mortality
                df['Admission_Date'] = pd.to_datetime(df['Admission Date (yyyy-mm-dd)'], errors='coerce')
                df['Death_Date'] = pd.to_datetime(df['Date of Registered Death'], errors='coerce')
                mask = df['Admission_Date'].notna() & df['Death_Date'].notna()
                df.loc[mask, 'Days_To_Death'] = (df.loc[mask, 'Death_Date'] - df.loc[mask, 'Admission_Date']).dt.days
                df['28_day_mortality'] = (df['Days_To_Death'] <= 28) & (df['Days_To_Death'] >= 0)
                df['28_day_mortality'] = df['28_day_mortality'].fillna(False)
                df = df.drop(['Admission_Date', 'Death_Date', 'Days_To_Death'], axis=1, errors='ignore')

            if '7_day_mortality' not in df.columns: # calculate 7-day mortality
                df['Admission_Date'] = pd.to_datetime(df['Admission Date (yyyy-mm-dd)'], errors='coerce')
                df['Death_Date'] = pd.to_datetime(df['Date of Registered Death'], errors='coerce')
                mask = df['Admission_Date'].notna() & df['Death_Date'].notna()
                df.loc[mask, 'Days_To_Death'] = (df.loc[mask, 'Death_Date'] - df.loc[mask, 'Admission_Date']).dt.days
                df['7_day_mortality'] = (df['Days_To_Death'] <= 7) & (df['Days_To_Death'] >= 0)
                df['7_day_mortality'] = df['7_day_mortality'].fillna(False)
                df = df.drop(['Admission_Date', 'Death_Date', 'Days_To_Death'], axis=1, errors='ignore')
                
            dfs.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        output_file = os.path.join(OUTPUT_PATH, f"demographic_{year}.csv")
        combined_df.to_csv(output_file, index=False)
        print(f"Saved combined data for year {year} to {output_file}, {len(combined_df)} total rows")

In [95]:
def merge_data(data_type, year): # merge data for a specific year (inotrope, procedure, creatinine, bili, plt, lact)
    if data_type == "inotrope" or data_type == "procedure":
        OUTPUT_PATH = DATA_PATH + f"{data_type}/csv/"
        pattern = "*_presumedInfection_" + data_type.title() + "_"
    elif data_type == "creatinine": # alternative filename handling for creatinine
        OUTPUT_PATH = DATA_PATH + f"{data_type}/csv/"
        pattern = "*_presumedInfection_Crea_"
    elif data_type == "lact": # alternative filename handling for lact
        OUTPUT_PATH = DATA_PATH + "Lact/csv/"
        pattern = "*_presumedInfection_lactate_"
    else: # alternative filename handling for bili/plt
        OUTPUT_PATH = DATA_PATH + f"Bili_Plt/csv_file/{data_type}/"
        pattern = "*_presumedInfection_Bili_Plt_"

    print(f"Processing year: {year}")
    files = glob.glob(OUTPUT_PATH + f"{pattern}{year}*.csv")
    if not files:
        print(f"No files found for year {year} with pattern {pattern}")
    print(f"Found {len(files)} files for year {year}")
    dfs = []
    for file in files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        output_file = os.path.join(OUTPUT_PATH, f"{data_type}_{year}.csv")
        combined_df.to_csv(output_file, index=False)
        print(f"Saved combined data for year {year} to {output_file}, {len(combined_df)} total rows")

In [96]:
def patient_date_deadline(Ref_Key, date_df): # Ref Key is numeric not string
    df = date_df[date_df['Reference Key'] == Ref_Key].copy()
    if 'Admission Date (yyyy-mm-dd)' in df.columns and not df.empty:
        admission_date = pd.to_datetime(df.iloc[0]['Admission Date (yyyy-mm-dd)'], errors='coerce')
        deadline_date = admission_date + pd.Timedelta(days=2) # 2 days after admission (deadline)
        return deadline_date.strftime('%Y-%m-%d')
    return None

In [97]:
def check_vasopressor(Ref_Key, df, deadline): # check if new vasopressor is administered
    # noradrenaline is norepinephrine, adrenaline is epinephrine, vasopressin missing
    vasopressor_list = ["NORADRENALINE (BITARTRATE)", "DOPAMINE HCL", "ADRENALINE (ACID TARTRATE)", "PHENYLEPHRINE HCL", "VASOPRESSIN"] # contains the string
    df = df[df['Reference Key'] == Ref_Key].copy()
    if 'Drug Name' in df.columns and 'Prescription Start Date' in df.columns and not df.empty:
        if deadline:
            deadline_date = pd.to_datetime(deadline)
            df['Prescription Start Date'] = pd.to_datetime(df['Prescription Start Date'], errors='coerce')
            for drug in vasopressor_list:
                if df['Drug Name'].str.contains(drug, case=True, na=False, regex=False).any():
                    mask = df['Drug Name'].str.contains(drug, case=True, na=False, regex=False) & (df['Prescription Start Date'] <= deadline_date)
                    if mask.any():
                        return True
    # check: cannot be administered >2 days before blood culture date
    return False

In [None]:
def check_ventilation(Ref_Key, df, deadline): # check if mechanical ventilation is initiated
    procedure_list = ["Invasive mechanical ventilation (96.70:0)", "Non-invasive mechan ventilation (93.99:2)"] # matches, have other names? Intermitt +ve pressure ventilation (93.91:2) counts?
    df = df[df['Reference Key'] == Ref_Key].copy()
    # deadline = patient_date_deadline(Ref_Key, year) # not used
    for procedure in procedure_list:
        for i in range(1, 16):
            if df[f'Procedure HAMDCT Description (rank {i})'] == procedure:
                return True
    # check: >1 day between ventilation episodes
    return False

In [None]:
def filter_renal(): # filter end-stage renal disease patients
    DXPX_PATH = DATA_PATH + f"demographic/demographic_{year}.csv"
    df = pd.read_csv(DXPX_PATH)
    disease_list = []
    if 'Reference Key' in df.columns and 'Dx/Px Description (HAMDCT)' in df.columns:
        df['renal'] = df['Dx/Px Description (HAMDCT)'].str.lower().str.contains('|'.join(disease_list), case=False, na=False) 
        result_df = df[['Reference Key', 'renal']].drop_duplicates()       
        return result_df
    else:
        print("Required columns not found")
        return pd.DataFrame(columns=['Reference Key', 'renal'])
    
# only keeps first occurence(?)

In [None]:
def check_creatinine(Ref_Key, df, deadline): # check if creatinine level is abnormal
    renal_df = filter_renal()
    if not renal_df.empty and Ref_Key in renal_df['Reference Key'].values:
        if renal_df.loc[renal_df['Reference Key'] == Ref_Key, 'renal'].iloc[0]:
            return False
    df = df[df['Reference Key'] == Ref_Key].copy()
    if 'LIS Result: Numeric Result' in df.columns and not df.empty:
        df['LIS Reference Datetime'] = pd.to_datetime(df['LIS Reference Datetime'], errors='coerce')
        df['LIS Reference Date'] = df['LIS Reference Datetime'].dt.date
        if deadline:
            baseline = df['LIS Result: Numeric Result'].min() # baseline calculated with all data
            deadline_date = pd.to_datetime(deadline).date()
            df = df[df['LIS Reference Date'] <= deadline_date] # filter data before deadline
            if not df.empty:
                for value in df['LIS Result: Numeric Result']:
                    if value >= baseline * 2:
                        return True
    return False

In [100]:
def check_bili(Ref_Key, df, deadline): # check if bilirubin level is abnormal
    df = df[df['Reference Key'] == Ref_Key].copy()
    if 'LIS Result: Numeric Result' in df.columns and not df.empty:
        df['LIS Reference Datetime'] = pd.to_datetime(df['LIS Reference Datetime'], errors='coerce')
        df['LIS Reference Date'] = df['LIS Reference Datetime'].dt.date
        if deadline:
            baseline = df['LIS Result: Numeric Result'].min() # baseline calculated with all data
            deadline_date = pd.to_datetime(deadline).date()
            df = df[df['LIS Reference Date'] <= deadline_date] # filter data before deadline
            if not df.empty:
                for value in df['LIS Result: Numeric Result']:
                    value_mgdl = value * 0.058467  # Convert umol/L to mg/dL
                    baseline_mgdl = baseline * 0.058467
                    if value_mgdl >= 2.0 and value_mgdl >= baseline_mgdl * 2:
                        return True
    return False

In [101]:
def check_plt(Ref_Key, df, deadline): # check if platelet count is abnormal
    df = df[df['Reference Key'] == Ref_Key].copy()
    if 'LIS Result: Numeric Result' in df.columns and not df.empty:
        df['LIS Reference Datetime'] = pd.to_datetime(df['LIS Reference Datetime'], errors='coerce')
        df['LIS Reference Date'] = df['LIS Reference Datetime'].dt.date
        if deadline:
            deadline_date = pd.to_datetime(deadline).date()
            baseline = df['LIS Result: Numeric Result'].max() # baseline calculated with all data
            df = df[df['LIS Reference Date'] <= deadline_date] # filter data before deadline
            if not df.empty:
                if baseline >= 100:  # baseline must be ≥100 cells/µL
                    for value in df['LIS Result: Numeric Result']:
                        if value < 100 and value <= baseline * 0.5:
                            return True
    return False

In [102]:
def check_lact(Ref_Key, df, deadline): # check if lactate level is abnormal
    df = df[df['Reference Key'] == Ref_Key].copy()
    if 'LIS Result: Numeric Result' in df.columns and not df.empty:
        df['LIS Reference Datetime'] = pd.to_datetime(df['LIS Reference Datetime'], errors='coerce')
        df['LIS Reference Date'] = df['LIS Reference Datetime'].dt.date
        if deadline:
            deadline_date = pd.to_datetime(deadline).date()
            df = df[df['LIS Reference Date'] <= deadline_date] # filter data before deadline
            if not df.empty:
                for value in df['LIS Result: Numeric Result']:
                    if value >= 2.0:
                        return True
    return False

In [92]:
def check_ase(year):
    KEY_PATH = DATA_PATH + f"demographic/demographic_{year}.csv" # assuming demographic file contains all Reference Keys
    df = pd.read_csv(KEY_PATH )
    keys = df['Reference Key'].unique()
    DATE_PATH = DATA_PATH + f"demographic/demographic_{year}.csv" # admission date placeholder for blood culture date
    date_df = pd.read_csv(DATE_PATH) # deadline date
    INOTROPE_PATH = DATA_PATH + f"inotrope/csv/inotrope_{year}.csv"
    inotrope_df = pd.read_csv(INOTROPE_PATH)
    PROCEDURE_PATH = DATA_PATH + f"procedure/csv/procedure_{year}.csv" # no date?
    procedure_df = pd.read_csv(PROCEDURE_PATH)
    CREATININE_PATH = DATA_PATH + f"creatinine/csv/creatinine_{year}.csv"
    creatinine_df = pd.read_csv(CREATININE_PATH)
    BILI_PATH = DATA_PATH + f"Bili_Plt/csv_file/bili/bili_{year}.csv"
    bili_df = pd.read_csv(BILI_PATH)
    PLT_PATH = DATA_PATH + f"Bili_Plt/csv_file/plt/plt_{year}.csv"
    plt_df = pd.read_csv(PLT_PATH)
    LACT_PATH = DATA_PATH + f"Lact/csv/lact_{year}.csv"
    lact_df = pd.read_csv(LACT_PATH)
    
    results = []
    for key in keys: 
        deadline = patient_date_deadline(key, date_df) # calculate deadline
        if check_vasopressor(key, inotrope_df, deadline): # check vasopressor
            results.append({'Reference Key': key, 'vasopressor': True})
        else:
            results.append({'Reference Key': key, 'vasopressor': False})

        if check_ventilation(key, procedure_df, deadline): # check ventilation
            results[-1]['ventilation'] = True
        else:
            results[-1]['ventilation'] = False

        if check_creatinine(key, creatinine_df, deadline): # check creatinine
            results[-1]['creatinine'] = True
        else:
            results[-1]['creatinine'] = False

        if check_bili(key, bili_df, deadline): # check bilirubin
            results[-1]['bili'] = True
        else:
            results[-1]['bili'] = False

        if check_plt(key, plt_df, deadline): # check platelet
            results[-1]['plt'] = True
        else: 
            results[-1]['plt'] = False

        if check_lact(key, lact_df, deadline): # check lactate
            results[-1]['lact'] = True
        else: 
            results[-1]['lact'] = False
        
        print(f"Checked Reference Key: {key}")

    results_df = pd.DataFrame(results)
    output_file = os.path.join(DATA_PATH, f'ase_{year}.csv')
    results_df.to_csv(output_file, index=False)
    print(f"ASE vasopressor results saved to {output_file}")

In [17]:
merge_demographic(year)
data_types = ["inotrope", "procedure", "creatinine", "bili", "plt", "lact"]
for data_type in data_types:
    merge_data(data_type, year)

Processing year: 2024
Found 35 files for year 2024
Saved combined data for year 2024 to ./presumed_infection/2024/demographic/demographic_2024.csv, 714449 total rows
Processing year: 2024
Found 34 files for year 2024
Saved combined data for year 2024 to ./presumed_infection/2024/inotrope/csv/inotrope_2024.csv, 147188 total rows
Processing year: 2024
Found 35 files for year 2024
Saved combined data for year 2024 to ./presumed_infection/2024/procedure/csv/procedure_2024.csv, 214215 total rows
Processing year: 2024
Found 35 files for year 2024
Saved combined data for year 2024 to ./presumed_infection/2024/creatinine/csv/creatinine_2024.csv, 1210378 total rows
Processing year: 2024
Found 35 files for year 2024
Saved combined data for year 2024 to ./presumed_infection/2024/Bili_Plt/csv_file/bili/bili_2024.csv, 857258 total rows
Processing year: 2024
Found 35 files for year 2024
Saved combined data for year 2024 to ./presumed_infection/2024/Bili_Plt/csv_file/plt/plt_2024.csv, 1059032 total r

In [103]:
warnings.filterwarnings('ignore')
check_ase(2024)

Checked Reference Key: 13666
Checked Reference Key: 14191
Checked Reference Key: 15199
Checked Reference Key: 16095
Checked Reference Key: 18626
Checked Reference Key: 18903
Checked Reference Key: 20209
Checked Reference Key: 25981
Checked Reference Key: 26473
Checked Reference Key: 27600
Checked Reference Key: 28117
Checked Reference Key: 31709
Checked Reference Key: 33364
Checked Reference Key: 35369
Checked Reference Key: 35830
Checked Reference Key: 38276
Checked Reference Key: 38752
Checked Reference Key: 39561
Checked Reference Key: 40617
Checked Reference Key: 41122
Checked Reference Key: 42540
Checked Reference Key: 48714
Checked Reference Key: 51655
Checked Reference Key: 51682
Checked Reference Key: 51994
Checked Reference Key: 53101
Checked Reference Key: 54297
Checked Reference Key: 56368
Checked Reference Key: 57364
Checked Reference Key: 57407
Checked Reference Key: 58065
Checked Reference Key: 59761
Checked Reference Key: 61749
Checked Reference Key: 64332
Checked Refere