In [None]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from tqdm import tqdm

In [None]:
# Input CSV files read by this notebook.
DATABASE_ADDRESS = "../Data/raw/chartevents.csv"
PROCEDURE_EVENTS_ADDRESS = "../Data/raw/procedureevents.csv"
CODE_MAPPING_ADDRESS = "../Data/raw/code_mappings.csv"

# Number of chart-event rows read per chunk (passed to read_csv as `chunksize`).
# For reference, the chart-events database has 329499788 lines.
DATA_SIZE = 6_000_000

# Output name: minute-resolution timestamp followed by the chunk size in
# scientific notation.
FILENAME = "".join([datetime.now().strftime('%Y%m%d%H%M'), f'_{DATA_SIZE:.0e}'])
OUTPUT_ADDRESS = f"data/{FILENAME}.csv"

print(f"file name: {FILENAME}")

In [None]:
# Vital signs used as features, keyed by display name with their itemid.
# "Temperature Celsius" and "Temperature Fahrenheit" are intentionally dropped.
_VITAL_ITEMIDS = {
    "Heart Rate": 220045,
    "Respiratory Rate": 220210,
    "Non Invasive Blood Pressure systolic": 220179,
    "Non Invasive Blood Pressure diastolic": 220180,
    "O2 saturation pulseoxymetry": 220277,
}

# Names of the vitals to keep, in column order.
mark = tuple(_VITAL_ITEMIDS)

# Name -> itemid lookup for every item this notebook refers to: the
# cardiac-arrest event first, then the vitals above.
itemid_map = {"Cardiac Arrest": 225466, **_VITAL_ITEMIDS}

In [None]:
# Read only the three columns needed to locate cardiac-arrest procedure events;
# `starttime` is parsed as a datetime so it can be compared with chart times.
procedure_events_df = pd.read_csv(
    PROCEDURE_EVENTS_ADDRESS,
    usecols=["hadm_id", "starttime", "itemid"],
    parse_dates=["starttime"],
    engine="python",
    encoding="unicode_escape",
)

# Rows whose itemid is the "Cardiac Arrest" id, reduced to (hadm_id, starttime).
Arrested_patient = procedure_events_df.loc[
    lambda events: events["itemid"] == itemid_map["Cardiac Arrest"]
].drop(columns="itemid")

# Show a random sample of five arrest events as the cell output.
Arrested_patient.sample(5)

In [None]:
def _init(df):
    condition = df["itemid"].isin([itemid_map[i] for i in mark])
    df = df.loc[condition]
    df.loc[:, 'value'] = pd.to_numeric(df['value'], errors="coerce")
    df = df[df['value']!=0]
    # df.replace('0.00', np.nan, inplace=True
    # df.replace('', np.nan, inplace=True)
    # df.replace(' ', np.nan, inplace=True)
    # df.dropna(inplace=True)

    df.loc[:, 'charttime'] = df['charttime'].dt.round("10min")
    return df
def _pivot_table(df):
    df = df.dropna(thresh=3)
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    df.loc[df['value']==0, 'value'] = np.nan
    try:
        result = df.pivot_table(index=["hadm_id", "charttime"], columns="itemid", values="value")
    except TypeError:
        print("BAD")
        result = df.pivot_table(index=["hadm_id", "charttime"], aggfunc="mean", columns="itemid", values="value")
    result = result.dropna(thresh=3)
    return result
def _interpolated(df):
    result = df.groupby("hadm_id").apply(lambda x: x.interpolate()).reset_index(level=0, drop=True)
    result.dropna(how="any", inplace=True)
    return result
def _add_label(df):
    df['label'] = 0 # 0->None, 1->warning time, 2->other arrest time
    for idx in Arrested_patient.index:
        hadm_id = Arrested_patient['hadm_id'][idx]
        starttime = Arrested_patient['starttime'][idx]
        hadm_id_condition = df.index.get_level_values('hadm_id') == hadm_id
        starttime_condition = df.index.get_level_values('charttime') >= starttime - pd.DateOffset(hours=1, minutes=10)
        endtime_condition = df.index.get_level_values('charttime') <= starttime - pd.DateOffset(hours=0)
        df.loc[hadm_id_condition, 'label'] = 1
        df.loc[hadm_id_condition & starttime_condition & endtime_condition, 'label'] = 2
    return df

In [None]:
# Fixed seed so the under-sampled dataset is identical between runs.
RANDOM_STATE = 42

# Create the output directory up front: a missing directory would otherwise
# only surface in to_csv, after the whole database has been processed.
Path(OUTPUT_ADDRESS).parent.mkdir(parents=True, exist_ok=True)

# Per-chunk results are collected and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
balanced_chunks = []

# NOTE: every chunk is pivoted, interpolated and labelled independently, so an
# admission that straddles a chunk boundary is handled as two separate pieces.
with pd.read_csv(
    DATABASE_ADDRESS,
    usecols=["hadm_id", "charttime", "itemid", "value"],
    parse_dates=["charttime"],
    encoding="unicode_escape",
    chunksize=DATA_SIZE,
    low_memory=False,
) as reader:
    for chunk in tqdm(reader):
        chunk = _init(chunk)
        if len(chunk) == 0:
            continue
        chunk_pivot = _pivot_table(chunk)
        chunk_interpolated = _interpolated(chunk_pivot)
        chunk_labeled = _add_label(chunk_interpolated)

        # Rows of arrested admissions outside the warning window (label 1)
        # are neither positives nor clean negatives, so they are excluded.
        chunk_labeled = chunk_labeled[chunk_labeled['label'] != 1]

        # Positive class: rows inside a pre-arrest warning window (label 2).
        df_label = chunk_labeled["label"] == 2
        if not df_label.any():
            continue

        df_ = chunk_labeled.drop(columns="label")
        if df_label.all():
            # Only positives left: RandomUnderSampler rejects a single-class
            # target, and there is nothing to balance against, so keep as is.
            df_resample, df_label_resample = df_.copy(), df_label
        else:
            sampler = RandomUnderSampler(random_state=RANDOM_STATE)
            df_resample, df_label_resample = sampler.fit_resample(df_, df_label)
        df_resample["label"] = df_label_resample
        balanced_chunks.append(df_resample)

df = pd.concat(balanced_chunks) if balanced_chunks else pd.DataFrame()
df.to_csv(OUTPUT_ADDRESS, float_format='%.2f', index=False)