In [None]:
import pandas as pd
from datetime import datetime
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# --- Inputs -----------------------------------------------------------------
# Raw CSV locations, relative to the notebook's working directory.
DATABASE_ADDRESS = "../Data/raw/chartevent_part1.csv"
PROCEDURE_EVENTS_ADDRESS = "../Data/raw/procedureevents.csv"
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
CODE_MAPPING_ADDRESS = "../Data/raw/code_mappings.csv"

# Maximum number of chart-event rows to read (passed to read_csv as nrows).
DATA_SIZE = 1_000_000

# --- Outputs ----------------------------------------------------------------
# Output files are tagged with today's date and the row budget, e.g. "<YYYYMMDD>_1e+06".
_run_date = datetime.now().strftime('%Y%m%d')
FILENAME = f'{_run_date}_{DATA_SIZE:.0e}'
OUTPUT_FILLED_ADDRESS = f"../Data/Preprocessed/chartevents_{FILENAME}_filled.csv"
OUTPUT_LABELED_ADDRESS = f"../Data/Preprocessed/chartevents_{FILENAME}_labeled.csv"

print(f"file name: {FILENAME}")

In [None]:
# Vital-sign labels kept from the chart events (used to filter the itemid column).
mark = (
    "Heart Rate",
    "Respiratory Rate",
    "Non Invasive Blood Pressure systolic",
    "Non Invasive Blood Pressure diastolic",
    "O2 saturation pulseoxymetry",
)

# Numeric itemid of the procedure event used as the positive label.
item_id_map = {
    "Cardiac Arrest": 225466,
}

# Notes from earlier runs (the bare numbers presumably refer to DATA_SIZE in
# millions of rows - confirm):
#   - 20 takes roughly 14 minutes
#   - 1 needs 30.5 MB
#   - the full file has 329,499,789 rows

In [None]:
%%time 
# Read at most DATA_SIZE rows, keeping only the four columns used downstream;
# charttime is parsed into datetimes while reading.
df = pd.read_csv(
    DATABASE_ADDRESS,
    usecols=["hadm_id", "charttime", "itemid", "value"],
    parse_dates=["charttime"],
    engine="python",
    encoding="unicode_escape",
    nrows=DATA_SIZE,
)
df.info()

In [None]:
# Snap every measurement onto a 10-minute grid so that readings taken close
# together share one timestamp.
df["charttime"] = df["charttime"].dt.round("10min")

# Keep only the rows whose itemid is one of the tracked vitals.
# NOTE(review): `mark` holds label strings, so this only matches if the itemid
# column of the CSV already contains labels rather than numeric ids - confirm.
is_tracked_vital = df["itemid"].isin(mark)
df = df[is_tracked_vital]
df.sample(5)

In [None]:
# Reshape to one row per (hadm_id, charttime) with one column per itemid.
# NOTE(review): pivot_table aggregates with the mean by default, which assumes
# the `value` column is numeric - confirm against the raw CSV.
df_pivot = df.pivot_table(index=["hadm_id", "charttime"], columns="itemid", values="value")
# Keep only the time slots that have at least 3 non-missing measurements.
df_pivot = df_pivot.dropna(thresh=3)

# Fill the remaining gaps by interpolating within each admission, so values
# never leak from one hadm_id into another.
# group_keys=False keeps the original (hadm_id, charttime) index on every
# pandas version. The previous `apply(...).reset_index(level=0, drop=True)`
# only gave that index on pandas >= 2.0; on older versions the group key was
# not prepended and the reset dropped hadm_id itself.
df_interpolated = (
    df_pivot
    .groupby(level="hadm_id", group_keys=False)
    .apply(lambda admission: admission.interpolate())
)
# Rows that are still incomplete (e.g. gaps before an admission's first reading) are dropped.
df_interpolated = df_interpolated.dropna(how="any")

# NOTE(review): index=False means hadm_id / charttime are NOT written to the
# CSV, only the measurement columns - confirm this is what consumers expect.
df_interpolated.to_csv(OUTPUT_FILLED_ADDRESS, float_format='%.2f', index=False)
df_interpolated.sample(5)

In [None]:
# Display the interpolated frame for a visual check.
df_interpolated

In [None]:
# Load the procedure events; only the admission id, the start time and the
# item id are needed to locate cardiac-arrest events.
procedure_events_df = pd.read_csv(
    PROCEDURE_EVENTS_ADDRESS,
    usecols=["hadm_id", "starttime", "itemid"],
    parse_dates=["starttime"],
    engine="python",
    encoding="unicode_escape",
)

# Work on a copy: a plain assignment would alias df_interpolated, so adding the
# `label` column later would silently modify the interpolated frame as well.
df_labeled_df = df_interpolated.copy()

# One row per cardiac-arrest procedure event, reduced to (hadm_id, starttime).
is_cardiac_arrest = procedure_events_df["itemid"] == item_id_map["Cardiac Arrest"]
Arrested_patient = procedure_events_df[is_cardiac_arrest].drop(columns="itemid")

In [None]:
# Mark as positive every chart row that falls inside the window leading up to
# a recorded cardiac arrest of the same admission:
#     starttime - LABEL_WINDOW <= charttime <= starttime
#
# The previous version required charttime >= starttime AND
# charttime <= starttime - pd.DateOffset(hour=4). `hour=` (singular) replaces
# the hour field with 4 instead of shifting by four hours, and the two bounds
# were the wrong way round, so the window was empty or meaningless.
# NOTE(review): assumes the intent is "the 4 hours before the arrest" - confirm.
LABEL_WINDOW = pd.Timedelta(hours=4)

df_labeled_df['label'] = False

# The index does not change inside the loop, so extract its levels only once.
row_hadm_ids = df_labeled_df.index.get_level_values('hadm_id')
row_charttimes = df_labeled_df.index.get_level_values('charttime')

for arrest in Arrested_patient.itertuples(index=False):
    window_start = arrest.starttime - LABEL_WINDOW
    in_window = (
        (row_hadm_ids == arrest.hadm_id)
        & (row_charttimes >= window_start)
        & (row_charttimes <= arrest.starttime)
    )
    df_labeled_df.loc[in_window, 'label'] = True

# NOTE(review): index=False means hadm_id / charttime are NOT written to the
# CSV, only the measurement columns and the label - confirm this is intended.
df_labeled_df.to_csv(OUTPUT_LABELED_ADDRESS, float_format='%.2f', index=False)
df_labeled_df['label'].value_counts()

In [None]:
# Separate the feature columns from the target column.
df_under = df_labeled_df.drop(columns="label")
df_under_label = df_labeled_df["label"]

# Randomly under-sample the majority class. The seed is fixed so that a
# Restart & Run All reproduces the same resampled rows; without it every run
# drew a different subset.
under_sampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
df_under_resample, df_under_label_resample = under_sampler.fit_resample(df_under, df_under_label)

In [None]:
# Print a summary of the resampled feature frame.
df_under_resample.info()