In [1]:
import pandas as pd
import datetime
from tqdm import tqdm
import numpy as np

from imblearn.under_sampling import RandomUnderSampler

In [2]:
DATABASE_ADDRESS = "../Data/raw/chartevents.csv"
PROCEDURE_EVENTS_ADDRESS = "../Data/raw/procedureevents.csv"
CODE_MAPPING_ADDRESS = "../Data/raw/code_mappings.csv"
DATA_SIZE = 15000000

FILENAME = datetime.datetime.now().strftime('%Y%m%d%H%M') + f'_{DATA_SIZE:.0e}'
OUTPUT_ADDRESS = f"data/{FILENAME}.csv"

print(f"file name: {FILENAME}")

file name: 202403141348_2e+07


In [3]:
mark = ("Heart Rate", \
        "Respiratory Rate", \
        "Non Invasive Blood Pressure systolic", \
        "Non Invasive Blood Pressure diastolic", \
        "O2 saturation pulseoxymetry")
         # drop "Temperature Celsius", "Temperature Fahrenheit"
itemid_map = {  "Cardiac Arrest"                        : 225466, \
                "Heart Rate"                            : 220045, \
                "Respiratory Rate"                      : 220210, \
                "Non Invasive Blood Pressure systolic"  : 220179, \
                "Non Invasive Blood Pressure diastolic" : 220180, \
                "O2 saturation pulseoxymetry"           : 220277}

###### 20大概需要14分鐘
###### 1需要30.5MB
###### 大檔案有 329,499789行

In [4]:
procedure_events_df = pd.read_csv(PROCEDURE_EVENTS_ADDRESS, usecols=["hadm_id", "starttime", "itemid"], parse_dates=["starttime"], engine="python", encoding="unicode_escape")
Arrested_patient = procedure_events_df[procedure_events_df["itemid"]==itemid_map["Cardiac Arrest"]].drop(columns="itemid")

In [5]:
Arrested_patient

Unnamed: 0,hadm_id,starttime
4869,21758160,2131-03-31 03:20:00
5606,28621351,2113-05-03 09:10:00
6862,24697159,2186-05-17 22:17:00
8543,20108756,2161-01-04 00:04:00
10916,28411958,2126-06-18 10:29:00
...,...,...
685185,23580066,2148-06-15 17:58:00
686571,24776727,2124-04-24 17:00:00
686572,24196469,2171-12-10 19:50:00
686573,23717261,2166-04-12 22:07:00


In [6]:
FILENAME

'202403141348_2e+07'

In [7]:
def _init(df):
    try:
        df.replace(' ', np.nan, inplace=True)
        df.replace('', np.nan, inplace=True)
        df.dropna(inplace=True)
        df['value'].astype(float)
    except ValueError:
        df = df[df.map(np.isreal).all(1)]
    
    df['charttime'] = df['charttime'].dt.round("10min")
    df = df[df["itemid"].isin([itemid_map[i] for i in mark])]
    return df
def _pivot_table(df):
    result = df.pivot_table(index=["hadm_id", "charttime"], columns="itemid", values="value")
    result.dropna(thresh=3, inplace=True)
    return result
def _interpolated(df):
    result = df.groupby("hadm_id").apply(lambda x: x.interpolate()).reset_index(level=0, drop=True)
    result.dropna(how="any", inplace=True)
    return result
def _add_label(df):
    df['label'] = False 
    for idx in Arrested_patient.index:
        hadm_id = Arrested_patient['hadm_id'][idx]
        starttime = Arrested_patient['starttime'][idx]
        hadm_id_condition = df.index.get_level_values('hadm_id') == hadm_id
        starttime_condition = df.index.get_level_values('charttime') >= starttime
        endtime_condition = df.index.get_level_values('charttime') <= starttime -pd.DateOffset(hour=4)

        df.loc[hadm_id_condition & starttime_condition & endtime_condition, 'label'] = True
    return df

In [8]:
cnt = 0
df = pd.DataFrame()
brk1 = []
brk2 = []
last_cnt = -1
T = None
with pd.read_csv(   DATABASE_ADDRESS, \
                    usecols=["hadm_id", "charttime", "itemid", "value"], \
                    parse_dates=["charttime"], \
                    engine="python", \
                    encoding="unicode_escape", \
                    chunksize=DATA_SIZE) as reader:
    for chunk in tqdm(reader):
        T = chunk
        chunk = _init(chunk)
        if len(chunk) == 0:
            continue
        chunk_pivot = _pivot_table(chunk)
        chunk_interpolated = _interpolated(chunk_pivot)
        chunk_labeled = _add_label(chunk_interpolated)

        if chunk_labeled["label"].any():
            df_ = chunk_labeled.drop(columns="label")
            df_label = chunk_labeled["label"]
            df_resample, df_label_resample = RandomUnderSampler().fit_resample(df_, df_label)
            df_resample["label"] = df_label_resample
            df = pd.concat([df, df_resample])
            
        # cnt += 1
        # if cnt > 5:
        #     break
df.to_csv(OUTPUT_ADDRESS, float_format='%.2f', index=0)

4it [56:20, 901.85s/it]

In [None]:
df