In [2]:
import pandas as pd
import datetime
from tqdm import tqdm
from time import sleep
import numpy as np
import matplotlib


from imblearn.under_sampling import RandomUnderSampler

In [3]:
DATABASE_ADDRESS = "../Data/raw/chartevents.csv"
PROCEDURE_EVENTS_ADDRESS = "../Data/raw/procedureevents.csv"
CODE_MAPPING_ADDRESS = "../Data/raw/code_mappings.csv"
DATA_SIZE = 1000000

FILENAME = datetime.datetime.now().strftime('%Y%m%d') + f'_{DATA_SIZE:.0e}'
OUTPUT_FILLED_ADDRESS = f"../Data/Preprocessed/chartevents_{FILENAME}_filled.csv"
OUTPUT_INTERPOLATED_ADDRESS = f"../Data/Preprocessed/chartevents_{FILENAME}_interpolated.csv"
OUTPUT_LABELED_ADDRESS = f"../Data/Preprocessed/chartevents_{FILENAME}_labeled.csv"

print(f"file name: {FILENAME}")

file name: 20240310_1e+06


In [4]:
mark = ("Heart Rate", \
        "Respiratory Rate", \
        "Non Invasive Blood Pressure systolic", \
        "Non Invasive Blood Pressure diastolic", \
        "O2 saturation pulseoxymetry")
         # drop "Temperature Celsius", "Temperature Fahrenheit"
itemid_map = {  "Cardiac Arrest"                        : 225466, \
                "Heart Rate"                            : 220045, \
                "Respiratory Rate"                      : 220210, \
                "Non Invasive Blood Pressure systolic"  : 220179, \
                "Non Invasive Blood Pressure diastolic" : 220180, \
                "O2 saturation pulseoxymetry"           : 220277}

###### 20大概需要14分鐘
###### 1需要30.5MB
###### 大檔案有 329,499789行

In [5]:
procedure_events_df = pd.read_csv(PROCEDURE_EVENTS_ADDRESS, usecols=["hadm_id", "starttime", "itemid"], parse_dates=["starttime"], engine="python", encoding="unicode_escape")
Arrested_patient = procedure_events_df[procedure_events_df["itemid"]==itemid_map["Cardiac Arrest"]].drop(columns="itemid")

In [6]:
Arrested_patient

Unnamed: 0,hadm_id,starttime
4869,21758160,2131-03-31 03:20:00
5606,28621351,2113-05-03 09:10:00
6862,24697159,2186-05-17 22:17:00
8543,20108756,2161-01-04 00:04:00
10916,28411958,2126-06-18 10:29:00
...,...,...
685185,23580066,2148-06-15 17:58:00
686571,24776727,2124-04-24 17:00:00
686572,24196469,2171-12-10 19:50:00
686573,23717261,2166-04-12 22:07:00


In [7]:
FILENAME

'20240310_1e+06'

In [15]:
cnt = 0
brk1 = []
brk2 = []
last_cnt = -1
df = pd.DataFrame()
df_label = pd.DataFrame()
T = None
with pd.read_csv(DATABASE_ADDRESS, usecols=["hadm_id", "charttime", "itemid", "value"], parse_dates=["charttime"], engine="python", encoding="unicode_escape", chunksize=DATA_SIZE) as reader:
    for chunk in tqdm(reader):
        T= chunk

        try:
            if cnt > last_cnt:
                chunk.replace(' ', np.nan, inplace=True)
                chunk.dropna(inplace=True)
                chunk["value"].astype(float)
                chunk['charttime'] = chunk['charttime'].dt.round("10min")
                chunk = chunk[chunk["itemid"].isin([itemid_map[i] for i in mark])]
                chunk_pivot = chunk.pivot_table(index=["hadm_id", "charttime"], columns="itemid", values="value")
                chunk_pivot.dropna(thresh=3, inplace=True)
                chunk_interpolated = chunk_pivot.groupby("hadm_id").apply(lambda x: x.interpolate()).reset_index(level=0, drop=True)
                chunk_interpolated.dropna(how="any", inplace=True)

                chunk_interpolated['label'] = False
                for index, row in Arrested_patient.iterrows():
                    hadm_id = row['hadm_id']
                    starttime = row['starttime']
                    hadm_id_condition = chunk_interpolated.index.get_level_values('hadm_id') == hadm_id
                    starttime_condition = chunk_interpolated.index.get_level_values('charttime') >= starttime
                    endtime_condition = chunk_interpolated.index.get_level_values('charttime') <= starttime -pd.DateOffset(hour=2)

                    chunk_interpolated.loc[hadm_id_condition & starttime_condition & endtime_condition, 'label'] = True
                
                df_under, df_under_label = chunk_interpolated.drop(columns="label"), chunk_interpolated["label"]
                df_under_resample, df_under_label_resample = RandomUnderSampler(sampling_strategy='majority').fit_resample(df_under, df_under_label)
                df_under_resample["label"] = df_under_label_resample
                df = pd.concat([df, df_under_resample])
        except ValueError:
            # print("No label at", str(cnt))
            cnt += 1
            brk1.append(cnt)
            sleep(2)
            continue
        except TypeError:
            print("ERROR at", str(cnt))
            cnt += 1
            brk2.append(cnt)
            sleep(2)
            continue
        cnt += 1
        if cnt > 150:
            break
df.to_csv(f"../Data/resampled/{FILENAME}.csv", float_format='%.2f', index=0)

  for obj in iterable:
166it [16:24,  5.93s/it]


In [12]:
cnt

167

In [19]:
set(range(1, 168)) - set(brk1)

{9, 10, 12, 47, 48, 88, 95, 131, 132, 134, 167}

In [13]:
brk2

156

In [36]:
df

Unnamed: 0_level_0,itemid,220045,220179,220180,220210,220277,label
hadm_id,charttime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
28683663,2150-04-22 16:30:00,111.0,110.000000,62.000000,29.0,96.0,False
26326357,2144-10-27 19:00:00,117.0,94.000000,59.000000,28.0,96.0,False
24444070,2182-02-24 13:00:00,74.0,143.000000,86.000000,39.0,97.0,False
25482501,2179-07-27 21:00:00,83.0,109.000000,70.000000,15.0,100.0,False
27409048,2155-06-02 17:20:00,106.0,70.096774,40.983871,21.0,94.0,False
...,...,...,...,...,...,...,...
23911300,2185-04-14 02:00:00,132.0,140.000000,68.000000,14.0,51.0,True
23911300,2185-04-14 03:00:00,106.0,140.000000,68.000000,14.0,99.0,True
23911300,2185-04-14 03:50:00,90.0,140.000000,68.000000,14.0,99.0,True
23911300,2185-04-14 04:00:00,74.0,140.000000,68.000000,14.0,98.0,True


In [35]:
df["label"].value_counts()

label
False    193
True     193
Name: count, dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 32 entries, (27285076, Timestamp('2140-08-28 18:00:00')) to (25355565, Timestamp('2192-09-22 04:00:00'))
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   220045  32 non-null     float64
 1   220179  32 non-null     float64
 2   220180  32 non-null     float64
 3   220210  32 non-null     float64
 4   220277  32 non-null     float64
 5   label   32 non-null     bool   
dtypes: bool(1), float64(5)
memory usage: 691.9 KB


In [34]:
df["label"].value_counts()

label
False    193
True     193
Name: count, dtype: int64

In [None]:
cnt = 0
last_cnt = -1
df = pd.DataFrame()
df_label = pd.DataFrame()
DATA_SIZE = 100
with pd.read_csv(DATABASE_ADDRESS, usecols=["hadm_id", "charttime", "itemid", "value"], parse_dates=["charttime"], engine="python", encoding="unicode_escape", chunksize=DATA_SIZE) as reader:
    chunk.replace(' ', np.nan, inplace=True)
    chunk.dropna(inplace=True)
    for chunk in tqdm(reader):
        print(chunk["value"].dtype)
        
        cnt += 1
        if cnt > 50000:
            break
        

In [11]:
cnt

35

In [38]:
df["value"][50:60]

4167350     76
4167351     17
4167352     96
4167353     86
4167354    111
4167355     57
4167356     74
4167357       
4167358     61
4167359     70
Name: value, dtype: object

In [53]:
df2 = df["value"]

In [56]:
df2.replace(' ', np.nan, inplace=True)

In [57]:
df2[50:60]

4167350     76
4167351     17
4167352     96
4167353     86
4167354    111
4167355     57
4167356     74
4167357    NaN
4167358     61
4167359     70
Name: value, dtype: object

In [None]:
procedure_events_df = pd.read_csv(PROCEDURE_EVENTS_ADDRESS, usecols=["hadm_id", "starttime", "itemid"], parse_dates=["starttime"], engine="python", encoding="unicode_escape")
df_labeled_df = df_interpolated
Arrested_patient = procedure_events_df[procedure_events_df["itemid"]==itemid_map["Cardiac Arrest"]].drop(columns="itemid")

In [None]:
df_labeled_df['label'] = False
for index, row in Arrested_patient.iterrows():
    hadm_id = row['hadm_id']
    starttime = row['starttime']
    # print(f"hadm_id: {hadm_id}, starttime: {starttime}")
    hadm_id_condition = df_interpolated.index.get_level_values('hadm_id') == hadm_id
    starttime_condition = df_interpolated.index.get_level_values('charttime') >= starttime
    endtime_condition = df_interpolated.index.get_level_values('charttime') <= starttime -pd.DateOffset(hour=4)
    
    df_labeled_df.loc[hadm_id_condition & starttime_condition & endtime_condition, 'label'] = True
df_labeled_df.to_csv(OUTPUT_LABELED_ADDRESS, float_format='%.1f', index=0)
df_labeled_df['label'].value_counts()

label
False    587089
True         17
Name: count, dtype: int64