In [1]:
import pandas as pd
from time import mktime, strptime
import datetime

In [2]:
# Input files (paths relative to the notebook) and the row cap used when
# reading the chart events file.
DATABASE_ADDRESS = "../Data/Raw/chartevents.csv"
PROCEDURE_EVENTS_ADDRESS = "../Data/Raw/procedureevents.csv"
CODE_MAPPING_ADDRESS = "../Data/Raw/code_mappings.csv"
DATA_SIZE = 2_000_000  # passed as `nrows` when loading chartevents.csv

# Author's sizing notes, translated. The leading numbers are presumably the
# multiplier in millions of rows -- TODO confirm.
#   * 20 takes roughly 14 minutes
#   * 1 needs 30.5 MB
#   * the full file has 329,499,789 rows (written "329,499789" in the original note)

In [3]:
# Tag used in every output file name: today's date as YYYYMMDD, an underscore,
# then the row cap in scientific notation.
# NOTE(review): the tag depends on the current date, so cells executed on
# different days will read/write differently named files -- confirm this is intended.
FILENAME = f"{datetime.datetime.now():%Y%m%d}_{DATA_SIZE:.0e}"
FILENAME

'20230710_2e+06'

In [5]:
# Empty placeholders; `mark` is given its real contents in a later cell.
item = {}
mark = ()

In [6]:
# Load the item-code table and build lookups in both directions between the
# numeric `itemid` and its text `label`.
df = pd.read_csv(
    CODE_MAPPING_ADDRESS,
    engine="python",
    encoding="unicode_escape",
)
itemids, labels = df["itemid"], df["label"]
id_to_label = dict(zip(itemids, labels))
label_to_id = dict(zip(labels, itemids))

In [7]:
# Labels of the measurements kept as columns in the pivoted table.
# The parentheses already continue the expression, so no backslashes are needed.
mark = (
    "Heart Rate",
    "Respiratory Rate",
    "Non Invasive Blood Pressure systolic",
    "Non Invasive Blood Pressure diastolic",
    "O2 saturation pulseoxymetry",
    "Temperature Celsius",
    "Temperature Fahrenheit",
)

In [8]:
%%time 
df = pd.read_csv(DATABASE_ADDRESS, usecols=["hadm_id", "charttime", "itemid", "value"], engine="python", encoding="unicode_escape", nrows=DATA_SIZE)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   hadm_id    int64  
 1   charttime  object 
 2   itemid     int64  
 3   value      float64
dtypes: float64(1), int64(2), object(1)
memory usage: 61.0+ MB
CPU times: user 6.38 s, sys: 277 ms, total: 6.65 s
Wall time: 6.67 s


In [9]:
# Parse charttime and snap each reading to the nearest 10-minute mark, so that
# readings taken around the same time share one (hadm_id, charttime) key when
# the rows are grouped later.
# Use the datetime accessor: plain `Series.round` is specified for a numeric
# number of decimals, not for a frequency string.
df["charttime"] = pd.to_datetime(df["charttime"]).dt.round("10min")

In [10]:
# Keep only rows whose itemid is one of the seven codes of interest
# (presumably the ids behind the labels in `mark` -- verify against code_mappings.csv).
df = df.loc[
    df["itemid"].isin([220045, 220210, 220179, 220180, 220277, 223762, 223761])
]

In [11]:
# Show row count, dtypes and memory use after the datetime conversion and itemid filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 604291 entries, 12 to 1999999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   hadm_id    604291 non-null  int64         
 1   charttime  604291 non-null  datetime64[ns]
 2   itemid     604291 non-null  int64         
 3   value      604291 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 23.1 MB


In [12]:
# Container class that holds the values of one patient record.
class Patient:
    """One output row: admission id, chart time and one slot per measurement label.

    Every slot starts as ``None`` and is filled in by the caller through
    ``self.data``. The dict is created per instance; there is deliberately no
    class-level ``data`` attribute, because a mutable class attribute would be
    shared by all instances.

    Args:
        labels: Measurement labels to create slots for. Defaults to the
            notebook-level ``mark`` tuple, which matches the original behaviour.
    """

    def __init__(self, labels=None):
        # Remember the labels so `allna` checks exactly the slots created here.
        self.labels = tuple(mark if labels is None else labels)
        self.data = {"hadm_id": None, "charttime": None}
        self.data.update(dict.fromkeys(self.labels))

    def allna(self):
        """Return True when no measurement slot has been filled yet.

        Uses an identity check (``is None``) so that values with their own
        ``__eq__`` (e.g. ``pd.NA`` or arrays) cannot break the test.
        """
        return all(self.data[label] is None for label in self.labels)

In [13]:
# Pivot the long-format chart events into one record per (hadm_id, charttime):
# each measurement label becomes a column, and the result is written to CSV.
output = []
for (hadm_id_, charttime_), group in df.groupby(["hadm_id", "charttime"]):
    patient = Patient()

    # If the same item appears more than once in a group, the last value wins.
    for id_, value_ in zip(group.itemid, group.value):
        patient.data[id_to_label[id_]] = value_
    patient.data["hadm_id"], patient.data["charttime"] = hadm_id_, charttime_

    # A Fahrenheit reading, when present, is converted and stored as Celsius
    # (overwriting any Celsius reading in the same group).
    fahrenheit = patient.data["Temperature Fahrenheit"]
    if fahrenheit is not None:
        patient.data["Temperature Celsius"] = round((float(fahrenheit) - 32) * 5 / 9, 1)
    output.append(patient.data)

# Fix the column order explicitly, then drop the Fahrenheit column now that it
# has been folded into Celsius.
merged_df = pd.DataFrame(
    output, columns=['hadm_id', 'charttime'] + list(mark)
).drop(columns=["Temperature Fahrenheit"])
merged_df.to_csv(
    f"../Data/Preprocessed/chartevents_{FILENAME}_pivot.csv",
    float_format='%.2f',
    index=False,
)

In [14]:
# df.to_csv("chart_test3_merge_new.csv", float_format='%.2f', index=0)
# Sanity check: the list of row dicts and the frame built from it should have the same length.
print("amount: ", len(output))
print("amount: ", len(merged_df))
# merged_df

amount:  140901
amount:  140901


In [15]:
# Spot-check five random rows of the long-format frame (unseeded, so the rows differ per run).
df.sample(5)

Unnamed: 0,hadm_id,charttime,itemid,value
1598046,27353703,2161-01-18 09:00:00,220179,93.0
668336,27762266,2140-10-05 11:00:00,220277,92.0
841809,26015355,2122-07-28 20:30:00,220045,112.0
843898,27444036,2129-05-05 13:00:00,220180,113.0
392414,27900881,2136-07-12 13:00:00,220045,82.0


In [16]:
# Reload the pivoted table and fill gaps within each admission: first backward
# (next known value), then forward (last known value) for anything still missing
# at the end of an admission. Values never cross from one hadm_id to another.
df = pd.read_csv(
    f"../Data/Preprocessed/chartevents_{FILENAME}_pivot.csv",
    engine="python",
    encoding="unicode_escape",
    parse_dates=["charttime"],
)
# df.dropna(thresh=5, inplace=True)

# Grouped bfill/ffill replaces the per-group `interpolate(method=..., inplace=True)`
# plus `df.update(group)` loop: same result, without the deprecated interpolate
# methods and without rewriting the whole frame once per admission.
# NOTE(review): backward fill copies later readings into earlier rows -- confirm
# that is acceptable for how the filled table is used downstream.
fill_columns = [column for column in df.columns if column != "hadm_id"]
df[fill_columns] = df.groupby("hadm_id")[fill_columns].bfill()
df[fill_columns] = df.groupby("hadm_id")[fill_columns].ffill()

df.to_csv(
    f"../Data/Preprocessed/chartevents_{FILENAME}_filled.csv",
    float_format='%.1f',
    index=False,
)

In [17]:
# Show how many values are still missing per column after the per-admission fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140901 entries, 0 to 140900
Data columns (total 8 columns):
 #   Column                                 Non-Null Count   Dtype         
---  ------                                 --------------   -----         
 0   hadm_id                                140901 non-null  int64         
 1   charttime                              140901 non-null  datetime64[ns]
 2   Heart Rate                             140901 non-null  float64       
 3   Respiratory Rate                       140901 non-null  float64       
 4   Non Invasive Blood Pressure systolic   140692 non-null  float64       
 5   Non Invasive Blood Pressure diastolic  140692 non-null  float64       
 6   O2 saturation pulseoxymetry            140897 non-null  float64       
 7   Temperature Celsius                    140604 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(1)
memory usage: 8.6 MB


In [18]:
def read_csv_function(road: str) -> pd.core.frame.DataFrame:
    """Print which file is being loaded, then load it as a DataFrame.

    Args:
        road: Path of the CSV file to read.

    Returns:
        The file contents, parsed with pandas' ``python`` engine.
    """
    print("Reading", road, "...")
    frame = pd.read_csv(road, engine='python')
    return frame
def to_timestamp(s: str):
    """Convert a date-time string to an integer timestamp via ``time.mktime``.

    The string is tried against three layouts in order:
    ``%Y-%m-%d %H:%M:%S``, ``%Y-%m-%d %H:%M`` and ``%Y/%m/%d %H:%M``.

    Args:
        s: The date-time text to parse.

    Returns:
        The ``mktime`` result for the parsed time, truncated to ``int``.

    Raises:
        ValueError: If the last layout also fails.
    """
    layouts = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y/%m/%d %H:%M")
    # Every layout but the last may fail quietly and fall through to the next.
    for layout in layouts[:-1]:
        try:
            return int(mktime(strptime(s, layout)))
        except ValueError:
            continue
    # Last chance: let a ValueError from here reach the caller.
    return int(mktime(strptime(s, layouts[-1])))

In [19]:
# Label each row of the filled chart table: True when the target procedure
# starts between 0 and 14400 seconds (4 hours) after that row's charttime
# for the same admission.
TARGET_ITEMID = 225466            # procedureevents itemid that defines a positive label
LOOKAHEAD_SECONDS = 4 * 60 * 60   # 14400-second window after charttime

# Collect the start times of the target procedure per admission.
procedures = read_csv_function(PROCEDURE_EVENTS_ADDRESS)
procedures["starttime"] = pd.to_datetime(procedures["starttime"])
target = procedures.loc[procedures["itemid"] == TARGET_ITEMID, ["hadm_id", "starttime"]]
dic = {}
for hadm_id_, start_ in zip(target["hadm_id"], target["starttime"]):
    dic.setdefault(hadm_id_, []).append(start_)

df = read_csv_function(f"../Data/Preprocessed/chartevents_{FILENAME}_filled.csv")
df["charttime"] = pd.to_datetime(df["charttime"])

# Iterate the two columns directly instead of calling `df.iloc[i]` several
# times per row; each `iloc` call builds a full row Series and dominates runtime.
# The difference is truncated to whole seconds before the window comparison.
ans = [
    any(
        0 <= int((start_ - charttime_).total_seconds()) <= LOOKAHEAD_SECONDS
        for start_ in dic.get(int(hadm_id_), ())
    )
    for hadm_id_, charttime_ in zip(df["hadm_id"], df["charttime"])
]

df["label"] = ans
df.to_csv(
    f"../Data/Preprocessed/chartevents_{FILENAME}_labeled.csv",
    float_format='%.2f',
    index=False,
)

Reading ../Data/Raw/procedureevents.csv ...
Reading ../Data/Preprocessed/chartevents_20230709_2e+06_filled.csv ...


In [20]:
# Count labelled rows per class to show how imbalanced the label is.
df["label"].value_counts()

label
False    140889
True         12
Name: count, dtype: int64