## Import package

In [1]:
import pandas as pd
import os
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import gc
from multiprocessing import Process

pd.options.mode.chained_assignment = None

In [2]:
preprocess_path = '../data/mimiciii/preprocess_data/'
temporal_path = '../data/mimiciii/temporal_dataset/'

## Load Data

In [3]:
datas = list()
paths = sorted(os.listdir(preprocess_path))
for path in tqdm(paths):
    datas.append(pd.read_csv(preprocess_path + path))

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:51<00:00,  1.89s/it]


# Look Data characteristics

In [4]:
for i in tqdm(range(len(datas))):
    for column in datas[i].columns:
        if 'TIME' in column or 'DATE' in column or 'DOB' == column or 'DOD' == column or 'DOD_HOSP' == column or 'DOD_SSN' == column:
            datas[i][column] = datas[i][column].apply(lambda x : pd.Timestamp(x))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [03:23<00:00,  7.55s/it]


In [5]:
i = paths.index('D_ICD_DIAGNOSES.csv')
D_ICD_DIAGNOSES = datas[i].copy()

i = paths.index('DIAGNOSES_ICD.csv')
DIAGNOSES_ICD = datas[i].copy()

i = paths.index('ICUSTAYS.csv')
ICUSTAYS = datas[i].copy()

i = paths.index('PATIENTS.csv')
PATIENTS = datas[i].copy()

i = paths.index('ADMISSIONS.csv')
ADMISSIONS = datas[i].copy()

# Load dataset

In [6]:
dataset = pd.read_csv('../data/mimiciii/patient.csv')

## Add gender and age to dataset

In [7]:
dataset['DOB'] = np.nan
dataset['Gender'] = np.nan

for i in PATIENTS.index:
    subject_id = PATIENTS['SUBJECT_ID'].loc[i]
    gender = PATIENTS['GENDER'].loc[i]
    dob = PATIENTS['DOB'].loc[i]
    index = dataset[dataset['SUBJECT_ID'] == subject_id].index
    dataset['DOB'].loc[index] = dob
    dataset['Gender'].loc[index] = gender
    
dataset['Gender'] = dataset['Gender'].apply(lambda x : 1 if x == 'M' else 0)

## Add death time, admit time, discharge time to dataset

In [8]:
dataset['DEATHTIME'] = np.nan
dataset['ADMITTIME'] = np.nan
dataset['DISCHTIME'] = np.nan

for i in ADMISSIONS.index:
    subject_id = ADMISSIONS['SUBJECT_ID'].loc[i]
    hadm_id = ADMISSIONS['HADM_ID'].loc[i]
    death_time = ADMISSIONS['DEATHTIME'].loc[i]
    admittime = ADMISSIONS['ADMITTIME'].loc[i]
    dischtime = ADMISSIONS['DISCHTIME'].loc[i]
    index = dataset.query(f'SUBJECT_ID == {subject_id} & HADM_ID == {hadm_id}').index
    dataset['DEATHTIME'].loc[index] = death_time
    dataset['ADMITTIME'].loc[index] = admittime
    dataset['DISCHTIME'].loc[index] = dischtime

## change time by ICUSTAY_ID in dataset

In [9]:
for i in ICUSTAYS.index:
    subject_id = ICUSTAYS['SUBJECT_ID'].loc[i]
    hadm_id = ICUSTAYS['HADM_ID'].loc[i]
    intime = ICUSTAYS['INTIME'].loc[i]
    outtime = ICUSTAYS['OUTTIME'].loc[i]
    index = dataset.query(f'SUBJECT_ID == {subject_id} & HADM_ID == {hadm_id}').index
    if dataset['ADMITTIME'].isnull()[index[0]]:
        dataset['ADMITTIME'].loc[index[0]] = intime
    elif dataset['ADMITTIME'].loc[index[0]] - intime > pd.Timedelta('00:00:00'):
        dataset['ADMITTIME'].loc[index[0]] = intime
    if dataset['DISCHTIME'].isnull()[index[0]]:
        dataset['DISCHTIME'].loc[index[0]] = outtime
    elif dataset['DISCHTIME'].loc[index[0]] - outtime < pd.Timedelta('00:00:00'):
        dataset['DISCHTIME'].loc[index[0]] = outtime

## Add re-admission to dataset

In [10]:
dataset['re_admission'] = np.nan

dataset['re_admission'].loc[0] = 0
for i in dataset.index[1:]:
    if dataset['SUBJECT_ID'].loc[i] == dataset['SUBJECT_ID'].loc[i - 1]:
        dataset['re_admission'].loc[i] = 1
    else:
        dataset['re_admission'].loc[i] = 0

## Add elixhauser to dataset (Use SID score)

In [11]:
DIAGNOSES_ICD.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,130,21,111970,1.0,388
1,131,21,111970,2.0,78552
2,132,21,111970,3.0,40391
3,133,21,111970,4.0,42731
4,134,21,111970,5.0,70709


In [12]:
elixhauser_ICD_CODE = pd.read_csv('../data/mimiciii/elixhauser_ICD_CODE.csv')
elixhauser_ICD_CODE.head()

Unnamed: 0,elixhauser instructions,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,aids,7700.0,8322,Nursemaid's elbow,Nursemaid's elbow
1,aids,12795.0,V5864,Long-term anti-inflamtry,Long-term (current) use of non-steroidal anti-...
2,aids,11997.0,V532,Adjustment hearing aid,Fitting and adjustment of hearing aid
3,alcohol abuse,2895.0,30500,Alcohol abuse-unspec,"Alcohol abuse, unspecified"
4,alcohol abuse,2896.0,30501,Alcohol abuse-continuous,"Alcohol abuse, continuous"


In [13]:
coef_dict = {
    'aids' : 0,
    'alcohol abuse' : 0,
    'blood loss anemias' : -3,
    'cardiac arrhythmias' : 8,
    'congestive heart failure' : 9,
    'chronic pulmonary' : 3,
    'coagulopathy' : 12,
    'deficiency anemias' : 0,
    'depression' : -5,
    'diabetes complicated' : 1,
    'diabetes uncomplicated' : 0,
    'drug abuse' : -11,
    'fluid electrolyte' : 11,
    'hypertension' : -2,
    'hypothyroidism' : 0,
    'liver disease' : 7,
    'lymphoma' : 8,
    'metastatic cancer' : 17,
    'other neurological' : 5,
    'obesity' : -5,
    'paralysis' : 4,
    'peptic ucler' : 0,
    'peripheral vascular' : 4,
    'psychosis' : -6,
    'pulmonary circulation' : 5,
    'renal failure' : 7,
    'rheumatoid arthritis' : 0,
    'solid tumor' : 10,
    'valvular_disease' : 0,
    'weight_loss' : 10
}

def get_elixhauser(series, elixhauser_ICD_CODE):
    subject_id = series['SUBJECT_ID']
    hadm_id = series['HADM_ID']
    icd_code_set = set(DIAGNOSES_ICD.query(f'SUBJECT_ID == {subject_id} & HADM_ID == {hadm_id}')['ICD9_CODE'].values)
    total = 0
    for key, value in coef_dict.items():
        disease_code_set = set(elixhauser_ICD_CODE[elixhauser_ICD_CODE['elixhauser instructions'] == key]['ICD9_CODE'].values)
        if len(icd_code_set.intersection(disease_code_set)) > 0:
            total += value
    return total
    
dataset['elixhauser'] = dataset.apply(get_elixhauser, axis=1, args=(elixhauser_ICD_CODE,))

# Split time

In [14]:
dataset.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
dataset['Age'] = np.nan

def split_time(hour, dataset):
    hour_period = f'0{hour}:00:00'
    index = 0
    split = pd.DataFrame(columns = dataset.columns)
    for i in tqdm(dataset.index):
        intime, outtime = dataset['ADMITTIME'].loc[i], dataset['DISCHTIME'].loc[i]
        if dataset['ADMITTIME'].isnull()[i] or dataset['ADMITTIME'].isnull()[i]:
            continue
        while outtime - intime > pd.Timedelta(hour_period):
            #['SUBJECT_ID', 'HADM_ID', 'DOB', 'Gender', 'DEATHTIME', 'ADMITTIME', 'DISCHTIME', 're_admission', 'elixhauser', 'Age']
            split.loc[index] = [
                dataset['SUBJECT_ID'].loc[i],
                dataset['HADM_ID'].loc[i],
                dataset['DOB'].loc[i],
                dataset['Gender'].loc[i],
                dataset['DEATHTIME'].loc[i],
                intime,
                intime + pd.Timedelta(hour_period),
                dataset['re_admission'].loc[i],
                dataset['elixhauser'].loc[i],
                (intime.to_pydatetime() - dataset['DOB'].loc[i].to_pydatetime()).days / 365
            ]
            intime += pd.Timedelta(hour_period)
            index += 1

        split.loc[index] = [
            dataset['SUBJECT_ID'].loc[i],
            dataset['HADM_ID'].loc[i],
            dataset['DOB'].loc[i],
            dataset['Gender'].loc[i],
            dataset['DEATHTIME'].loc[i],
            intime,
            outtime,
            dataset['re_admission'].loc[i],
            dataset['elixhauser'].loc[i],
            (intime.to_pydatetime() - dataset['DOB'].loc[i].to_pydatetime()).days / 365
        ]

    split.drop('DOB', axis=1, inplace=True)
    split = split[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'Gender', 'Age', 'DEATHTIME', 're_admission', 'elixhauser']]
    split.columns = ['SUBJECT_ID', 'HADM_ID', 'STARTTIME', 'ENDTIME', 'Gender', 'Age', 'DEATHTIME', 're_admission', 'elixhauser']
    split.to_csv(temporal_path + f'dataset_split_{hour}_hour.csv', index=False)

    
processes = []
for i in range(1, 11):
    processes.append(Process(target=split_time, args=(i, dataset)))
for i in range(1, 11):
    processes[i].start()
for i in range(1, 11):
    processes[i].join()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5406/5406 [56:03<00:00,  1.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5406/5406 [1:17:19<00:00,  1.17it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5406/5406 [1:56:23<00:00,  1.29s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5406/5406 [3:18:30<00:00,  2.20s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5406/5406 [6:59:0