In [None]:
import pandas as pd
import os

In [None]:
# Locations of the input data. The defaults are the original machine-specific
# paths; set MIMIC_DATA_DIR / MIMIC_TASK_DIR in the environment to run the
# notebook elsewhere without editing this cell.
# DATA_DIR: root under which get_csv looks for `<folder>/<fname>.csv`.
DATA_DIR = os.environ.get('MIMIC_DATA_DIR', '/home/seewon/mimic3-benchmarks/data/mimiciv/3.1')
# TASK_DIR: root under which ids() reads `in-hospital-mortality/{test,train}/listfile.csv`.
TASK_DIR = os.environ.get('MIMIC_TASK_DIR', '/home/seewon/mimic3-benchmarks/data')
# Label column names; not referenced by the cells shown in this part of the notebook.
LABEL_COLS = ['Acute and unspecified renal failure', 'Cardiac dysrhythmias']

In [None]:
# Column selections, one list per source table.
# EMAR, LAB and PHARM are handed to get_csv as the `usecols` selection for the
# 'emar', 'labevents' and 'pharmacy' files respectively; ADM, PAT and PROC_ICD
# are not referenced by the cells shown in this part of the notebook.
ADM = [
    'subject_id', 'hadm_id',
    'edregtime', 'edouttime',
    'admittime', 'dischtime', 'deathtime',
]
PAT = [
    'subject_id', 'anchor_year', 'dod',
]
PROC_ICD = [
    'subject_id', 'hadm_id',
    'seq_num', 'chartdate', 'icd_code', 'icd_version',
]
PHARM = [
    'subject_id', 'hadm_id',
    'poe_id', 'starttime', 'stoptime', 'medication',
]
EMAR = [
    'subject_id', 'hadm_id',
    'emar_id', 'emar_seq', 'poe_id', 'charttime', 'medication',
]
LAB = [
    'labevent_id', 'subject_id', 'hadm_id', 'specimen_id', 'itemid',
    'charttime', 'value', 'valuenum',
    'ref_range_lower', 'ref_range_upper', 'flag',
]

In [None]:
# Column selections for the tables that carry a `stay_id`.
# CHART, INPUT, OUTPUT and PROC are handed to get_csv as the `usecols`
# selection for the 'chartevents', 'inputevents', 'outputevents' and
# 'procedureevents' files respectively; STAY is not referenced by the cells
# shown in this part of the notebook.
STAY = [
    'subject_id', 'hadm_id', 'stay_id',
    'intime', 'outtime',
]
CHART = [
    'subject_id', 'hadm_id', 'stay_id',
    'charttime', 'itemid', 'value', 'valuenum', 'warning',
]
INPUT = [
    'subject_id', 'hadm_id', 'stay_id',
    'starttime', 'endtime', 'itemid', 'amount',
    'rate', 'patientweight', 'totalamount', 'originalamount', 'originalrate',
]
OUTPUT = [
    'subject_id', 'hadm_id', 'stay_id',
    'charttime', 'itemid', 'value',
]
PROC = [
    'subject_id', 'hadm_id', 'stay_id',
    'starttime', 'endtime',
    'itemid', 'value', 'patientweight', 'originalamount', 'originalrate',
]

In [None]:
# Number of CSV rows per chunk; get_csv hands this to pd.read_csv as `chunksize`
# so the source files are filtered piece by piece instead of loaded whole.
CHUNK_SIZE = 1_000_000

In [None]:
def ids():
    """Collect the distinct numeric ids found in the in-hospital-mortality listfiles.

    Reads the ``stay`` column of ``test/listfile.csv`` and ``train/listfile.csv``
    under ``TASK_DIR/in-hospital-mortality``, keeps the text before the first
    underscore of every entry (presumably the subject id -- confirm against the
    listfile format) and converts it to a number.

    Returns
    -------
    list
        The ids of both splits with duplicates removed. The list comes from a
        ``set``, so its order carries no meaning.
    """
    collected = []
    # Test split first, then train, one split at a time.
    for split in ('test', 'train'):
        listfile = os.path.join(TASK_DIR, 'in-hospital-mortality', f'{split}/listfile.csv')
        stays = pd.read_csv(listfile, usecols=['stay'])['stay']
        prefixes = stays.apply(lambda entry: entry.split('_', 1)[0])
        collected += list(pd.to_numeric(prefixes).unique())

    # An id can occur in both splits; the set removes those repeats.
    return list(set(collected))


In [None]:
def get_csv(folder, fname, cols, label_ids, data_dir=None, chunk_size=None):
    """Load selected columns of one CSV table, keeping only rows of the given subjects.

    The file ``<data_dir>/<folder>/<fname>.csv`` is read in chunks and each
    chunk is filtered immediately, so the unfiltered table is never held in
    memory as a whole.

    Parameters
    ----------
    folder : str
        Sub-directory of ``data_dir`` that contains the file.
    fname : str
        File name without the ``.csv`` extension; it is also printed as a
        progress message.
    cols : list of str
        Columns to read (passed to ``pd.read_csv`` as ``usecols``). Must
        contain ``'subject_id'``, which the filter is applied to.
    label_ids : list-like
        ``subject_id`` values whose rows are kept.
    data_dir : str, optional
        Root directory of the CSV files. Defaults to the module-level
        ``DATA_DIR``.
    chunk_size : int, optional
        Rows per chunk. Defaults to the module-level ``CHUNK_SIZE``.

    Returns
    -------
    pandas.DataFrame
        The matching rows in file order, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``'subject_id'`` is not among the columns that were read.
    """
    print(fname)

    # The module constants are only looked up when the caller did not override them.
    root = DATA_DIR if data_dir is None else data_dir
    rows_per_chunk = CHUNK_SIZE if chunk_size is None else chunk_size
    csv_path = os.path.join(root, folder, f'{fname}.csv')

    # Using the chunked reader as a context manager releases the file handle
    # even when filtering a chunk raises part-way through the file.
    with pd.read_csv(csv_path, usecols=cols, chunksize=rows_per_chunk) as reader:
        filtered_chunks = [
            chunk[chunk['subject_id'].isin(label_ids)]
            for chunk in reader
        ]

    df = pd.concat(filtered_chunks, ignore_index=True)
    return df

In [None]:
# `all_ids` is not used by the export below; it is kept because other cells
# may rely on it.
all_ids = ids()

# Number of `visits_{i}.parquet` partitions to process.
N_PARTITIONS = 7

# The table spec is identical for every partition, so it is built once.
# fnames[k] is read with the column selection cols[k] from folders[k].
fnames = ['emar', 'labevents', 'pharmacy',
          'chartevents', 'inputevents', 'outputevents', 'procedureevents']
# Not exported here: 'admissions', 'patients', 'icustays'
cols = [EMAR, LAB, PHARM, CHART, INPUT, OUTPUT, PROC]
# Every table is read from the same folder; sized from `fnames` so the two
# lists cannot drift apart.
folders = ['meta'] * len(fnames)
# zip() would silently drop tables if the lists had different lengths.
assert len(fnames) == len(cols), "fnames and cols must pair up one-to-one"

# NOTE: get_csv scans the complete CSV on every call, so each source file is
# read once per partition.
for i in range(N_PARTITIONS):
    print(f"{i}")
    # Only the subject ids are needed, so only that column is loaded.
    label_ids = pd.read_parquet(f'data/raw_data/visits_{i}.parquet', columns=['subject_id'])['subject_id']

    for folder, fname, col in zip(folders, fnames, cols):
        df = get_csv(folder, fname, col, label_ids)
        df.to_parquet(f"data/raw_data/{fname}_{i}.parquet", index=False)