# **Data Preprocessing for Longitudinal EHR Data**

This notebook preprocesses [MIMIC-III](https://physionet.org/content/mimiciii/1.4/) data for two tasks: diagnosis prediction (multi-label binary classification) and mortality prediction (binary classification). It also builds the [ICD-9](https://www.cdc.gov/nchs/icd/icd9cm.htm) hierarchy for the diagnosis codes that appear in MIMIC-III, a patient-code adjacency matrix, a code-code adjacency matrix, and a code map (a dictionary that maps each diagnosis code to its integer index).

The model input is prepared in two formats: a binary format (one multi-hot vector per visit) and a set format (one zero-padded list of code indices per visit).

## **Package Setup**

In [None]:
import os
import pickle as pickle
import numpy as np
from datetime import datetime
import pandas as pd
import scipy.sparse as sps
import torch
from copy import deepcopy
import torch.nn as nn
import torch.nn.init as init
from torch.nn import functional as F
from collections import OrderedDict
import torch.utils.data as data
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import random
import warnings
warnings.filterwarnings("ignore")

## **Data Preprocessing Version 2**

In [None]:
def parse_admission(path) -> dict:
    """Read ADMISSIONS.csv and group admissions by patient.

    Args:
        path: Directory that contains ``ADMISSIONS.csv``.

    Returns:
        dict mapping ``SUBJECT_ID`` to a list of
        ``{'admission_id': int, 'admission_time': datetime}`` dicts sorted by
        admission time. Only patients with more than one admission are kept.
    """
    print('parsing ADMISSIONS.csv ...')
    admission_path = os.path.join(path, 'ADMISSIONS.csv')
    # `np.int` / `np.str` were plain aliases of the builtins and were removed in
    # NumPy 1.24, so the builtins are used directly.
    admissions = pd.read_csv(
        admission_path,
        usecols=['SUBJECT_ID', 'HADM_ID', 'ADMITTIME'],
        converters={'SUBJECT_ID': int, 'HADM_ID': int, 'ADMITTIME': str}
    )

    all_patients = dict()
    # Iterating the columns directly avoids the per-row Series that iterrows() builds.
    for pid, admission_id, admit_time in zip(admissions['SUBJECT_ID'],
                                             admissions['HADM_ID'],
                                             admissions['ADMITTIME']):
        all_patients.setdefault(pid, []).append({
            'admission_id': admission_id,
            'admission_time': datetime.strptime(admit_time, '%Y-%m-%d %H:%M:%S')
        })

    # A patient needs at least two admissions: history plus one visit to predict.
    patient_admission = dict()
    for pid, records in all_patients.items():
        if len(records) > 1:
            patient_admission[pid] = sorted(records, key=lambda record: record['admission_time'])

    return patient_admission

In [None]:
def parse_diagnoses(path, patient_admission: dict) -> dict:
    """Read DIAGNOSES_ICD.csv and collect the ICD-9 codes of each admission.

    Args:
        path: Directory that contains ``DIAGNOSES_ICD.csv``.
        patient_admission: Patients to keep; only membership of ``SUBJECT_ID``
            in this dict is checked.

    Returns:
        dict mapping ``HADM_ID`` to the list of dotted ICD-9 codes (e.g.
        ``'401.9'``, ``'E849.7'``) in file order. Rows with an empty code are
        skipped, so an admission with no coded diagnosis is absent from the result.
    """
    print('parsing DIAGNOSES_ICD.csv ...')
    diagnoses_path = os.path.join(path, 'DIAGNOSES_ICD.csv')
    # `np.int` / `np.str` were plain aliases of the builtins and were removed in
    # NumPy 1.24. Reading the code column as `str` also keeps leading zeros.
    diagnoses = pd.read_csv(
        diagnoses_path,
        usecols=['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE'],
        converters={'SUBJECT_ID': int, 'HADM_ID': int, 'ICD9_CODE': str}
    )

    def to_standard_icd9(code: str):
        # E-codes carry four characters before the dot, all other codes three.
        split_pos = 4 if code.startswith('E') else 3
        if len(code) > split_pos:
            return code[:split_pos] + '.' + code[split_pos:]
        return code

    admission_codes = dict()
    # Iterating the columns directly avoids the per-row Series that iterrows() builds.
    for pid, admission_id, code in zip(diagnoses['SUBJECT_ID'],
                                       diagnoses['HADM_ID'],
                                       diagnoses['ICD9_CODE']):
        if pid not in patient_admission or code == '':
            continue
        admission_codes.setdefault(admission_id, []).append(to_standard_icd9(code))

    return admission_codes

In [None]:
def calibrate_patient_by_admission(patient_admission: dict, admission_codes: dict):
    """Drop, in place, every patient that has an admission without diagnosis codes.

    A patient is removed from ``patient_admission`` as soon as one of their
    admissions is missing from ``admission_codes``; the codes of that patient's
    other admissions are removed from ``admission_codes`` as well. Both dicts
    are mutated and nothing is returned.
    """
    print('calibrating patients by admission ...')
    # Collect first, delete afterwards: the dict must not change size while iterated.
    incomplete_pids = [
        pid
        for pid, visits in patient_admission.items()
        if any(visit['admission_id'] not in admission_codes for visit in visits)
    ]
    for pid in incomplete_pids:
        for visit in patient_admission[pid]:
            admission_codes.pop(visit['admission_id'], None)
        del patient_admission[pid]

In [None]:
# Directory holding the raw MIMIC-III CSVs; later cells also read icd9.txt from it (via data_path).
raw_path = '/content/drive/MyDrive/CEED/MIMIC_Data/III/June_Preprocessed/'
# Patients with more than one admission, each with admissions sorted by time.
patient_admission = parse_admission(raw_path)
# Dotted ICD-9 codes per admission, restricted to the patients kept above.
admission_codes = parse_diagnoses(raw_path, patient_admission)
# Mutates both dicts in place: removes patients that have an admission without any code.
calibrate_patient_by_admission(patient_admission, admission_codes)
print('There are %d valid patients' % len(patient_admission))

parsing ADMISSIONS.csv ...
parsing DIAGNOSES_ICD.csv ...
calibrating patients by admission ...
There are 7493 valid patients


In [None]:
# Longest admission history over all retained patients (0 if there are none).
max_admission_num = max(
    (len(visits) for visits in patient_admission.values()),
    default=0,
)
# Largest number of diagnosis codes attached to a single admission (0 if there are none).
max_code_num_in_a_visit = max(
    (len(code_list) for code_list in admission_codes.values()),
    default=0,
)

In [None]:
def encode_code(admission_codes: dict) -> (dict, dict):
    """Assign an integer id to every distinct diagnosis code.

    Ids start at 1 and are handed out in order of first appearance while
    walking ``admission_codes``; 0 is never assigned.

    Returns:
        ``(admission_codes_encoded, code_map)``: the same admissions with each
        code replaced by its id, and the code -> id dictionary.
    """
    print('encoding code ...')
    code_map = dict()
    for code_list in admission_codes.values():
        for code in code_list:
            # setdefault only inserts unseen codes, so the next id is len + 1.
            code_map.setdefault(code, len(code_map) + 1)

    admission_codes_encoded = dict()
    for admission_id, code_list in admission_codes.items():
        admission_codes_encoded[admission_id] = [code_map[code] for code in code_list]
    return admission_codes_encoded, code_map

In [None]:
def encode_time_duration(patient_admission: dict) -> dict:
    """Compute, per patient, the whole days elapsed between consecutive admissions.

    Returns:
        dict mapping pid to a list with one entry per admission: 0 for the
        first admission, then the ``.days`` of the gap to the previous one.
    """
    print('encoding time duration ...')
    encoded = {}
    for pid, visits in patient_admission.items():
        gaps = [
            (current['admission_time'] - previous['admission_time']).days
            for previous, current in zip(visits, visits[1:])
        ]
        encoded[pid] = [0] + gaps
    return encoded

In [None]:
def split_patients(patient_admission: dict, admission_codes: dict, code_map: dict, seed=6669,
                   train_num=7000, valid_num=0) -> (np.ndarray, np.ndarray):
    """Split patient ids into train and test sets.

    Some patients are forced into the training set:
      * for every code in ``code_map``, the first patient (in
        ``patient_admission`` order) with that code in any admission, so each
        code is seen during training;
      * the patient with the most admissions.
    The remaining patients are shuffled with ``seed`` and fill the training
    set up to ``train_num``.

    Args:
        patient_admission: pid -> list of admission dicts with ``'admission_id'``.
        admission_codes: admission id -> list of (un-encoded) codes.
        code_map: Mapping whose keys are the codes that must be covered.
        seed: Seed passed to ``np.random.seed`` (this resets NumPy's global RNG).
        train_num: Target size of the training set. If the forced patients
            already exceed it, the training set is exactly the forced patients.
        valid_num: Number of shuffled patients skipped between train and test.
            They are reserved for validation and are not returned.

    Returns:
        ``(train_pids, test_pids)`` as NumPy arrays. The order of
        ``train_pids`` is set-iteration order and should not be relied upon.
    """
    print('splitting train, valid, and test pids')
    np.random.seed(seed)

    # Single pass over patients: a patient is forced into train when they are
    # the first one to bring a still-uncovered code. This selects the same
    # patients as scanning all patients once per code, without the nested scan.
    uncovered = set(code_map)
    common_pids = set()
    for pid, admissions in patient_admission.items():
        if not uncovered:
            break
        for admission in admissions:
            newly_covered = uncovered.intersection(admission_codes[admission['admission_id']])
            if newly_covered:
                common_pids.add(pid)
                uncovered -= newly_covered
    print('\r\t100%')

    # Keep the longest history in train; max() returns the first maximum, like
    # the strict '>' comparison it replaces. Skipped when there are no patients
    # so that no placeholder pid is injected.
    if patient_admission:
        common_pids.add(max(patient_admission, key=lambda pid: len(patient_admission[pid])))

    remaining_pids = np.array(list(set(patient_admission.keys()).difference(common_pids)))
    np.random.shuffle(remaining_pids)

    # Clamp at 0: a negative count would turn the slices below into
    # "all but the last k" and silently produce a wrong split.
    extra_train = max(train_num - len(common_pids), 0)
    train_pids = np.array(list(common_pids.union(set(remaining_pids[:extra_train].tolist()))))
    test_pids = remaining_pids[extra_train + valid_num:]
    return train_pids, test_pids

In [None]:
# Replace every diagnosis code by its integer id (ids start at 1) and keep the code -> id map.
admission_codes_encoded, code_map = encode_code(admission_codes)
# Day gaps between consecutive admissions of each patient (first entry is 0).
patient_time_duration_encoded = encode_time_duration(patient_admission)

# Number of distinct diagnosis codes.
code_num = len(code_map)

# The split works on the raw (string) codes, hence admission_codes rather than the encoded dict.
train_pids, test_pids = split_patients(
    patient_admission=patient_admission,
    admission_codes=admission_codes,
    code_map=code_map
)

encoding code ...
encoding time duration ...
splitting train, valid, and test pids
	100%


In [None]:
def build_code_xy(pids: np.ndarray,
                  patient_admission: dict,
                  admission_codes_encoded: dict,
                  max_admission_num: int,
                  code_num: int,
                  max_code_num_in_a_visit: int) -> (np.ndarray, np.ndarray, np.ndarray):
    """Build padded code features and multi-hot labels for the given patients.

    For each patient every admission except the last is a feature visit; the
    last admission provides the label.

    Returns:
        x: ``(len(pids), max_admission_num, max_code_num_in_a_visit)`` int array
           of code ids, zero-padded on both the visit and the code axis.
        y: ``(len(pids), code_num)`` int array; column ``id - 1`` is 1 for every
           code id of the last admission.
        lens: ``(len(pids),)`` int array with the number of feature visits.
    """
    print('building train/valid/test codes features and labels ...')
    total = len(pids)
    features = np.zeros((total, max_admission_num, max_code_num_in_a_visit), dtype=int)
    labels = np.zeros((total, code_num), dtype=int)
    visit_counts = np.zeros((total, ), dtype=int)
    for row, pid in enumerate(pids):
        print('\r\t%d / %d' % (row + 1, total), end='')
        visits = patient_admission[pid]
        history = visits[:-1]
        for step, visit in enumerate(history):
            visit_codes = admission_codes_encoded[visit['admission_id']]
            features[row, step, :len(visit_codes)] = visit_codes
        # Code ids are 1-based, label columns are 0-based.
        target_columns = np.array(admission_codes_encoded[visits[-1]['admission_id']]) - 1
        labels[row, target_columns] = 1
        visit_counts[row] = len(visits) - 1
    print('\r\t%d / %d' % (total, total))
    return features, labels, visit_counts

In [None]:
def build_time_duration_xy(pids: np.ndarray,
                           patient_time_duration_encoded: dict,
                           max_admission_num: int) -> (np.ndarray, np.ndarray):
    """Build time-gap features and targets for the given patients.

    Returns:
        x: ``(len(pids), max_admission_num)`` float array holding every gap of a
           patient except the last one, zero-padded on the right.
        y: ``(len(pids),)`` float array holding each patient's last gap.
    """
    print('building train/valid/test time duration features and labels ...')
    total = len(pids)
    features = np.zeros((total, max_admission_num))
    targets = np.zeros((total, ))
    for row, pid in enumerate(pids):
        print('\r\t%d / %d' % (row + 1, total), end='')
        gaps = patient_time_duration_encoded[pid]
        history_len = len(gaps) - 1
        features[row, :history_len] = gaps[:-1]
        targets[row] = gaps[-1]
    print('\r\t%d / %d' % (total, total))
    return features, targets

In [None]:
# Padded code-id features, multi-hot labels of the last visit, and the number of
# feature visits per patient. Rows follow the order of train_pids / test_pids.
train_codes_x, train_codes_y, train_visit_lens = build_code_xy(train_pids, patient_admission, admission_codes_encoded, max_admission_num, code_num, max_code_num_in_a_visit)
test_codes_x, test_codes_y, test_visit_lens = build_code_xy(test_pids, patient_admission, admission_codes_encoded, max_admission_num, code_num, max_code_num_in_a_visit)

building train/valid/test codes features and labels ...
	7000 / 7000
building train/valid/test codes features and labels ...
	493 / 493


In [None]:
def parse_icd9_range(range_: str) -> (str, str, int, int):
    """Parse one ICD-9 range such as ``'001-009'``, ``'V01-V09'`` or ``'042'``.

    Args:
        range_: Range text, optionally surrounded by whitespace. Either
            ``'<start>-<end>'`` or a single code; V- and E-codes carry their
            letter on both ends.

    Returns:
        ``(prefix, format_, start, end)`` where ``prefix`` is ``'V'``, ``'E'``
        or ``''``, ``format_`` is the printf format of the numeric part
        (``'%02d'`` for V-codes, ``'%03d'`` otherwise), and ``start`` / ``end``
        are the inclusive numeric bounds. A single code yields ``start == end``.
    """
    bounds = range_.strip().split('-')
    first = bounds[0]
    if first[0] == 'V':
        prefix, format_ = 'V', '%02d'
    elif first[0] == 'E':
        prefix, format_ = 'E', '%03d'
    else:
        prefix, format_ = '', '%03d'

    skip = len(prefix)
    start = int(first[skip:])
    # Bounds are inclusive (an explicit 'a-b' returns b itself), so a lone code
    # is the range [start, start]. This also covers a lone V/E code, which
    # previously raised IndexError.
    end = int(bounds[1][skip:]) if len(bounds) > 1 else start
    return prefix, format_, start, end

In [None]:
def generate_code_levels(path, code_map: dict) -> np.ndarray:
    """Assign a four-level hierarchy position to every code in ``code_map``.

    ``icd9.txt`` in ``path`` is read line by line. A line that starts with a
    space is a sub-range and is expanded into three-character codes; any other
    line starts a new top-level group. Counters are only advanced for ranges
    that contain at least one code present in ``code_map``:
      * level 1 - top-level group,
      * level 2 - sub-range,
      * level 3 - three-character code (the part before the dot),
      * level 4 - the full code, numbered in ``code_map`` iteration order.

    Codes whose three-character prefix is not found in the file get
    ``[0, 0, 0, 0]`` and the prefix is printed.

    Args:
        path: Directory that contains ``icd9.txt``.
        code_map: dict mapping code string to integer id, used as row index.

    Returns:
        ``(len(code_map) + 1, 4)`` int array; row ``code_map[code]`` holds the
        four levels of ``code``. Rows not addressed by any id stay zero.
    """
    print('generating code levels ...')
    three_level_code_set = set(code.split('.')[0] for code in code_map)
    icd9_path = os.path.join(path, 'icd9.txt')
    # Context manager so the handle is closed instead of being left to the GC.
    with open(icd9_path, 'r', encoding='utf-8') as icd9_file:
        icd9_range = icd9_file.readlines()

    three_level_dict = dict()
    level1, level2, level3 = (1, 1, 1)
    level1_can_add = False
    for range_ in icd9_range:
        range_ = range_.rstrip()
        if not range_:
            # Blank lines would otherwise fail on range_[0].
            continue
        if range_[0] == ' ':
            prefix, format_, start, end = parse_icd9_range(range_)
            range_has_code = False
            for i in range(start, end + 1):
                code = prefix + format_ % i
                if code in three_level_code_set:
                    three_level_dict[code] = [level1, level2, level3]
                    level3 += 1
                    level1_can_add = True
                    range_has_code = True
            if range_has_code:
                level2 += 1
        else:
            # A new top-level group starts; only advance level 1 if the previous
            # group actually contributed a code.
            if level1_can_add:
                level1 += 1
                level1_can_add = False

    level4 = 1
    code_level = dict()
    for code in code_map:
        three_level_code = code.split('.')[0]
        if three_level_code in three_level_dict:
            three_level = three_level_dict[three_level_code]
            code_level[code] = three_level + [level4]
            level4 += 1
        else:
            print(three_level_code)
            code_level[code] = [0, 0, 0, 0]

    code_level_matrix = np.zeros((len(code_map) + 1, 4), dtype=int)
    for code, cid in code_map.items():
        code_level_matrix[cid] = code_level[code]

    return code_level_matrix

In [None]:
def generate_patient_code_adjacent(code_x: np.ndarray, code_num: int) -> np.ndarray:
    """Build the patient x code incidence matrix.

    Args:
        code_x: Integer array of code ids per patient; entries equal to 0 are
            treated as padding and ignored.
        code_num: Number of distinct codes.

    Returns:
        ``(len(code_x), code_num + 1)`` int array with a 1 at ``[p, c]`` when
        code id ``c`` occurs anywhere in patient ``p``'s entries. Column 0 is
        never set.
    """
    print('generating patient code adjacent matrix ...')
    adjacency = np.zeros((len(code_x), code_num + 1), dtype=int)
    # Each `row` is a view into `adjacency`, so the assignment writes through.
    for row, patient_codes in zip(adjacency, code_x):
        row[patient_codes[patient_codes > 0]] = 1
    return adjacency

In [None]:
def generate_code_code_adjacent(code_num: int, code_level_matrix: np.ndarray) -> np.ndarray:
    """Score every ordered pair of distinct codes by their hierarchy levels.

    For codes ``i != j`` the four level columns are compared from the last
    column down to the first; the 1-based index of the first column found equal
    is the shared level (0 if no column matches) and the stored value is that
    level plus one. Row 0, column 0 and the diagonal stay 0.

    Returns:
        ``(code_num + 1, code_num + 1)`` int array.
    """
    print('generating code code adjacent matrix ...')
    size = code_num + 1
    adjacency = np.zeros((size, size), dtype=int)
    for row in range(1, size):
        print('\r\t%d / %d' % (row, size), end='')
        row_levels = code_level_matrix[row]
        for col in range(1, size):
            if row == col:
                continue
            col_levels = code_level_matrix[col]
            # Deepest column (4 -> 1) on which both codes agree, 0 when none does.
            shared = next(
                (depth for depth in (4, 3, 2, 1) if row_levels[depth - 1] == col_levels[depth - 1]),
                0,
            )
            adjacency[row, col] = shared + 1
    print('\r\t%d / %d' % (size, size))
    return adjacency

In [None]:
def co_occur(pids: np.ndarray,
             patient_admission: dict,
             admission_codes_encoded: dict,
             code_num: int) -> np.ndarray:
    """Mark which pairs of codes appear together in a feature visit.

    Only the admissions used as features are scanned, i.e. every admission of a
    patient except the last one.

    Args:
        pids: Patients to scan.
        patient_admission: pid -> list of admission dicts with ``'admission_id'``.
        admission_codes_encoded: admission id -> list of integer code ids.
        code_num: Number of distinct codes.

    Returns:
        A single symmetric ``(code_num + 1, code_num + 1)`` float array with 1.0
        at ``[a, b]`` when code ids ``a`` and ``b`` sit at two different
        positions of the same admission's code list.
    """
    print('calculating co-occurrence ...')
    x = np.zeros((code_num + 1, code_num + 1), dtype=float)
    for i, pid in enumerate(pids):
        print('\r\t%d / %d' % (i + 1, len(pids)), end='')
        for admission in patient_admission[pid][:-1]:
            codes = admission_codes_encoded[admission['admission_id']]
            # Every unordered pair of positions (m < n) within the admission.
            for m, c_i in enumerate(codes):
                for c_j in codes[m + 1:]:
                    x[c_i, c_j] = 1
                    x[c_j, c_i] = 1
    print('\r\t%d / %d' % (len(pids), len(pids)))
    return x

In [None]:
# icd9.txt is read from the same directory as the raw CSVs.
data_path = raw_path
# Four hierarchy levels per code id (row 0 is unused because ids start at 1).
code_levels = generate_code_levels(data_path, code_map)
# Built from the training features only.
patient_code_adj = generate_patient_code_adjacent(code_x=train_codes_x, code_num=code_num)
# Hierarchy-based score for every pair of distinct codes.
code_code_adj_t = generate_code_code_adjacent(code_level_matrix=code_levels, code_num=code_num)
# 1.0 where two codes share a feature visit of a training patient.
co_occur_matrix = co_occur(train_pids, patient_admission, admission_codes_encoded, code_num)
# Element-wise product: keep the hierarchy score only for pairs that co-occur.
code_code_adj = code_code_adj_t * co_occur_matrix

generating code levels ...
generating patient code adjacent matrix ...
generating code code adjacent matrix ...
	4881 / 4881
calculating co-occurrence ...
	7000 / 7000


In [None]:
# Consecutive row indices: training patients first, test patients right after them.
l1 = len(train_pids)
l2 = l1 + 0
l3 = l2 + len(test_pids)
train_patient_ids = np.arange(0, l1)
test_patient_ids = np.arange(l2, l3)

# SUBJECT_ID -> row index across the concatenated train/test ordering.
pid_map = dict(zip(train_pids, train_patient_ids))
pid_map.update(zip(test_pids, test_patient_ids))

In [None]:
# Code ids start at 1, so index 0 of every code axis is padding. Strip it once.
# Each step is guarded by the padded size so that re-running this cell is a
# no-op instead of silently dropping the first real code.
padded_code_num = code_num + 1
if code_levels.shape[0] == padded_code_num:
    code_levels = code_levels[1:]                          # remove first row
if patient_code_adj.shape[1] == padded_code_num:
    patient_code_adj = np.delete(patient_code_adj, 0, 1)   # remove first column
if code_code_adj.shape == (padded_code_num, padded_code_num):
    code_code_adj = np.delete(code_code_adj[1:], 0, 1)     # remove first row & column

In [None]:
# PATIENTS.csv sits next to the other raw CSVs, so derive its location from raw_path.
patients = pd.read_csv(os.path.join(raw_path, 'PATIENTS.csv'))

# One indexed lookup table instead of a full-table scan per patient.
expire_flag_by_pid = patients.set_index('SUBJECT_ID')['EXPIRE_FLAG']
if not expire_flag_by_pid.index.is_unique:
    raise ValueError('PATIENTS.csv contains duplicated SUBJECT_ID values')


def _mortality_labels(pids):
    """Return the EXPIRE_FLAG of each pid, in order, as a float array."""
    flags = expire_flag_by_pid.reindex(pids)
    # reindex yields NaN for unknown ids; fail loudly rather than save NaN labels.
    if flags.isna().any():
        missing = [pid for pid, is_missing in zip(pids, flags.isna()) if is_missing]
        raise KeyError('no usable EXPIRE_FLAG in PATIENTS.csv for SUBJECT_ID(s): %s' % missing[:10])
    return flags.to_numpy(dtype=float)


train_mort = _mortality_labels(train_pids)
test_mort = _mortality_labels(test_pids)

In [None]:
def to_binary_visits(codes_x, visit_lens, code_num):
    """Convert padded code-id visits into one multi-hot matrix per patient.

    Args:
        codes_x: Per-patient arrays of code ids per visit; 0 is padding.
        visit_lens: Number of real (non-padding) visits of each patient.
        code_num: Number of distinct codes (width of each multi-hot row).

    Returns:
        List with one ``(visit_len, code_num)`` float array per patient, where
        column ``id - 1`` is 1 for every code id present in the visit.
    """
    binary_visits = []
    for patient_visits, visit_len in zip(codes_x, visit_lens):
        one_patient = np.zeros((visit_len, code_num))
        for visit_idx in range(visit_len):
            visit_codes = patient_visits[visit_idx]
            # Drop padding, then shift 1-based ids to 0-based columns.
            one_patient[visit_idx][visit_codes[visit_codes > 0] - 1] = 1
        binary_visits.append(one_patient)
    return binary_visits


binary_train_codes_x = to_binary_visits(train_codes_x, train_visit_lens, code_num)
binary_test_codes_x = to_binary_visits(test_codes_x, test_visit_lens, code_num)

In [None]:
# Binary (multi-hot) format: everything goes to <raw_path>/Binary_Data_Format.
binary_dir = os.path.join(raw_path, 'Binary_Data_Format')
os.makedirs(binary_dir, exist_ok=True)  # open(..., 'wb') does not create missing directories

# Variable-length per-patient matrices cannot be stacked into one array, so they are pickled.
binary_pickles = {
    'binary_train_codes_x.pkl': binary_train_codes_x,
    'binary_test_codes_x.pkl': binary_test_codes_x,
}
for file_name, visits in binary_pickles.items():
    with open(os.path.join(binary_dir, file_name), 'wb') as out_file:
        pickle.dump(visits, out_file)

binary_arrays = {
    'train_codes_y.npy': train_codes_y,
    'train_visit_lens.npy': train_visit_lens,
    'train_mort.npy': train_mort,
    'test_codes_y.npy': test_codes_y,
    'test_visit_lens.npy': test_visit_lens,
    'test_mort.npy': test_mort,
}
for file_name, array in binary_arrays.items():
    with open(os.path.join(binary_dir, file_name), 'wb') as out_file:
        np.save(out_file, array)

In [None]:
# Set (padded code-id) format: everything goes to <raw_path>/Nonbinary_Data_Format.
nonbinary_dir = os.path.join(raw_path, 'Nonbinary_Data_Format')
os.makedirs(nonbinary_dir, exist_ok=True)  # open(..., 'wb') does not create missing directories

nonbinary_arrays = {
    'train_codes_x.npy': train_codes_x,
    'test_codes_x.npy': test_codes_x,
    'train_codes_y.npy': train_codes_y,
    'train_visit_lens.npy': train_visit_lens,
    'train_mort.npy': train_mort,
    'test_codes_y.npy': test_codes_y,
    'test_visit_lens.npy': test_visit_lens,
    'test_mort.npy': test_mort,
}
for file_name, array in nonbinary_arrays.items():
    with open(os.path.join(nonbinary_dir, file_name), 'wb') as out_file:
        np.save(out_file, array)

In [None]:
# Code-level artefacts: everything goes to <raw_path>/code_related.
code_related_dir = os.path.join(raw_path, 'code_related')
os.makedirs(code_related_dir, exist_ok=True)  # open(..., 'wb') does not create missing directories

code_related_arrays = {
    'code_levels.npy': code_levels,
    'patient_code_adj.npy': patient_code_adj,
    'code_code_adj.npy': code_code_adj,
}
for file_name, array in code_related_arrays.items():
    with open(os.path.join(code_related_dir, file_name), 'wb') as out_file:
        np.save(out_file, array)

# code_map is a plain dict (code string -> integer id), hence pickle rather than .npy.
with open(os.path.join(code_related_dir, 'code_map.pkl'), 'wb') as out_file:
    pickle.dump(code_map, out_file)