In [87]:
import pandas as pd
import numpy as np
import pickle
from src.KEMCE.dataset.data_labelling import LabelsForData

In [88]:
# Input CSV paths: the diagnosis table and the patient table.
dx_file = 'data/diagnosis.csv'
pats_file = 'data/patient.csv'

# CCS (2015) single-level and multi-level diagnosis mapping files.
# NOTE(review): "sigle" looks like a typo for "single"; the name is kept as-is.
sigle_dx_file = 'ccs/ccs_single_dx_tool_2015.csv'
multi_dx_file = 'ccs/ccs_multi_dx_tool_2015.csv'
# Project helper; later cells read its `code2single_dx` and
# `code2first_level_dx` lookups to label each diagnosis code.
label4data = LabelsForData(multi_dx_file, sigle_dx_file)

In [4]:
# Load the raw diagnosis and patient tables; the first CSV row is the header.
dxes = pd.read_csv(dx_file, header=0)
pats = pd.read_csv(pats_file, header=0)

In [9]:
# Number of distinct `uniquepid` values in the patient table.
pats.uniquepid.unique().shape

(139367,)

In [26]:
# Rows per patient: index is `uniquepid`, value is how often it occurs in `pats`.
pat_vc = pats['uniquepid'].value_counts()

In [27]:
# Patients that appear at least twice in the patient table (two or more admissions).
has_multiple_rows = pat_vc.gt(1)
pat_two_plus = pat_vc.loc[has_multiple_rows].index.tolist()

In [29]:
# How many patients have two or more rows in the patient table.
len(pat_two_plus)

38483

In [44]:
# Map every multi-admission patient to the list of their `patientunitstayid`s,
# ordered by `hospitaldischargeyear` ascending and `hospitaladmitoffset` descending.
#
# The table is sorted and grouped once instead of filtering `pats` with a
# boolean mask for each patient, which re-scanned the full frame per patient.
multi_adm_pats = pats[pats.uniquepid.isin(pat_two_plus)].sort_values(
    by=['hospitaldischargeyear', 'hospitaladmitoffset'],
    ascending=[True, False])
stays_by_pid = {
    pid: stays.tolist()
    for pid, stays in multi_adm_pats.groupby('uniquepid', sort=False)['patientunitstayid']
}
# Rebuild in `pat_two_plus` order so the key order (and therefore the order of
# every sequence derived from this map) matches the per-patient loop it replaces.
pid_adm_map = {pid: stays_by_pid.get(pid, []) for pid in pat_two_plus}

In [49]:
# Save the patient -> ordered admissions map (highest pickle protocol).
# The context manager guarantees the file is flushed and closed.
with open('outputs/eICU/pid_adm.map', 'wb') as map_fd:
    pickle.dump(pid_adm_map, map_fd, -1)

In [51]:
# Drop diagnosis rows whose `icd9code` field is missing.
dxes = dxes.dropna(subset=['icd9code'])

In [90]:
# Building strSeqs: one entry per patient, each a list of admissions, each
# admission a list of ICD-9 code strings with the dots removed.
def _icd9_codes(icd9_field):
    """Return the ICD-9 codes held in one `icd9code` cell, dots removed.

    The cell is a comma-separated string. An entry counts as an ICD-9 code
    when its first character is a digit. When the cell holds exactly two
    entries both are checked; otherwise only the first entry is considered.

    Surrounding whitespace is stripped before the digit check, so a second
    entry written as ", 038.9" is recognised, and empty entries are skipped
    instead of raising IndexError.
    """
    parts = [part.strip() for part in icd9_field.split(',')]
    candidates = parts if len(parts) == 2 else parts[:1]
    # part[:1] is '' for an empty entry, and ''.isdigit() is False.
    return [part.replace('.', '') for part in candidates if part[:1].isdigit()]


# Group the diagnosis rows once by unit stay; filtering `dxes` inside the loop
# re-scanned the whole table for every admission. Row order within a stay is kept.
dx_by_stay = {
    stay_id: fields.tolist()
    for stay_id, fields in dxes.groupby('patientunitstayid')['icd9code']
}

seqs = []
for pid, adms in pid_adm_map.items():
    seq = []
    for adm in adms:
        # Each valid entry is appended as itself (previously the first entry
        # was appended again when the second one was the ICD-9 code).
        code_list = [code
                     for field in dx_by_stay.get(adm, [])
                     for code in _icd9_codes(field)]
        # Admissions without any ICD-9 code are dropped.
        if code_list:
            seq.append(code_list)
    # Keep only patients with at least two coded admissions.
    if len(seq) > 1:
        seqs.append(seq)

In [92]:
# Building new strSeqs: de-duplicate the codes of every admission, drop
# admissions that are left with fewer than two distinct codes, and keep only
# patients that still have more than one admission.
new_seqs = []
for seq in seqs:
    new_seq = []
    for adm in seq:
        # sorted() fixes the code order. Iterating a set of strings depends on
        # hash randomization, so list(set(...)) changed between interpreter
        # runs and made the saved sequences non-reproducible.
        unique_codes = sorted(set(adm))
        if len(unique_codes) > 1:
            new_seq.append(unique_codes)
    if len(new_seq) > 1:
        new_seqs.append(new_seq)

In [93]:
# Number of patients left after filtering.
len(new_seqs)

16180

In [94]:
# Longest patient sequence (in admissions) and largest admission (in codes);
# both stay 0 when there is nothing to measure.
max_seq_len = max((len(patient_adms) for patient_adms in new_seqs), default=0)
max_adm_len = max((len(codes)
                   for patient_adms in new_seqs
                   for codes in patient_adms), default=0)

# Three parallel nested lists (patient -> admission -> code), all 'D_'-prefixed:
#   new_seqs_str     the diagnosis code itself
#   adm_dx_ccs       its label from label4data.code2single_dx
#   adm_dx_ccs_cat1  its label from label4data.code2first_level_dx
new_seqs_str = [
    [['D_' + code for code in codes] for codes in patient_adms]
    for patient_adms in new_seqs
]
adm_dx_ccs = [
    [['D_' + label4data.code2single_dx[code] for code in codes]
     for codes in patient_adms]
    for patient_adms in new_seqs
]
adm_dx_ccs_cat1 = [
    [['D_' + label4data.code2first_level_dx[code] for code in codes]
     for codes in patient_adms]
    for patient_adms in new_seqs
]

In [97]:
# Longest sequence, largest admission, and the sizes of the three parallel
# lists (built one entry per patient, so they should be equal).
max_seq_len, max_adm_len, len(new_seqs_str), len(adm_dx_ccs), len(adm_dx_ccs_cat1)

(17, 57, 16180, 16180, 16180)

In [105]:
# Persist the 'D_'-prefixed code sequences; `with` closes the file deterministically.
with open('outputs/eICU/eicu.seqs', 'wb') as seqs_fd:
    pickle.dump(new_seqs_str, seqs_fd, -1)

In [104]:
# Spot-check one patient (index 6): first-level labels next to single-level labels.
adm_dx_ccs_cat1[6], adm_dx_ccs[6]

([['D_7', 'D_3'], ['D_3', 'D_3'], ['D_3', 'D_3']],
 [['D_102', 'D_50'], ['D_55', 'D_50'], ['D_50', 'D_48']])

In [107]:
print('Converting strSeqs to intSeqs, and making types for ccs single-level code')
# dict_ccs maps each single-level CCS label to an integer id, assigned in
# first-seen order. newSeqs_ccs mirrors adm_dx_ccs with every visit replaced
# by the ids of its distinct labels.
dict_ccs = {}
newSeqs_ccs = []
for patient in adm_dx_ccs:
    newPatient = []
    for visit in patient:
        # sorted() makes id assignment reproducible: plain set iteration order
        # for strings changes between interpreter runs (hash randomization).
        # setdefault returns the existing id or registers the next free one.
        newVisit = [dict_ccs.setdefault(code, len(dict_ccs))
                    for code in sorted(set(visit))]
        newPatient.append(newVisit)
    newSeqs_ccs.append(newPatient)

Converting strSeqs to intSeqs, and making types for ccs single-level code


In [108]:
# Number of distinct single-level CCS labels, and number of patients.
len(dict_ccs), len(newSeqs_ccs)

(167, 16180)

In [109]:
print('Converting strSeqs to intSeqs, and making types for ccs multi-level first level code')
# dict_ccs_cat1 maps each first-level CCS label to an integer id, assigned in
# first-seen order. newSeqs_ccs_cat1 mirrors adm_dx_ccs_cat1 with every visit
# replaced by the ids of its distinct labels.
dict_ccs_cat1 = {}
newSeqs_ccs_cat1 = []
for patient in adm_dx_ccs_cat1:
    newPatient = []
    for visit in patient:
        # sorted() makes id assignment reproducible: plain set iteration order
        # for strings changes between interpreter runs (hash randomization).
        # setdefault returns the existing id or registers the next free one.
        newVisit = [dict_ccs_cat1.setdefault(code, len(dict_ccs_cat1))
                    for code in sorted(set(visit))]
        newPatient.append(newVisit)
    newSeqs_ccs_cat1.append(newPatient)

Converting strSeqs to intSeqs, and making types for ccs multi-level first level code


In [110]:
# Number of distinct first-level CCS labels, and number of patients.
len(dict_ccs_cat1), len(newSeqs_ccs_cat1)

(18, 16180)

In [111]:
# Load the previously generated MIMIC input sequences; `with` closes the file.
# NOTE: pickle.load executes arbitrary code - only load files produced by this project.
with open('outputs/kemce/data/seq_prediction/mimic.inputs_all.seqs', 'rb') as mimic_fd:
    mimic_seqs = pickle.load(mimic_fd)

In [113]:
# Size and first entry of the MIMIC sequences (presumably a format reference
# for the eICU inputs built below - confirm).
len(mimic_seqs), mimic_seqs[0]

(7499,
 [['D_41401',
   'D_4111',
   'D_4241',
   'D_V4582',
   'D_2724',
   'D_4019',
   'D_60000',
   'D_3899']])

In [123]:
print('Converting seqs to model inputs')
# Per patient:
#   inputs_all         every visit except the last one (code strings)
#   labels_ccs         single-level CCS ids of the last visit
#   labels_visit_cat1  first-level CCS ids of every visit except the last one
# vocab_set counts how often each code string occurs in the inputs.
inputs_all, labels_ccs, labels_visit_cat1 = [], [], []
vocab_set = {}
for idx, patient_visits in enumerate(new_seqs_str):
    ccs_visits = newSeqs_ccs[idx]
    cat1_visits = newSeqs_ccs_cat1[idx]

    history = patient_visits[:-1]
    final_visit_ccs = ccs_visits[-1]

    labels_visit_cat1.append(cat1_visits[:-1])
    inputs_all.append(history)
    labels_ccs.append(final_visit_ccs)

    for visit_codes in history:
        for token in visit_codes:
            vocab_set[token] = vocab_set.get(token, 0) + 1

Converting seqs to model inputs


In [124]:
# Common path prefix of every file written by this cell.
out_file = 'outputs/eICU/seq_prediction/eicu'

# Vocabulary ordered by descending frequency (ties keep first-seen order).
sorted_vocab = {k: v for k, v in sorted(vocab_set.items(), key=lambda item: item[1], reverse=True)}

# Write each artifact through a context manager so every handle is flushed and
# closed, even if a later dump fails.
pickled_outputs = [
    ('.inputs_all.seqs', inputs_all),
    ('.labels_ccs.label', labels_ccs),
    ('.labels_visit_cat1.label', labels_visit_cat1),
    ('.ccs_single_level.dict', dict_ccs),
    ('.ccs_cat1.dict', dict_ccs_cat1),
]
for suffix, payload in pickled_outputs:
    with open(out_file + suffix, 'wb') as pickle_fd:
        pickle.dump(payload, pickle_fd, -1)

# One vocabulary token per line, most frequent first.
with open(out_file + '.vocab.txt', 'w') as outfd:
    for k in sorted_vocab:
        outfd.write(k + '\n')

# Recompute the sequence statistics for the summary line below.
max_seq_len = 0
max_adm_len = 0
for seq in new_seqs:
    if max_seq_len < len(seq):
        max_seq_len = len(seq)
    for adm in seq:
        if max_adm_len < len(adm):
            max_adm_len = len(adm)
print(max_adm_len, max_seq_len, len(dict_ccs), len(inputs_all), len(sorted_vocab), len(dict_ccs_cat1))

57 17 167 16180 725 18


In [149]:
# Load the sequences stored for the GRAM pipeline; `with` closes the file.
# NOTE: pickle.load executes arbitrary code - only load files produced by this project.
treeFile = 'outputs/gram/data/eicu/eicu_new.seqs'
with open(treeFile, 'rb') as tree_fd:
    tree = pickle.load(tree_fd)

In [162]:
# Reload the GRAM sequences and measure the number of entries per sequence.
# NOTE: pickle.load executes arbitrary code - only load files produced by this project.
treeFile = 'outputs/gram/data/eicu/eicu_new.seqs'
with open(treeFile, 'rb') as tree_fd:
    tree = pickle.load(tree_fd)
seq_lens = [len(seq) for seq in tree]
max_seq_len = max(seq_lens)
min_seq_len = min(seq_lens)

In [163]:
# Longest and shortest sequence length, and the number of sequences.
max_seq_len, min_seq_len, len(tree)

(17, 2, 16180)

In [158]:
# Longest sequence length (same value as max_seq_len from the cell that builds seq_lens).
max(seq_lens)

17

In [139]:
# Load the pickled object stored in eicu.level2.pk; `with` closes the file.
# NOTE: pickle.load executes arbitrary code - only load files produced by this project.
treeFile = 'outputs/eICU/seq_prediction/eicu.level2.pk'
with open(treeFile, 'rb') as tree_fd:
    tree_kemce = pickle.load(tree_fd)

In [140]:
# Display the loaded mapping (the recorded output is an OrderedDict of int -> list of ints).
# NOTE(review): this prints the whole object; consider showing only a few items.
tree_kemce

OrderedDict([(720, [720, 1110, 1109]),
             (721, [721, 1110, 1109]),
             (722, [722, 1110, 1109]),
             (723, [723, 1110, 1109]),
             (724, [724, 1110, 1109]),
             (725, [725, 1110, 1109])])