# Processing

In this step we generate the sparse matrix, general information and ICD9 codes for each patient.

In [1]:
from __future__ import print_function

import psycopg2
import datetime
import sys
from operator import itemgetter, attrgetter, methodcaller
import numpy as np
import itertools
import os.path
import matplotlib.pyplot as plt
import math
from multiprocessing import Pool, cpu_count
import re
import traceback
import shutil
from collections import OrderedDict
from tqdm import tqdm_notebook as tqdm

from utils import getConnection
from utils import parseUnitsMap
from utils import parseNum
from utils import sparsify

# %matplotlib inline

In [28]:
# Use half of the available cores, but never fewer than one worker:
# on a single-core machine cpu_count() // 2 is 0, and multiprocessing.Pool
# raises ValueError when asked for 0 processes.
num_workers = max(1, cpu_count() // 2)

## Set unit conversion map

We manually set rules for unit conversion and store them in the file './config/unitsmap.unit'. The file format is as follows:

```
tablename:[name of table in database]
[itemid1],[target unit1],[unit11:ratio11],[unit12:ratio12],...,[unit1n:ratio1n]
...
[itemidn],[target unitn],[unitn1:ration1],[unitn2:ration2],...,[unitnn:rationn]
```

The ratio is set using the following rule: $\mathrm{unit1}*\mathrm{ratio1}=\mathrm{unit2}*\mathrm{ratio2}=...=\mathrm{unitn}*\mathrm{ration}$. For example, one row in the file could be: `227114,mg,mg:1,mcg:1000`. It means that $1\mathrm{mg}=1000\mathrm{mcg}$.

In [3]:
# Open a database connection through the project helper.
conn = getConnection()
# Load the manual unit-conversion rules described above. The handlers below
# index the result first by table name and then by itemid.
UNITSMAP = parseUnitsMap()

# Set indices for chartevents table

We need to add an index on hadm_id to speed up the queries; it is not created by default. Thanks to Weijing Tang@UMich for the help!

You might need to run `grant postgres to <your username>;` before building indices. https://stackoverflow.com/questions/28584640/postgresql-error-must-be-owner-of-relation-when-changing-a-owner-object/28586288

In [4]:
# Open a fresh connection and cursor for the index DDL below.
conn = getConnection()
cur = conn.cursor()
# add index to the whole chartevents
# Any existing index named chartevents_idx02 is dropped first, so re-running
# this cell rebuilds the hadm_id index from scratch.
indicescomm = '''DROP INDEX IF EXISTS chartevents_idx02;
CREATE INDEX chartevents_idx02 ON mimiciii.chartevents (hadm_id);'''
cur.execute(indicescomm)
# Commit so the DDL takes effect outside this transaction.
conn.commit()

In [5]:
# Each .npy file stores a pickled dict, hence allow_pickle=True and .tolist()
# to unwrap the 0-d object array back into that dict.
_adm = np.load('res/admission_ids.npy', allow_pickle=True).tolist()
admission_ids = _adm['admission_ids']
admission_ids_txt = _adm['admission_ids_txt']

_adm_first = np.load('res/admission_first_ids.npy', allow_pickle=True).tolist()
# Read from _adm_first, not _adm: the original line indexed _adm here, which
# silently made admission_first_ids identical to admission_ids and left the
# file loaded above unused.
# NOTE(review): assumes this file uses the same 'admission_ids' key - confirm
# against the step that writes res/admission_first_ids.npy.
admission_first_ids = _adm_first['admission_ids']

In [6]:
# Load the filtered feature ids for every source table. Each file stores a
# pickled dict with an 'id' list and, for some groups, a parallel 'unit' list.
# `v` is only a scratch name that is overwritten by each load.
v = np.load('res/filtered_input.npy', allow_pickle=True).tolist()
valid_input = v['id']
valid_input_unit = v['unit']

v = np.load('res/filtered_output.npy', allow_pickle=True).tolist()
valid_output = v['id']

v = np.load('res/filtered_chart.npy', allow_pickle=True).tolist()
valid_chart = v['id']
valid_chart_unit = v['unit']

v = np.load('res/filtered_chart_num.npy', allow_pickle=True).tolist()
valid_chart_num = v['id']
valid_chart_num_unit = v['unit']

v = np.load('res/filtered_chart_cate.npy', allow_pickle=True).tolist()
valid_chart_cate = v['id']

v = np.load('res/filtered_chart_ratio.npy', allow_pickle=True).tolist()
valid_chart_ratio = v['id']

v = np.load('res/filtered_lab.npy', allow_pickle=True).tolist()
valid_lab = v['id']
valid_lab_unit = v['unit']

v = np.load('res/filtered_lab_num.npy', allow_pickle=True).tolist()
valid_lab_num = v['id']
valid_lab_num_unit = v['unit']

v = np.load('res/filtered_lab_cate.npy', allow_pickle=True).tolist()
valid_lab_cate = v['id']

v = np.load('res/filtered_lab_ratio.npy', allow_pickle=True).tolist()
valid_lab_ratio = v['id']

v = np.load('res/filtered_microbio.npy', allow_pickle=True).tolist()
valid_microbio = v['id']

v = np.load('res/filtered_prescript.npy', allow_pickle=True).tolist()
valid_prescript = v['id']
valid_prescript_unit = v['unit']

# Concatenate every group into one flat id list; the position of an id in this
# list is later used as its column index. The ratio groups are listed twice on
# purpose (the unit list built further down likewise reserves two slots per
# ratio id) - presumably one column per side of the ratio; confirm downstream.
allids = valid_input+valid_output+valid_chart+valid_chart_num+valid_chart_cate+valid_chart_ratio+valid_chart_ratio+valid_lab+valid_lab_num+valid_lab_cate+valid_lab_ratio+valid_lab_ratio+valid_microbio+valid_prescript
# Total entries vs. distinct ids; the two differ when an id occurs more than once.
print(len(allids), len(set(allids)))

23634 23533


## Create temporary tables for accelerating the query

In [7]:
# put valid ids into database
# Copy each list of numeric itemids into its own one-column table so the
# per-admission queries below can filter with
# `itemid in (select * from mengcztemp_itemids_<name>)`.
conn = getConnection()
cur = conn.cursor()
# The table-name suffixes are produced by splitting the comma-separated string,
# in the same order as the lists they are zipped with.
for itemidlist, itemidlistname in zip([valid_input, valid_output, valid_chart, valid_chart_num, valid_chart_cate, valid_chart_ratio, valid_lab, valid_lab_num, valid_lab_cate, valid_lab_ratio], 'valid_input, valid_output, valid_chart, valid_chart_num, valid_chart_cate, valid_chart_ratio, valid_lab, valid_lab_num, valid_lab_cate, valid_lab_ratio'.replace(' ', '').split(',')):
    # Start from a clean table on every run.
    sql = 'drop table if exists mengcztemp_itemids_{0}'.format(itemidlistname)
    cur.execute(sql)
    conn.commit()
    sql = 'create table if not exists mengcztemp_itemids_{0} (\
    itemid serial PRIMARY KEY \
    )'.format(itemidlistname)
    cur.execute(sql)
    conn.commit()
    # One INSERT per id, committed once per table.
    for itemid in itemidlist:
        sql = 'insert into mengcztemp_itemids_{0} (itemid) values ({1})'.format(itemidlistname, itemid)
        cur.execute(sql)
    conn.commit()
    # Read back a sample of the table; `res` is only inspected when the debug
    # print below is enabled.
    sql = 'select * from mengcztemp_itemids_{0} limit 100'.format(itemidlistname)
    cur.execute(sql)
    res = cur.fetchall()
#     print(res)

In [8]:
# Same idea as the numeric id tables, but prescription ids are drug-code
# strings, so the column is varchar and each value is quoted in the INSERT.
# Reuses the module-level `conn` opened in the previous cell.
cur = conn.cursor()
sql = 'drop table if exists mengcztemp_itemids_{0}'.format('valid_prescript')
cur.execute(sql)
conn.commit()
sql = 'create table if not exists mengcztemp_itemids_{0} (\
    itemid varchar(255) PRIMARY KEY \
    )'.format('valid_prescript')
cur.execute(sql)
conn.commit()
# NOTE(review): values are spliced into the SQL text, so a code containing a
# single quote would break the statement.
for itemid in valid_prescript:
    sql = 'insert into mengcztemp_itemids_{0} (itemid) values (\'{1}\')'.format('valid_prescript', itemid)
    cur.execute(sql)
conn.commit()
# Read back up to 100 rows as a sanity check and compare against the list size.
sql = 'select * from mengcztemp_itemids_{0} limit 100'.format('valid_prescript')
cur.execute(sql)
res = cur.fetchall()
print(res, len(res), len(valid_prescript))

[('DXIG40I',), ('NORESTFE',), ('ARIP10',), ('BACI5I',), ('CALCITVITD',), ('D12.5W250I',), ('NEOLOPE2L',), ('METR500I',), ('LACO50L',), ('OLME20',), ('SILD25',), ('MYCO250IND',), ('ROPI5',), ('NEOSYRD12.5W',), ('VALDIND',), ('GLAR100I',), ('GAST30L',), ('MIDA50I',), ('VANCOINV',), ('CORTIND',), ('RALT400IND',), ('VANC500L',), ('LANT750',), ('METO25',), ('BACTDS',), ('DIAZE5I',), ('ACET100D',), ('MORP20/30L',), ('MAGSUL20I',), ('LITH450SA',), ('DOXA1',), ('CORTOTIC',), ('ENOX60I',), ('PROP100IG',), ('FENT100P',), ('LEUC5',), ('VANC.000125L',), ('MYCO2C',), ('DISU250',), ('FLUC60L',), ('ACET800I',), ('SNLR260O',), ('CLAR500SYR',), ('DICL25',), ('NS20SYR',), ('HYDR1O',), ('CHLO2',), ('SORA200',), ('VITE100',), ('COMBPTCH',), ('NTG100PM',), ('OXYB5',), ('CYCLOBASE',), ('GOLY4L',), ('ISMO10',), ('MICO745C',), ('DRON400',), ('LATA5OS',), ('FOSA150IV',), ('DARU400',), ('LAMO100BRAND',), ('NIFR150',), ('FENT20I',), ('GLUC5XL',), ('ENOX120I',), ('TEMO100',), ('CISA10I',), ('HYDZ25',), ('FURO40',

In [9]:
# Report how many ids each feature group contributes, followed by the length
# of the combined id list.
for group_name, group_ids in [
    ('valid_input', valid_input),
    ('valid_output', valid_output),
    ('valid_chart', valid_chart),
    ('valid_chart_num', valid_chart_num),
    ('valid_chart_cate', valid_chart_cate),
    ('valid_chart_ratio', valid_chart_ratio),
    ('valid_lab', valid_lab),
    ('valid_lab_num', valid_lab_num),
    ('valid_lab_cate', valid_lab_cate),
    ('valid_lab_ratio', valid_lab_ratio),
    ('valid_microbio', valid_microbio),
    ('valid_prescript', valid_prescript),
]:
    print('len(' + group_name + ') = '+ str(len(group_ids)))
print('\nlen(allids) = '+ str(len(allids)))

len(valid_input) = 3210
len(valid_output) = 1155
len(valid_chart) = 2644
len(valid_chart_num) = 314
len(valid_chart_cate) = 3405
len(valid_chart_ratio) = 100
len(valid_lab) = 372
len(valid_lab_num) = 113
len(valid_lab_cate) = 224
len(valid_lab_ratio) = 1
len(valid_microbio) = 9154
len(valid_prescript) = 2841

len(allids) = 23634


In [10]:
# map itemids to [0..n] column
# map itemids to [0..n] column
allitem = allids

# One unit label per entry of allids, in the same group order. Groups that have
# no unit list get the placeholder 'NOCHECK' (two per ratio id, matching the
# doubled ratio ids in allids). Labels are normalised by removing spaces and
# lower-casing.
allitem_unit = [
    unit.replace(' ','').lower()
    for unit in (
        valid_input_unit
        + ['NOCHECK'] * len(valid_output)
        + valid_chart_unit
        + valid_chart_num_unit
        + ['NOCHECK'] * len(valid_chart_cate)
        + ['NOCHECK'] * 2 * len(valid_chart_ratio)
        + valid_lab_unit
        + valid_lab_num_unit
        + ['NOCHECK'] * len(valid_lab_cate)
        + ['NOCHECK'] * 2 * len(valid_lab_ratio)
        + ['NOCHECK'] * len(valid_microbio)
        + valid_prescript_unit
    )
]
assert len(allitem) == len(allitem_unit)

# itemid -> list of column positions; an id that occurs several times in
# allitem collects every one of its positions.
map_itemid_index = {}
index = 0
for ai in allitem:
    map_itemid_index.setdefault(ai, []).append(index)
    index += 1
# print(map_itemid_index)
print(len(map_itemid_index))
np.save('res/map_itemid_index.npy', map_itemid_index)

23533


## Map strings in categorical features to integers and store them to a file

In [11]:
def _build_category_mapping(cur, table, itemid):
    """Return ``{distinct value: 1-based integer code}`` for one categorical itemid.

    Args:
        cur: open database cursor.
        table: name of the mimiciii table to scan ('chartevents' or 'labevents').
        itemid: itemid whose distinct ``value`` strings are enumerated.

    The query is ordered by ``value`` so that the integer assigned to each
    string does not depend on the unspecified row order of SELECT DISTINCT.
    """
    cur.execute('SELECT distinct value FROM mimiciii.' + table + ' WHERE itemid = '+ str(itemid) +' and hadm_id in (select * from admission_ids) ORDER BY value')
    return {row[0]: code for code, row in enumerate(cur.fetchall(), 1)}


# itemid -> {string value -> integer code}, for chart and lab categorical features.
catedict = {}
for table, cate_itemids in (('chartevents', valid_chart_cate), ('labevents', valid_lab_cate)):
    for i in cate_itemids:
        cur = conn.cursor()
        catedict[i] = _build_category_mapping(cur, table, i)
        print(i)

np.save('res/catedict.npy', catedict)
print('saved!')

2848
3565
6158
4349
4718
3370
4466
224890
8394
6122
4888
6967
599
3071
4321
6169
225099
1820
4875
224854
7477
5729
359
225003
228307
227837
1478
3367
1543
7003
1229
227677
6312
2150
7981
4650
224889
3666
4722
224692
8519
1116
224960
4388
5142
7697
7818
5796
2452
6960
6068
5605
5080
537
5099
2300
7214
224450
8372
8178
227782
4475
6680
325
223991
223947
1999
227834
1333
6782
4986
3124
5561
5048
3510
7300
7088
5329
4910
227683
3335
6309
228153
8399
227752
1389
1364
227783
655
224084
167
6163
224991
5656
478
8523
224988
458
4414
629
223782
1606
2761
4355
2693
227240
227929
4360
6210
3492
5347
224633
227474
2542
1963
5210
6118
228448
77
225743
140
4976
56
6790
3384
4291
8466
224101
228449
4449
5762
7934
224976
227963
224005
223840
228446
1910
224935
8320
977
4189
8433
376
227672
224001
515
1427
228303
6287
6407
8011
5610
4435
165
5838
4312
6247
570
1049
7780
7947
5770
7779
7327
609
228451
7397
224907
6337
8230
227853
5821
6031
1405
3458
7219
227643
2351
108
223902
7646
2362
1089
3034
3249
5

227517
8538
224978
7667
4990
8265
5781
228290
4676
227601
227667
5742
5947
665
5646
225240
227675
1439
5876
6212
8149
6239
3638
5650
5912
6192
6202
2316
399
227761
6535
227475
3354
7914
4604
1545
223979
225007
268
549
1512
8427
310
3179
4735
224997
225135
1369
3817
8509
223931
5213
8109
1490
5662
4721
227846
5214
2366
5609
224188
3640
8140
4562
5060
227115
8190
265
4272
379
3570
5293
6184
227848
168
228226
2616
2765
4681
238
3307
4445
8426
7890
4736
3426
1936
7259
6277
227631
7909
7861
3283
3600
225020
3274
4929
8120
3562
554
224019
4426
224456
8315
6513
223836
1980
5004
227612
135
1047
1821
2834
3010
7222
4790
8471
6025
8391
6593
3363
1019
1811
7582
8451
2474
6046
225045
160
640
6104
4902
7767
6006
1899
3465
225021
6957
5640
4225
223918
223926
288
223943
7213
1375
3579
7093
223949
5853
225072
5991
4816
5725
321
6039
1337
4669
4680
5500
139
2705
6562
8530
594
8103
381
7542
6306
927
466
5919
223784
250
4561
2567
5740
5435
224888
4532
5632
224452
226133
8513
582
6275
6681
7470
7547
22761

641
1844
3488
8137
227962
3306
6048
8080
2512
5434
5765
224786
4553
5206
224829
4590
2469
395
567
1066
6026
4587
6225
224082
460
8083
4951
393
224848
3248
8331
7319
5799
5478
2684
227762
4433
4737
7626
227969
350
8434
7280
6258
4817
4238
6419
8194
5119
2077
159
1111
6629
6005
2386
3344
5715
3660
1112
138
3540
5412
1948
574
224027
3327
6687
227956
4792
225986
1214
4391
7031
6072
224851
228398
227958
8393
200
1645
223923
7043
2852
2565
3537
223921
7422
2784
5394
6187
2976
8381
6253
225963
225013
6533
1822
3417
8273
49
1365
227959
5663
6227
1058
224903
8253
223825
7515
6088
6204
5811
327
4342
469
1053
2944
6094
4724
5976
5644
5461
4719
2433
6098
228217
5159
8010
423
224086
3592
225387
5245
224391
7406
4520
227588
5692
227662
3243
223987
227756
2786
223998
8193
5907
223996
3302
7602
8395
1466
224414
972
1326
228210
5672
4370
6116
225411
411
5289
4544
227590
4234
6075
227781
5167
7589
383
224015
227663
224433
224771
1119
3642
616
5200
453
1548
228164
3145
5603
4874
5836
7009
1420
8259
22681

In [12]:
# Reload the categorical mapping from disk, so the cells below can use it
# without re-running the query cell above. The file stores a pickled dict,
# hence allow_pickle=True; .tolist() unwraps it back into a plain dict.
catedict = np.load('res/catedict.npy', allow_pickle=True).tolist()
# print(catedict)

In [13]:
def convert_units(unitmap, src_unit, dst_unit, src_value, f):
    """Convert ``src_value`` from ``src_unit`` to ``dst_unit`` using one itemid's rule.

    Args:
        unitmap: conversion rule for a single itemid; ``unitmap['umap']`` maps a
            unit name to its ratio.
        src_unit: unit the value is currently expressed in.
        dst_unit: unit to convert to.
        src_value: value to convert; anything ``float()`` accepts.
        f: writable file-like object that receives the log line on failure.

    Returns:
        ``float(src_value) / src_ratio * dst_ratio``, or ``None`` when a unit is
        missing from the rule or the source ratio is 0.
    """
    try:
        ratios = unitmap['umap']
        src_ratio = ratios[src_unit]
        dst_ratio = ratios[dst_unit]
    except (KeyError, TypeError):
        # Only lookup failures mean "no rule for this unit". A bare except here
        # would also swallow KeyboardInterrupt/SystemExit.
        print('converterror: ', unitmap, src_unit, dst_unit, src_value, file=f)
        return None
    if src_ratio == 0:
        # A zero source ratio cannot be divided by; treat as not convertible.
        return None
    return float(src_value) / src_ratio * dst_ratio

## Processing inputevents

1. Discard records without starttime.
2. Discard records without amount.
3. If the itemid matches manual rules, convert it according to the rule; else only keep the main unit and discard all other records not having the main unit.

In [14]:
def processing_inputevents(aid, admittime, conn, f):
    """Collect the valid inputevents of one admission as time-stamped records.

    Rows come from both inputevents_mv and inputevents_cv, restricted to
    positive amounts and to the itemids in mengcztemp_itemids_valid_input.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['ie', seconds_since_admission, [starttime, itemid, value,
        mainunit]]`` entries, with ``value`` expressed in the item's main unit.
    """
    cur = conn.cursor()
    sql = '''select starttime, itemid, amount, amountuom from mimiciii.inputevents_mv where amount>0 and hadm_id={0} and itemid in (select * from mengcztemp_itemids_valid_input)
union all
select charttime as starttime, itemid, amount, amountuom from mimiciii.inputevents_cv where amount>0 and hadm_id={0} and itemid in (select * from mengcztemp_itemids_valid_input)'''.format(aid)
    cur.execute(sql)
    inputevents = cur.fetchall()
    inputevents_wholedata = []
    for ie in inputevents:
        starttime, itemid, amount, amountuom = ie[0], ie[1], ie[2], ie[3]
        # discard values with no starttime
        if starttime is None:
            print('no starttime: ', ie, file=f)
            continue
        # discard values with no amount
        if amount is None:
            print('no amount: ', ie, file=f)
            continue
        # convert units...
        # The unit may come back as None; normalise that to '' instead of
        # crashing on None.replace, as the other handlers in this file do.
        # An empty unit is then rejected below unless it equals the main unit.
        amountuom = (amountuom or '').replace(' ', '').lower()
        unitmap = UNITSMAP['inputevents']
        mainunit = allitem_unit[map_itemid_index[itemid][0]]
        if itemid in unitmap:
            # a manual rule exists for this item
            dst_value = convert_units(unitmap[itemid], amountuom, mainunit, amount, f)
        elif amountuom == mainunit:
            dst_value = float(amount)
        else:
            # no rule and not the main unit: the record is dropped
            dst_value = None
        if dst_value is None:
            print('not convertible: ', ie, file=f)
            continue
        inputevents_wholedata.append(['ie', (starttime - admittime).total_seconds(), [starttime, itemid, dst_value, mainunit]])
    return inputevents_wholedata

# processing_inputevents(184834, datetime.datetime.now(), getConnection(), sys.stdout)

## Processing outputevents

We only need to discard records without starttime or value.

In [15]:
def processing_outputevents(aid, admittime, conn, f):
    """Collect the valid outputevents of one admission as time-stamped records.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['oe', seconds_since_admission, [charttime, itemid, value,
        'ml']]`` entries. Rows without a charttime or a value are logged and skipped.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT charttime,itemid,value FROM mimiciii.outputevents WHERE hadm_id = {0} and itemid in (select * from mengcztemp_itemids_valid_output)'.format(str(aid)))
    collected = []
    for row in cursor.fetchall():
        charttime, value = row[0], row[2]
        if charttime is None:
            print('no starttime', row, file=f)
            continue
        if value is None:
            print('no value', row, file=f)
            continue
        # no unit check is performed here: every record is labelled 'ml'
        record = list(row) + ['ml']
        offset = (charttime - admittime).total_seconds()
        collected.append(['oe', offset, record])
    return collected

# processing_outputevents(184834, datetime.datetime.now(), getConnection(), f=sys.stdout)

## Processing chartevents/labevents

1. Discard records without starttime or value/valueuom.
2. Process 4 kinds of chartevents/labevents features separately.
    1. valid numerical features(numerical features not needing parsing): only need to convert units.
    2. categorical features: only need to map strings to integers.
    3. possible numerical features(values need parsing):
        1. parse values
        2. convert units
    4. ratio features: store two numbers in the ratio separately.

In [16]:
def processing_chartevents(aid, admittime, conn, f):
    """Collect the numeric chartevents (valuenum column) of one admission.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['ce', seconds_since_admission, [charttime, itemid, value,
        mainunit]]`` entries. A value is kept when a manual rule converts it, or
        when its unit is empty or already equals the item's main unit.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT charttime,itemid,valuenum,valueuom FROM mimiciii.chartevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_chart)')
    results = []
    for row in cursor.fetchall():
        charttime, itemid, valuenum, rawunit = row[0], row[1], row[2], row[3]

        # rows without a timestamp or a number are useless
        if charttime is None:
            print('no starttime: ', row, file=f)
            continue
        if valuenum is None:
            print('no value: ', row, file=f)
            continue

        # a missing unit is handled like an empty one
        unit = ('' if rawunit is None else rawunit).replace(' ', '').lower()
        conversions = UNITSMAP['chartevents']
        mainunit = allitem_unit[map_itemid_index[itemid][0]]

        if itemid in conversions:
            converted = convert_units(conversions[itemid], unit, mainunit, valuenum, f)
        elif unit in (mainunit, ''):
            converted = float(valuenum)
        else:
            converted = None

        # the stored record always carries the main unit
        record = [charttime, itemid, converted, mainunit]
        if converted is None:
            print('not convertible: ', record, file=f)
            continue

        results.append(['ce', (charttime - admittime).total_seconds(), record])
    return results

def processing_chartevents_cate(aid, admittime, conn, f):
    """Collect the categorical chartevents of one admission, mapped to integers.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['cecate', seconds_since_admission, [charttime, itemid,
        code, valueuom]]`` entries, where ``code`` is the integer that
        ``catedict`` assigns to the row's string value.
    """
    cur = conn.cursor()
    cur.execute('SELECT charttime,itemid,value,valueuom FROM mimiciii.chartevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_chart_cate)')
    wholedata = []
    for ce in cur.fetchall():
        #check date
        if ce[0] is None:
            print('no starttime: ', ce, file=f)
            continue

        #discard none value
        if ce[2] is None:
            print('no value: ', ce, file=f)
            continue

        #tuple to list
        ce = list(ce)

        #map to num
        # Use .get so that an item or value absent from catedict is skipped.
        # The original indexed catedict directly, which raised KeyError and
        # made its own "is None" skip unreachable.
        code = catedict.get(ce[1], {}).get(ce[2])
        if code is None:
            print('not mapped: ', ce, file=f)
            continue
        ce[2] = code
        wholedata.append(['cecate',(ce[0]-admittime).total_seconds(),list(ce)])
    return wholedata
        
def processing_chartevents_num(aid, admittime, conn, f):
    """Collect chartevents whose text value has to be parsed into a number.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['cenum', seconds_since_admission, [charttime, itemid,
        value, valueuom]]`` entries. ``valueuom`` is the unit text as stored in
        the row ('' when it was missing), not the main unit.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT charttime,itemid,value,valueuom FROM mimiciii.chartevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_chart_num)')
    results = []
    for row in cursor.fetchall():
        record = list(row)

        if record[2] is None:
            print('no value: ', record, file=f)
            continue
        parsed = parseNum(record[2])

        if record[0] is None:
            print('no starttime: ', record, file=f)
            continue

        if parsed is None:
            print('not parsed: ', record, file=f)
            continue
        record[2] = parsed

        # normalise the unit text before comparing it with the main unit
        conversions = UNITSMAP['chartevents']
        if record[3] is None:
            record[3] = ''
        unit = record[3]
        for symbol in (' ', '<', '>', '='):
            unit = unit.replace(symbol, '')
        unit = unit.lower()
        mainunit = allitem_unit[map_itemid_index[record[1]][0]]

        # an empty unit or the main unit needs no conversion
        if unit != '' and unit != mainunit:
            if record[1] in conversions:
                record[2] = convert_units(conversions[record[1]], unit, mainunit, record[2], f)
            else:
                record[2] = None

        if record[2] is None:
            print('not convertible: ', record, file=f)
            continue

        results.append(['cenum', (record[0] - admittime).total_seconds(), record])
    return results

def processing_chartevents_ratio(aid, admittime, conn, f):
    """Collect chartevents whose value is a ratio written as ``'a/b'``.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list with up to two entries per row: ``['leratio_1', seconds,
        [charttime, itemid, a, valueuom]]`` for the part before the slash and
        ``['leratio_2', ...]`` for the part after it. An empty part is skipped.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT charttime,itemid,value,valueuom FROM mimiciii.chartevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_chart_ratio)')
    collected = []
    for row in cursor.fetchall():
        record = list(row)
        if record[0] is None:
            print('no starttime: ', record, file=f)
            continue
        if record[2] is None:
            print('no value: ', record, file=f)
            continue
        try:
            parts = record[2].split('/')
            first, second = parts[0], parts[1]
            for label, text in (('leratio_1', first), ('leratio_2', second)):
                if text == '':
                    continue
                record[2] = float(text)
                # copy, because record[2] is overwritten for the second part
                collected.append([label, (record[0] - admittime).total_seconds(), list(record)])
        except:
            # catch-all kept as in the original: any failure while splitting or
            # parsing is logged and the rest of the row is abandoned
            print('not parsed: ', record, file=f)
            continue
    return collected
        
# processing_chartevents_ratio(136796, datetime.datetime.now(), getConnection(), f=sys.stdout)

In [17]:
def processing_labevents(aid, admittime, conn, f):
    """Collect the numeric labevents (valuenum column) of one admission.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['le', seconds_since_admission, [charttime, itemid, value,
        mainunit]]`` entries. A value is kept when a manual rule converts it, or
        when its unit is empty or already equals the item's main unit.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT charttime,itemid,valuenum,valueuom FROM mimiciii.labevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_lab)')
    results = []
    for row in cursor.fetchall():
        charttime, itemid, valuenum, rawunit = row[0], row[1], row[2], row[3]

        if charttime is None:
            print('no starttime: ', row, file=f)
            continue
        if valuenum is None:
            print('no value: ', row, file=f)
            continue

        # normalise the unit: missing -> '', drop spaces and comparison signs
        unit = '' if rawunit is None else rawunit
        for symbol in (' ', '<', '>', '='):
            unit = unit.replace(symbol, '')
        unit = unit.lower()

        conversions = UNITSMAP['labevents']
        mainunit = allitem_unit[map_itemid_index[itemid][0]]

        if itemid in conversions:
            converted = convert_units(conversions[itemid], unit, mainunit, valuenum, f)
        elif unit in (mainunit, ''):
            converted = float(valuenum)
        else:
            converted = None

        # the stored record always carries the main unit
        record = [charttime, itemid, converted, mainunit]
        if converted is None:
            print('not convertible: ', record, file=f)
            continue

        results.append(['le', (charttime - admittime).total_seconds(), record])
    return results

def processing_labevents_cate(aid, admittime, conn, f):
    """Collect the categorical labevents of one admission, mapped to integers.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['lecate', seconds_since_admission, [charttime, itemid,
        code, valueuom]]`` entries, where ``code`` is the integer that
        ``catedict`` assigns to the row's string value.
    """
    cur = conn.cursor()
    cur.execute('SELECT charttime,itemid,value,valueuom FROM mimiciii.labevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_lab_cate)')
    wholedata = []
    for le in cur.fetchall():
        #check date
        if le[0] is None:
            print('no starttime: ', le, file=f)
            continue

        #discard none value
        if le[2] is None:
            print('no value: ', le, file=f)
            continue

        #tuple to list
        le = list(le)

        #map to num
        # Use .get so that an item or value absent from catedict is skipped.
        # The original indexed catedict directly, which raised KeyError and
        # made its own "is None" skip unreachable.
        code = catedict.get(le[1], {}).get(le[2])
        if code is None:
            print('not mapped: ', le, file=f)
            continue
        le[2] = code
        wholedata.append(['lecate',(le[0]-admittime).total_seconds(),list(le)])
    return wholedata

def processing_labevents_num(aid, admittime, conn, f):
    """Collect labevents whose text value has to be parsed into a number.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['lenum', seconds_since_admission, [charttime, itemid,
        value, valueuom]]`` entries. ``valueuom`` is the unit text as stored in
        the row ('' when it was missing), not the main unit.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT charttime,itemid,value,valueuom FROM mimiciii.labevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_lab_num)')
    results = []
    for row in cursor.fetchall():
        record = list(row)

        if record[0] is None:
            print('no starttime', record, file=f)
            continue
        if record[2] is None:
            print('no value: ', record, file=f)
            continue

        # turn the text into a number, or give up on the row
        parsed = parseNum(record[2])
        if parsed is None:
            print('not parsed: ', record, file=f)
            continue
        record[2] = parsed

        # normalise the unit text before comparing it with the main unit
        conversions = UNITSMAP['labevents']
        if record[3] is None:
            record[3] = ''
        unit = record[3]
        for symbol in (' ', '<', '>', '='):
            unit = unit.replace(symbol, '')
        unit = unit.lower()
        mainunit = allitem_unit[map_itemid_index[record[1]][0]]

        # an empty unit or the main unit needs no conversion
        if unit != '' and unit != mainunit:
            if record[1] in conversions:
                record[2] = convert_units(conversions[record[1]], unit, mainunit, record[2], f)
            else:
                record[2] = None

        if record[2] is None:
            print('not convertible: ', record, file=f)
            continue

        results.append(['lenum', (record[0] - admittime).total_seconds(), record])
    return results

def processing_labevents_ratio(aid, admittime, conn, f):
    """Collect labevents whose value is a ratio written as ``'a/b'``.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list with up to two entries per row: ``['leratio_1', seconds,
        [charttime, itemid, a, valueuom]]`` for the part before the slash and
        ``['leratio_2', ...]`` for the part after it. An empty part is skipped.
    """
    cur = conn.cursor()
    cur.execute('SELECT charttime,itemid,value,valueuom FROM mimiciii.labevents WHERE hadm_id = '+str(aid)+' and itemid in (select * from mengcztemp_itemids_valid_lab_ratio)')
    wholedata = []
    for ce in cur.fetchall():
        ce = list(ce)
        if ce[0] is None:
            print('no starttime: ', ce, file=f)
            # Skip the row, as processing_chartevents_ratio does. Without this
            # the row fell through to the parser, failed on `None - admittime`
            # and produced a second, misleading 'not parsed' log line.
            continue
        if ce[2] is None:
            print('no value: ', ce, file=f)
            continue
        try:
            fs = ce[2].split('/')
            f1, f2 = fs[0], fs[1]
            if f1 != '':
                ce[2] = float(f1)
                # copy, because ce[2] is overwritten for the second part
                wholedata.append(['leratio_1', (ce[0] - admittime).total_seconds(), list(ce)])
            if f2 != '':
                ce[2] = float(f2)
                wholedata.append(['leratio_2', (ce[0] - admittime).total_seconds(), list(ce)])
        except Exception:
            # Any parsing failure drops the rest of the row, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('not parsed: ', ce, file=f)
            continue
    return wholedata

# processing_labevents_ratio(145834, datetime.datetime.now(), getConnection(), sys.stdout)

## Process microbiologyevents

1. Discard records without starttime.
2. Parse the dose value in dilution_text. Only values consisting of '<'/'>'/'=' and numbers can be parsed.

In [18]:
def processing_microbiologyevents(aid, admittime, conn, f):
    """Collect the microbiology dilution values of one admission.

    One query is issued per (spec_itemid, org_itemid, ab_itemid) combination in
    ``valid_microbio``; a ``None`` component is matched with ``is null``.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['me', seconds_since_admission, [charttime, key, value,
        'uom']]`` entries, where ``key`` is the id combination as a tuple of
        ints/None and ``value`` is dilution_text parsed as a float after
        removing '<', '>' and '='.
    """
    columns = ('spec_itemid', 'org_itemid', 'ab_itemid')
    results = []
    for combo in valid_microbio:
        cursor = conn.cursor()
        texts = list(map(str, combo))

        # build the WHERE clause for this id combination
        query = "SELECT charttime,(spec_itemid,org_itemid,ab_itemid),dilution_text,'uom' FROM mimiciii.microbiologyevents WHERE hadm_id = " + str(aid)
        for position, column in enumerate(columns):
            text = texts[position]
            if text == 'None':
                query += ' and ' + column + ' is null'
            else:
                query += ' and ' + column + ' = ' + text
        cursor.execute(query)
        rows = cursor.fetchall()

        # the record key replaces the composite column returned by the query:
        # every component becomes an int, or None when it is not an integer
        key = []
        for text in texts:
            try:
                key.append(int(text))
            except:
                key.append(None)
        key = tuple(key)

        for row in rows:
            record = list(row)
            record[1] = key

            if record[0] is None:
                print('no starttime: ', record, file=f)
                continue
            if record[2] is None:
                print('no value: ', record, file=f)
                continue

            # strip comparison signs, then require a plain number
            dose = record[2]
            for symbol in ('<', '>', '='):
                dose = dose.replace(symbol, '')
            if dose == '':
                print('not parsed: ', record, file=f)
                continue
            try:
                value = float(dose)
            except:
                print('not parsed: ', record, file=f)
                continue

            record[2] = value
            results.append(['me', (record[0] - admittime).total_seconds(), record])
    return results
            
# processing_microbiologyevents(149416, datetime.datetime.now(), getConnection(), sys.stdout)

## Process prescriptionevents

1. Discard records without starttime.
2. Parse values. Values containing only ','/'<'/'>'/'='/' ' and numbers can be parsed.
3. Discard all none values.
4. Convert units.

In [19]:
def processing_prescriptionevents(aid, admittime, conn, f):
    """Collect the prescription doses of one admission as time-stamped records.

    Args:
        aid: hadm_id of the admission.
        admittime: admission time; event times are stored relative to it.
        conn: open database connection.
        f: writable file-like object that receives one log line per discarded row.

    Returns:
        A list of ``['pe', seconds_since_admission, [startdate, drug code,
        dose, unit]]`` entries. ``dose`` is a float (a range ``'a-b'`` becomes
        its midpoint) and ``unit`` is the unit text as stored in the row ('' when
        it was missing). Only doses whose unit is empty or equals the drug's
        main unit are kept.
    """
    def parse_dose(text):
        # plain number first, then a two-sided range 'a-b' -> midpoint
        try:
            return float(text)
        except:
            bounds = text.split('-')
            if len(bounds) != 2:
                return None
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2.0
            except:
                return None

    results = []
    cursor = conn.cursor()
    cursor.execute('SELECT startdate,formulary_drug_cd,dose_val_rx,dose_unit_rx FROM mimiciii.prescriptions WHERE hadm_id = '+str(aid)+' and formulary_drug_cd in (select * from mengcztemp_itemids_valid_prescript)')
    for row in cursor.fetchall():
        if row[0] is None:
            print('no starttime: ', row, file=f)
            continue

        record = list(row)
        if record[2] is None:
            print('no value: ', record, file=f)
            continue

        # remove thousands separators, comparison signs and spaces
        cleaned = record[2]
        for symbol in (',', '<', '>', '=', ' '):
            cleaned = cleaned.replace(symbol, '')
        dose = parse_dose(cleaned)
        if dose is None:
            print('not parsed: ', record, file=f)
            continue
        record[2] = dose

        # only the main unit (or no unit at all) is accepted; no conversion
        if record[3] is None:
            record[3] = ''
        unit = record[3].replace(' ','').lower()
        mainunit = allitem_unit[map_itemid_index[record[1]][0]]
        if unit != mainunit and unit != '':
            record[2] = None
            print('not convertible: ', record, file=f)
            continue
        record[2] = float(dose)

        results.append(['pe', (record[0] - admittime).total_seconds(), record])
    return results

# Smoke test against the live database for a single admission. The current
# time stands in for the real admittime here, so the second-offsets shown in
# the output below are not meaningful; log lines go to stdout.
processing_prescriptionevents(185777, datetime.datetime.now(), getConnection(), sys.stdout)

[['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'GLAR100I', 6.0, 'UNIT']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'BENZ100', 100.0, 'mg']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'INSULIN', 0.0, 'UNIT']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'GLAR100I', 6.0, 'UNIT']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'PRIM263', 52.6, 'mg']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'VANC1F', 1000.0, 'mg']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'MAGS1I', 1.0, 'gm']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'MICROK10', 60.0, 'mEq']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'INSULIN', 0.0, 'UNIT']],
 ['pe',
  5378393964.653366,
  [datetime.datetime(2191, 3, 16, 0, 0), 'CLON5', 0.5, 'mg']],
 ['pe',
  5378393964.653366,
  [datetime.date

## Processing

In this step, we generate the matrix of time series for each admission.

1. Discard admissions without admittime.
2. Collect records from inputevents, outputevents, chartevents, labevents, microbiologyevents and prescriptionevents.
3. For possible conflicts (different values of the same feature occurring at the same time):
    1. For numerical values:
        1. For inputevents/outputevents/prescriptions, we use the sum of all conflicting values.
        2. For labevents/chartevents/microbiologyevents, we use the mean of all conflicting values.
    2. For categorical values: we use the value that appears first and record the conflict in the log.
    3. For ratio values: we separate the ratio into two numbers and use the mean of each of them.

In [20]:
# integrate the time series array for a single patient
def processing(aid, f):
    """Build the dense time-series matrix for one admission.

    Records are collected from the ``processing_*`` helpers (inputevents,
    outputevents, chartevents, labevents, microbiologyevents, prescriptions),
    sorted by their time offset (element 1 of every record) and merged into
    one row per distinct time offset.

    Values that land in the same cell (same time offset, same feature column)
    are combined according to the record tag (element 0 of the record):

    * ``'ie'``, ``'oe'``, ``'pe'``: summed.
    * ``'le'``, ``'ce'``, ``'me'``, ``'lenum'``, ``'cenum'`` and the
      ``'ceratio_*'`` / ``'leratio_*'`` tags: averaged.
    * ``'cecate'``, ``'lecate'``: the first value is kept, later ones are
      reported in the log as ``DUPLICATED``.

    Relies on the module-level names ``allids`` and ``map_itemid_index``
    (feature id -> list of column indices).

    Args:
        aid: hadm_id of the admission.
        f: writable text stream that receives log messages.

    Returns:
        ``None`` when the admission does not exist or has no admittime.
        Otherwise a list of rows, each of length ``len(allids) + 2``; the
        second-to-last entry holds the time offset and the last entry ``aid``.
    """
    conn = getConnection()
    try:
        # get admittime
        cur = conn.cursor()
        cur.execute('select admittime from mimiciii.admissions where hadm_id={0}'.format(aid))
        admission = cur.fetchone()
        if admission is None:
            return None
        admittime = admission[0]
        if admittime is None:
            return None

        # Collect the records of every source table (the call order is preserved).
        wholedata = [
            # preprocess inputevents
            processing_inputevents(aid, admittime, conn, f),
            # preprocess outputevents
            processing_outputevents(aid, admittime, conn, f),
            # preprocess chartevents
            processing_chartevents(aid, admittime, conn, f),
            processing_chartevents_cate(aid, admittime, conn, f),
            processing_chartevents_num(aid, admittime, conn, f),
            processing_chartevents_ratio(aid, admittime, conn, f),
            # preprocess labevents
            processing_labevents(aid, admittime, conn, f),
            processing_labevents_cate(aid, admittime, conn, f),
            processing_labevents_num(aid, admittime, conn, f),
            processing_labevents_ratio(aid, admittime, conn, f),
            # preprocess microbiologyevents
            processing_microbiologyevents(aid, admittime, conn, f),
            # preprocess prescriptionevents
            processing_prescriptionevents(aid, admittime, conn, f),
        ]
    finally:
        # Release the connection on every path, including the early returns
        # above and any exception raised by a helper.
        conn.close()

    # here is the sparse matrix, order by timestamp
    wholedata = sorted(itertools.chain(*wholedata), key=itemgetter(1))

    # transform sparse matrix to matrix
    D = len(allids) + 2

    # map time to row (rows are numbered in order of first appearance)
    map_time_index = {}
    for wd in wholedata:
        if wd[1] not in map_time_index:
            map_time_index[wd[1]] = len(map_time_index)

    num_rows = len(map_time_index)
    patient = [[None] * D for _ in range(num_rows)]
    # number of values accumulated in each averaged cell (0 = nothing to divide)
    numtodivide = [[0] * (D - 2) for _ in range(num_rows)]

    for wd in wholedata:
        tag = wd[0]
        offset = wd[1]
        row = patient[map_time_index[offset]]
        counts = numtodivide[map_time_index[offset]]

        assert row[D - 2] is None or row[D - 2] == offset
        row[D - 2] = offset
        row[D - 1] = aid

        # summed features
        if tag in ('ie', 'oe', 'pe'):
            col = map_itemid_index[wd[2][1]][0]
            if row[col] is None:
                row[col] = wd[2][2]
            else:
                row[col] += wd[2][2]

        # averaged features: accumulate the sum here, divide after the loop
        if tag in ('le', 'ce', 'me', 'lenum', 'cenum'):
            if wd[2][2] is None:
                print('None value: ', wd, file=f)
            col = map_itemid_index[wd[2][1]][0]
            if row[col] is None:
                row[col] = wd[2][2]
                counts[col] = 1
            else:
                row[col] += wd[2][2]
                counts[col] += 1

        # ratio features: the 1-based number after '_' in the tag selects
        # which of the feature's columns receives the value
        if tag.startswith('ceratio') or tag.startswith('leratio'):
            ot = int(tag.split('_')[1]) - 1
            if wd[2][2] is None:
                print(wd, file=f)
            col = map_itemid_index[wd[2][1]][ot]
            if row[col] is None:
                row[col] = wd[2][2]
                counts[col] = 1
            else:
                row[col] += wd[2][2]
                counts[col] += 1

        # categorical features: keep the first value, log the rest
        if tag in ('cecate', 'lecate'):
            col = map_itemid_index[wd[2][1]][0]
            if row[col] is None:
                row[col] = wd[2][2]
            else:
                print('DUPLICATED :', wd, file=f)

    # turn the accumulated sums into means
    for i in range(num_rows):
        for j in range(D - 2):
            if numtodivide[i][j] == 0:
                continue
            try:
                patient[i][j] /= numtodivide[i][j]
            except Exception:
                # e.g. a None or non-numeric value stored in an averaged cell
                print('div error: ', i, j, file=f)
    return patient

## Generate information of patient

Here we collect information of one patient, containing its admission_type, admission_location, insurance, language, religion, marital_status and ethnicity.

Since all of them are categorical features, we map the strings of each feature to integers and store the mapping.

In [21]:
# generate general information of patient

# For every categorical column of mimiciii.admissions, build a dictionary that
# maps each distinct value to an integer code (its position in the query result).
conn = getConnection()
cate = ['admission_type','admission_location','insurance','language','religion','marital_status','ethnicity']
mapping = {}
for c in cate:
    cur = conn.cursor()
    cur.execute('select distinct '+c+' from mimiciii.admissions')
    types = cur.fetchall()
    catemapping = {row[0]: code for code, row in enumerate(types)}
    mapping[c] = catemapping

# add map for services (distinct curr_service values of mimiciii.services)
cur = conn.cursor()
cur.execute('select distinct ' + 'curr_service' + ' from mimiciii.services')
types = cur.fetchall()
catemapping = dict((row[0], code) for code, row in enumerate(types))
mapping['curr_service'] = catemapping

# persist the mappings and display them as the cell output
np.save('res/adm_catemappings.npy', mapping)
mapping

{'admission_type': {'ELECTIVE': 0, 'URGENT': 1, 'EMERGENCY': 2, 'NEWBORN': 3},
 'admission_location': {'EMERGENCY ROOM ADMIT': 0,
  'TRANSFER FROM HOSP/EXTRAM': 1,
  'TRANSFER FROM OTHER HEALT': 2,
  'CLINIC REFERRAL/PREMATURE': 3,
  '** INFO NOT AVAILABLE **': 4,
  'TRANSFER FROM SKILLED NUR': 5,
  'TRSF WITHIN THIS FACILITY': 6,
  'HMO REFERRAL/SICK': 7,
  'PHYS REFERRAL/NORMAL DELI': 8},
 'insurance': {'Self Pay': 0,
  'Medicare': 1,
  'Medicaid': 2,
  'Private': 3,
  'Government': 4},
 'language': {None: 0,
  'LAOT': 1,
  'CANT': 2,
  '* FU': 3,
  '*ARM': 4,
  'ENGL': 5,
  '*RUS': 6,
  '*TOY': 7,
  'TAGA': 8,
  '*NEP': 9,
  'URDU': 10,
  '*DUT': 11,
  '*CHI': 12,
  'CAMB': 13,
  'POLI': 14,
  '* BE': 15,
  'PORT': 16,
  '*GUJ': 17,
  '*LEB': 18,
  '*BUL': 19,
  '*ARA': 20,
  'SERB': 21,
  'HAIT': 22,
  'SPAN': 23,
  '*PUN': 24,
  'MAND': 25,
  'GREE': 26,
  'CAPE': 27,
  'PTUN': 28,
  'RUSS': 29,
  'SOMA': 30,
  '*AMH': 31,
  '*DEA': 32,
  '*BEN': 33,
  'TURK': 34,
  '*BOS': 35,
  

## Generate non-temporal features

Here we collect all non-temporal features related only to the admission:
1. admission id
2. subject id (for finding the patient of one admission)
3. age (at admittime, in days)
4. length of stay (in minutes)
5. in-hospital mortality label
6. labelGuarantee label
7. 1-day mortality (from admittime)
8. 2-day mortality (from admittime)
9. 3-day mortality (from admittime)
10. 30-day mortality (from dischtime)
11. 1-year mortality (from dischtime)
12. curr_service
13. admission_type
14. admission_location
15. insurance
16. language
17. religion
18. marital_status
19. ethnicity

**Mortality label here is not used, please refer to 8_collect_time_labels.ipynb to get correct mortality labels. We leave them here only for compatibility.**

In [22]:
def ageLosMortality(aid, f):
    """Collect the non-temporal feature vector of one admission.

    Relies on the module-level ``mapping`` (categorical value -> integer code)
    and ``cate`` (names of the categorical admission columns).

    Args:
        aid: hadm_id of the admission.
        f: log stream; accepted for signature compatibility, not used here.

    Returns:
        A list with, in order: ``aid``, subject_id, age at admittime in days,
        length of stay in whole minutes, in-hospital mortality flag,
        labelGuarantee flag, death within 24h / 48h / 72h of admittime,
        death within 30 days / 365 days of dischtime, the code of
        ``curr_service``, and the codes of the seven columns named in ``cate``.

    Raises:
        AssertionError: if the admission or its patient row does not exist.
    """
    conn = getConnection()
    try:
        cur = conn.cursor()
        cur.execute('SELECT hadm_id,subject_id,admittime,dischtime,deathtime,admission_type,admission_location,insurance,language,religion,marital_status,ethnicity FROM mimiciii.ADMISSIONS WHERE hadm_id='+str(aid))
        admission = cur.fetchone()

        assert admission is not None

        subject_id = admission[1]
        admittime = admission[2]
        dischtime = admission[3]
        deathtime = admission[4]

        cur = conn.cursor()
        cur.execute('SELECT dob, dod FROM mimiciii.PATIENTS WHERE subject_id='+str(subject_id))
        patient = cur.fetchone()

        assert patient is not None
        birthdate = patient[0]
        final_deathdate = patient[1]

        mortal = 0
        labelGuarantee = 0
        die24 = 0
        die24_48 = 0
        die48_72 = 0
        die30days = 0
        die1year = 0
        if deathtime is not None:
            mortal = 1
            if deathtime != dischtime:
                labelGuarantee = 1
            # The three flags are cumulative: a death within 24h also sets
            # the 48h and 72h flags.
            secnum = (deathtime - admittime).total_seconds()
            if secnum <= 24 * 60 * 60:
                die24 = 1
            if secnum <= 48 * 60 * 60:
                die24_48 = 1
            if secnum <= 72 * 60 * 60:
                die48_72 = 1
        if dischtime is not None and final_deathdate is not None:
            dischsecnum = (final_deathdate - dischtime).total_seconds()
            if dischsecnum <= 30 * 24 * 60 * 60:
                die30days = 1
            if dischsecnum <= 365 * 24 * 60 * 60:
                die1year = 1

        # First service row returned for the admission; 'NB' when there is none.
        cur.execute('select curr_service from mimiciii.services where hadm_id='+str(aid))
        curr_service = cur.fetchone()
        if curr_service:
            curr_service = curr_service[0]
        else:
            curr_service = 'NB'

        data = [
            aid,
            subject_id,
            (admittime - birthdate).total_seconds()/(3600*24),
            (dischtime-admittime).total_seconds()//60.,
            mortal,
            labelGuarantee,
            die24,
            die24_48,
            die48_72,
            die30days,
            die1year,
            mapping['curr_service'][curr_service],
        ]
        # Columns 5..11 of the admissions query are the categorical columns
        # listed in `cate`, in the same order.
        for i in range(5, 12):
            data.append(mapping[cate[i-5]][admission[i]])
        return data
    finally:
        # Close the connection even when an assertion or lookup above fails.
        conn.close()

# ageLosMortality(128652, sys.stdout)

## Generate ICD9 codes

Here we convert icd9 codes to category numbers.

In [23]:
# Inclusive upper bound of the 3-digit code range of each numeric ICD-9 chapter;
# the position in the tuple is the category label (0-18). The chapters start at
# code 001 and are contiguous (e.g. label 10 covers 630-679).
_ICD9_CHAPTER_UPPER_BOUNDS = (139, 239, 279, 289, 319, 389, 459, 519, 579, 629,
                              679, 709, 739, 759, 779, 789, 796, 799, 999)


def _icd9_numeric_category(num):
    """Return the category label (0-18) for the numeric 3-digit prefix of an ICD-9 code."""
    if num >= 1:
        for label, upper in enumerate(_ICD9_CHAPTER_UPPER_BOUNDS):
            if num <= upper:
                return label
    raise ValueError('ICD9 code prefix out of range: {0}'.format(num))


def ICD9(aid, f):
    """Return the ICD-9 diagnoses of one admission together with category labels.

    Args:
        aid: hadm_id of the admission.
        f: log stream; accepted for signature compatibility, not used here.

    Returns:
        A list of ``[aid, icd9_code, dotted_code, label]`` entries ordered by
        seq_num. ``label`` is 19 for 'V' codes, 20 for 'E' codes and 0-18 for
        numeric codes according to the range their first three digits fall in.
        ``dotted_code`` has a '.' inserted after the third character, except
        for 'E' codes, which are kept unchanged. Rows whose code is NULL or
        empty are skipped.

    Raises:
        ValueError: if a numeric code's prefix is not a number or lies outside 1-999.
    """
    conn = getConnection()
    try:
        cur = conn.cursor()
        cur.execute('SELECT icd9_code FROM mimiciii.DIAGNOSES_ICD WHERE hadm_id='+str(aid)+' ORDER BY seq_num')
        icd9s = cur.fetchall()
    finally:
        conn.close()

    list_icd9 = []
    for icd9 in icd9s:
        icd = icd9[0]
        if not icd:
            continue
        if icd[0] == 'V':
            label_name = 19
            numstr = icd[0:3] + '.' + icd[3:]
        elif icd[0] == 'E':
            label_name = 20
            numstr = icd
        else:
            numstr = icd[0:3] + '.' + icd[3:]
            # Every numeric code gets its own label; a code never inherits the
            # label of the previous row.
            label_name = _icd9_numeric_category(float(icd[0:3]))
        list_icd9.append([aid, icd, numstr, label_name])
    return list_icd9

# ICD9(185777, sys.stdout)

In [24]:
# Start from a clean output directory: remove any existing 'admdata/' tree
# (this deletes everything below it), then recreate it together with its
# 'log/' subdirectory.
if os.path.exists('admdata/'):
    shutil.rmtree('admdata/')
os.makedirs('admdata/log/')

## Save one file for each admission

For each admission, we save a separate file for it, which contains:
1. 'timeseries': matrix of time series in form of sparse matrix
2. 'general': non-temporal features
3. 'icd9': list of icd9 category codes

In [25]:
def process_patient(aid):
    """Process one admission and save its data under 'admdata/'.

    Writes a log to 'admdata/log/adm-<aid>.log'. When the admission yields a
    time-series matrix, a dictionary with the keys 'timeseries' (sparsified
    matrix), 'general' (non-temporal features) and 'icd9' (ICD-9 category
    list) is saved to 'admdata/adm-<aid>.npy'. Admissions for which
    ``processing`` returns ``None`` or an empty matrix are skipped without
    writing a data file.

    Any exception is caught: its traceback is written to
    'admdata/log/admerror-<aid>.log' and to stdout, and a 'failed at' message
    is printed.

    Args:
        aid: hadm_id of the admission (formatted with '%.6d' in file names).
    """
    with open('admdata/log/adm-{0}.log'.format(str('%.6d' % aid)), 'w') as f:
        try:
            proc = processing(aid, f)
            # processing() returns None for admissions that are missing or
            # have no admittime; that is an expected skip, not a failure.
            if proc is None or len(proc) == 0:
                return
            res = {
                'timeseries': sparsify(proc),
                'general': ageLosMortality(aid, f),
                'icd9': ICD9(aid, f)
            }
            np.save('admdata/adm-' + str('%.6d' % aid), res)
        except Exception:
            with open('admdata/log/admerror-{0}.log'.format(str('%.6d' % aid)), 'w') as ferr:
                traceback.print_exc(file=ferr)
            traceback.print_exc(file=sys.stdout)
            print('failed at {0}!'.format(aid))
    
# Run the pipeline once for a single admission (hadm_id 136796) in the notebook process.
process_patient(136796)

In [26]:
def process_patient_list(aid_list):
    """Run ``process_patient`` on every admission id in ``aid_list``, showing a tqdm progress bar."""
    progress = tqdm(aid_list)
    for admission_id in progress:
        process_patient(admission_id)

In [None]:
# Split the admission ids into one chunk per worker and process the chunks in parallel.
p = Pool(num_workers)
async_results = [
    p.apply_async(process_patient_list, args=(aid_list,))
    for aid_list in np.array_split(admission_ids, num_workers)
]
p.close()
p.join()
# apply_async() stores an exception raised in a worker inside its AsyncResult;
# get() re-raises it here so that a failed chunk cannot go unnoticed.
for async_result in async_results:
    async_result.get()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3686.0), HTML(value='')))

In [None]:
# refresh the mortality labels; now we have 1day|2days|3days|in-hospital|30days|1year
def add_mortality_labels(aid):
    """Recompute the 'general' entry of an already saved admission file.

    Loads 'admdata/adm-<aid>.npy', replaces its 'general' entry with the
    result of ``ageLosMortality`` and saves the file again. If the file
    cannot be loaded, the traceback is written to
    'admdata/log/adm-addml<aid>.log' and nothing else is done.
    """
    padded_id = str('%.6d' % aid)
    log_path = 'admdata/log/adm-addml{0}.log'.format(padded_id)
    record_path = 'admdata/adm-' + padded_id + '.npy'
    with open(log_path, 'w') as f:
        try:
            record = np.load(record_path, allow_pickle=True).tolist()
        except Exception:
            traceback.print_exc(file=f)
            return
        record['general'] = ageLosMortality(aid, f)
        np.save(record_path, record)

    
def add_mortality_labels_list(aid_list):
    """Run ``add_mortality_labels`` on every admission id in ``aid_list``, showing a tqdm progress bar."""
    progress = tqdm(aid_list)
    for admission_id in progress:
        add_mortality_labels(admission_id)

        
# Split the admission ids into one chunk per worker and refresh the labels in parallel.
p = Pool(num_workers)
async_results = [
    # args must be a 1-tuple: passing the bare array would unpack every
    # admission id into its own positional argument.
    p.apply_async(add_mortality_labels_list, args=(aid_list,))
    for aid_list in np.array_split(admission_ids, num_workers)
]
p.close()
p.join()
# apply_async() stores an exception raised in a worker inside its AsyncResult;
# get() re-raises it here so that a failed chunk cannot go unnoticed.
for async_result in async_results:
    async_result.get()