# Filter Itemid Input

This script is used for filtering itemids from TABLE INPUTEVENTS.

1. For each itemid, we count how many records use each unit of measurement and choose the most frequent (majority) unit as the target of unit conversion.
2. In this step we do not apply any filtering to the data.

## Output

1. itemid of observations for inputevents.
2. unit of measurement for each itemid.

In [1]:
from __future__ import print_function

import psycopg2
import datetime
import sys
from operator import itemgetter, attrgetter, methodcaller
import numpy as np
import itertools
import os.path
import matplotlib.pyplot as plt
import math
from multiprocessing import Pool

from utils import getConnection

%matplotlib inline

In [2]:
# Size of the multiprocessing Pool that runs the per-itemid unit statistics.
num_workers = 4

In [3]:
# Open the database connection used by the query cells below.
try:
    conn = getConnection()
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
    # propagate, and show the cause instead of hiding it. The notebook keeps
    # going so the cached .npy files below can still be loaded; `conn` is set
    # to None so it is always defined and the failed state is explicit.
    conn = None
    print('Fail to connect!')
    print(repr(exc))
else:
    print('Connected to Postgre Database!')

# Both files store a pickled dict inside a 0-d object array, hence
# allow_pickle=True and .tolist() to get the dict back.
# NOTE(review): unpickling executes arbitrary code -- only load files that
# were produced by this project.
_adm = np.load('res/admission_ids.npy', allow_pickle=True).tolist()
admission_ids = _adm['admission_ids']
admission_ids_txt = _adm['admission_ids_txt']

# Candidate itemids, one entry per source table.
db = np.load('res/itemids.npy', allow_pickle=True).tolist()
input_itemid = db['input']
output_itemid = db['output']
chart_itemid = db['chart']
lab_itemid = db['lab']
microbio_itemid = db['microbio']
prescript_itemid = db['prescript']

Connected to Postgre Database!


In [4]:
# Start with empty lists for the retained itemids and their majority units;
# both names are reassigned further down once the statistics are available.
valid_input, valid_input_unit = [], []

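Compare the speed of two ways of restricting the query to our admissions. To speed the query up, we manually created a table `admission_ids` that stores all admission ids, so the filter can be a subquery on that table instead of a long list of ids inlined into the SQL text.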

In [5]:
# Time ten runs of the same unit-count query with two different admission
# filters: first a subquery on the admission_ids table, then the
# admission_ids_txt text inlined into the statement.
timing_sql = (
    "select coalesce(amountuom, ''), count(*) from mimiciii.inputevents_cv"
    " where itemid=30044 and hadm_id in ({0}) group by amountuom"
)
for hadm_filter in ('select * from admission_ids', admission_ids_txt):
    start = datetime.datetime.now()
    for t in range(10):
        cur = conn.cursor()
        cur.execute(timing_sql.format(hadm_filter))
        res = cur.fetchall()
    end = datetime.datetime.now()
    # Elapsed wall-clock time for the ten runs, then the last result set.
    print(end - start)
    print(res)

0:00:00.133987
[('', 315), ('mg', 160)]
0:00:03.467349
[('', 315), ('mg', 160)]


In [6]:
# inputevents
def stat_inputevents_unit_task(itemid, admission_ids_txt):
    """Count the INPUTEVENTS records of one itemid per unit of measurement.

    Rows of ``mimiciii.inputevents_cv`` and ``mimiciii.inputevents_mv`` whose
    ``hadm_id`` is listed in the ``admission_ids`` table are grouped by
    ``amountuom``. Records with a NULL or empty unit are excluded.

    Args:
        itemid: Item identifier to analyse. It is interpolated into the SQL
            text, so it must come from a trusted source.
        admission_ids_txt: Unused; kept so existing callers keep working. The
            admission filter is taken from the ``admission_ids`` table.

    Returns:
        ``(itemid, percentage, outputunits)``, where ``outputunits`` is the
        list of ``(unit, count)`` rows ordered by descending count and
        ``percentage`` is the share (0-100) of the most frequent unit.
        ``(itemid, None, None)`` when no record with a unit exists.
    """
    query = (
        "select amountuom, sum(count::int) from ("
        " select coalesce(amountuom, '') as amountuom, count(*)"
        " from mimiciii.inputevents_cv"
        " where itemid = {0} and hadm_id in (select * from admission_ids)"
        " group by amountuom"
        " union all"
        " select coalesce(amountuom, '') as amountuom, count(*)"
        " from mimiciii.inputevents_mv"
        " where itemid = {0} and hadm_id in (select * from admission_ids)"
        " group by amountuom"
        " ) as t where amountuom<>'' group by amountuom"
    ).format(itemid)

    tconn = getConnection()
    try:
        tcur = tconn.cursor()
        tcur.execute(query)
        outputunits = tcur.fetchall()
    finally:
        # Always release the connection: this runs once per itemid inside the
        # pool workers, so a leak on the "no data" path adds up quickly.
        tconn.close()

    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the ordering of units with equal counts unchanged.
    outputunits = sorted(outputunits, key=itemgetter(1))[::-1]

    total = sum(count for _unit, count in outputunits)
    if total == 0:
        return (itemid, None, None)

    percentage = float(outputunits[0][1]) / total * 100.0
    return (itemid, percentage, outputunits)

# Fan the per-itemid unit statistics out over a pool of worker processes.
p = Pool(num_workers)
valid_vupairs = []
for i in input_itemid:
    valid_vupairs.append(
        p.apply_async(stat_inputevents_unit_task, args=(i, admission_ids_txt))
    )
# No more tasks will be submitted; wait until every worker has finished.
p.close()
p.join()
# Swap each async handle for the (itemid, percentage, outputunits) it produced.
valid_vupairs = [handle.get() for handle in valid_vupairs]

## Iterate through each itemid
For each itemid, we count the number of observations for each unit of measurement.

For example,
IN 225883 : 98.24 : 3 : [('dose', 16477L), ('mg', 251L), ('grams', 44L)]
This means that for itemid 225883, there are:
1. 16477 records using dose as its unit of measurement.
2. 251 records using mg as its unit of measurement.
3. 44 records using grams as its unit of measurement.

`dose` accounts for 98.24% of all the observations for this itemid, so we can say that `dose` is the majority unit.
1. We will keep this itemid because 98% is high. It is relatively safe to discard the observations that have a different unit of measurement, i.e. if we discard `mg` and `grams`, we lose 251+44 records, which is few compared to the 16477 records we keep.
2. We will record `dose` as the main unit of measurement for this itemid.

In [7]:
# Drop the itemids for which no percentage could be computed.
valid_vupairs = [stat for stat in valid_vupairs if stat[1] is not None]
# Order by the share of the majority unit, lowest share first.
valid_vupairs_des = sorted(valid_vupairs, key=itemgetter(1))
report_line = "IN {0}\t{1:.2f}\t{2} : {3}"
for itemid, percentage, outputunits in valid_vupairs_des:
    print(report_line.format(itemid, percentage, len(outputunits), outputunits))

# Persist the filtered (not the sorted) statistics for the following steps.
np.save('res/filtered_input_raw.npy', {'raw': valid_vupairs})

IN 30174	52.13	2 : [('ml', 171), ('mg', 157)]
IN 30384	66.47	2 : [('ml', 1982), ('mg', 1000)]
IN 225845	69.42	3 : [('dose', 1024), ('mg', 450), ('grams', 1)]
IN 30046	72.65	2 : [('ml', 85), ('mg', 32)]
IN 30069	73.28	2 : [('mg', 1901), ('ml', 693)]
IN 225910	75.12	2 : [('dose', 20669), ('mg', 6847)]
IN 221744	79.95	2 : [('mcg', 69025), ('mg', 17315)]
IN 30089	82.37	2 : [('tsp', 121501), ('gm', 26008)]
IN 30151	84.20	2 : [('mg', 1183), ('ml', 222)]
IN 30135	84.43	2 : [('mg', 1166), ('ml', 215)]
IN 30298	86.73	4 : [('mEQ', 8644), ('U', 800), ('mEq', 502), ('ml', 21)]
IN 225914	87.16	4 : [('mcg', 95), ('dose', 8), ('mg', 4), ('pg', 2)]
IN 225913	88.26	3 : [('dose', 880), ('mg', 116), ('grams', 1)]
IN 30022	89.86	2 : [('mg', 647), ('ml', 73)]
IN 228003	90.41	2 : [('dose', 66), ('mg', 7)]
IN 30148	90.65	2 : [('mg', 291), ('ml', 30)]
IN 30177	92.11	2 : [('mg', 3246), ('ml', 278)]
IN 30114	93.43	2 : [('mg', 25355), ('ml', 1783)]
IN 225866	94.60	2 : [('dose', 526), ('mg', 30)]
IN 30178	94.65	2

IN 43799	100.00	1 : [('ml', 8)]
IN 44276	100.00	1 : [('ml', 5)]
IN 42938	100.00	1 : [('ml', 1)]
IN 46150	100.00	1 : [('ml', 8)]
IN 30315	100.00	1 : [('ml', 239)]
IN 44415	100.00	1 : [('ml', 4)]
IN 40646	100.00	1 : [('ml', 73)]
IN 46797	100.00	1 : [('ml', 7)]
IN 46039	100.00	1 : [('ml', 3)]
IN 43540	100.00	1 : [('ml', 12)]
IN 42470	100.00	1 : [('ml', 32)]
IN 41841	100.00	1 : [('ml', 7)]
IN 41352	100.00	1 : [('ml', 1)]
IN 30092	100.00	1 : [('ml', 5)]
IN 42172	100.00	1 : [('ml', 6)]
IN 41845	100.00	1 : [('ml', 20)]
IN 43387	100.00	1 : [('ml', 1)]
IN 45432	100.00	1 : [('ml', 3)]
IN 42875	100.00	1 : [('ml', 2)]
IN 40667	100.00	1 : [('ml', 19)]
IN 45478	100.00	1 : [('ml', 91)]
IN 42587	100.00	1 : [('ml', 1)]
IN 42059	100.00	1 : [('ml', 52)]
IN 46753	100.00	1 : [('ml', 2)]
IN 45380	100.00	1 : [('ml', 24)]
IN 41765	100.00	1 : [('ml', 5)]
IN 41550	100.00	1 : [('ml', 16)]
IN 44770	100.00	1 : [('ml', 1)]
IN 46682	100.00	1 : [('ml', 8)]
IN 40795	100.00	1 : [('ml', 25)]
IN 45504	100.00	1 : [('ml', 

In [8]:
# For the 14 itemids with the lowest majority-unit share, list how many
# records each admission has per unit, across both inputevents tables.
# NOTE(review): count(amountuom) ignores NULLs, so rows whose unit is NULL are
# always reported with a count of 0 -- confirm this is intended.
conn = getConnection()
sql = (
    'select hadm_id, amountuom, count(amountuom) from mimiciii.inputevents_cv'
    ' where itemid={0} group by hadm_id, amountuom'
    ' union all select hadm_id, amountuom, count(amountuom) from mimiciii.inputevents_mv'
    ' where itemid={0} group by hadm_id, amountuom order by hadm_id'
)
for itemid in map(itemgetter(0), valid_vupairs_des[:14]):
    cur = conn.cursor()
    cur.execute(sql.format(itemid))
    results = cur.fetchall()
    print('IN', itemid)
    print('hadm_id\t\tamountuom\tcount')
    for res in results:
        print('\t\t'.join(str(field) for field in res))
    print()

IN 30174
hadm_id		amountuom	count
104843		mg		13
104843		None		0
109968		ml		20
109968		None		0
118498		mg		8
118498		None		0
120938		None		0
120938		ml		60
128300		ml		23
128300		None		0
129697		None		0
129697		mg		39
150402		None		0
150402		mg		68
158121		None		0
158121		ml		47
160910		None		0
160910		mg		14
171968		mg		11
171968		None		0
183707		mg		4
183707		None		0
195234		ml		21
195234		None		0

IN 30384
hadm_id		amountuom	count
100262		mg		98
100262		None		0
108031		ml		37
112631		None		0
112631		mg		84
113137		ml		66
113137		None		0
114467		mg		76
114467		None		0
117034		None		0
117034		ml		1
117638		None		0
117638		ml		98
120327		ml		100
120327		None		0
121169		ml		97
121169		None		0
126042		ml		29
128483		ml		9
128640		ml		89
129043		None		0
129043		ml		83
129486		None		0
129486		ml		73
129650		mg		16
129650		None		0
132680		ml		27
134935		ml		40
134935		None		0
136272		None		0
136272		ml		83
139183		ml		22
140605		ml		24
142153		None		0
142153		ml		19
143826		mg		84
143826		

198985		None		0

IN 30069
hadm_id		amountuom	count
100563		mg		11
100566		mg		20
101204		mg		19
102322		ml		20
102322		None		0
103075		None		0
103075		ml		21
103537		mg		22
103882		mg		14
104694		ml		12
105036		mg		23
106397		mg		28
106531		mg		14
106608		mg		17
108739		mg		25
109919		mg		25
110304		ml		17
111634		mg		26
112026		mg		17
113266		mg		14
113441		mg		50
114132		mg		13
115245		mg		32
115612		mg		14
117866		ml		15
117866		None		0
118207		mg		16
119058		mg		21
120232		mg		9
120515		mg		18
121546		None		0
121546		ml		4
121640		None		0
121640		ml		16
123358		mg		17
123815		mg		21
124463		ml		26
124463		None		0
125932		mg		23
126486		mg		18
126902		mg		46
127202		mg		38
128676		mg		25
128819		None		0
128819		ml		42
129175		None		0
129175		ml		3
129922		None		0
129922		ml		22
130736		None		0
130736		ml		17
131719		mg		19
131853		mg		33
132343		mg		18
132422		mg		14
133004		mg		16
134459		mg		20
135412		mg		14
136463		mg		8
136553		mg		7
137036		mg		20
137099		ml		28
137099		mg		44

138623		dose		4
138635		dose		2
138648		dose		1
138677		dose		3
138678		dose		1
138690		dose		1
138690		mg		1
138719		dose		4
138750		mg		11
138750		dose		1
138752		mg		11
138760		dose		3
138774		dose		2
138785		dose		1
138785		mg		9
138790		dose		2
138793		dose		1
138823		dose		10
138831		dose		3
138836		mg		1
138866		dose		39
138914		dose		2
138915		dose		2
138939		dose		5
138943		dose		4
138964		dose		1
138979		dose		1
138981		dose		1
138985		dose		1
138999		dose		7
139023		mg		9
139023		dose		1
139036		dose		2
139055		dose		1
139061		dose		2
139067		mg		9
139070		dose		5
139102		dose		4
139129		dose		7
139157		dose		4
139181		dose		3
139218		dose		1
139227		mg		8
139269		dose		1
139272		dose		1
139272		mg		5
139298		dose		1
139329		dose		1
139354		dose		3
139400		dose		1
139427		dose		10
139446		dose		1
139476		dose		2
139493		dose		2
139494		dose		3
139505		dose		1
139536		mg		8
139550		dose		1
139565		dose		3
139580		mg		4
139580		dose		20
139583		dose		2
139620		dose		1
139644		

180499		dose		5
180501		dose		2
180502		dose		1
180503		dose		2
180533		mg		1
180549		dose		2
180551		dose		4
180577		dose		8
180614		dose		1
180634		dose		1
180665		dose		18
180693		dose		1
180702		dose		1
180714		dose		14
180716		mg		8
180716		dose		5
180726		dose		6
180732		dose		2
180740		dose		1
180743		dose		1
180749		dose		1
180749		mg		5
180758		mg		1
180763		dose		3
180787		dose		1
180805		dose		4
180806		dose		1
180834		dose		1
180858		dose		4
180861		dose		3
180870		dose		2
180874		dose		2
180880		dose		1
180916		dose		1
180920		dose		1
180922		dose		1
180943		dose		2
180961		dose		3
180979		mg		5
180985		dose		1
181001		mg		3
181010		dose		13
181021		dose		2
181080		dose		1
181092		dose		1
181113		dose		5
181151		dose		9
181161		dose		2
181183		dose		1
181194		dose		1
181201		dose		2
181224		dose		1
181231		dose		1
181232		dose		1
181236		dose		3
181308		dose		1
181315		dose		8
181321		dose		2
181355		dose		2
181361		dose		8
181377		dose		2
181397		dose		2
181409		dose		2
1

IN 221744
hadm_id		amountuom	count
100010		mcg		1
100011		mcg		33
100016		mcg		2
100035		mcg		15
100047		mcg		6
100061		mcg		2
100068		mcg		7
100094		mcg		3
100098		mcg		50
100099		mcg		1
100104		mcg		44
100116		mcg		9
100138		mcg		20
100141		mg		66
100141		mcg		5
100165		mcg		3
100202		mcg		1
100210		mcg		12
100215		mcg		8
100217		mcg		31
100223		mcg		29
100227		mcg		12
100234		mcg		1
100263		mcg		6
100271		mcg		6
100277		mcg		8
100286		mcg		15
100292		mcg		5
100292		mg		43
100320		mcg		2
100325		mcg		3
100325		mg		8
100336		mg		2
100350		mcg		5
100357		mcg		3
100361		mcg		13
100406		mcg		1
100416		mcg		74
100423		mg		4
100442		mcg		31
100445		mcg		7
100449		mcg		30
100509		mcg		3
100549		mcg		1
100579		mcg		4
100583		mcg		5
100619		mcg		181
100622		mcg		4
100659		mcg		1
100689		mcg		1
100696		mcg		18
100715		mcg		7
100732		mcg		11
100749		mcg		28
100758		mcg		2
100774		mcg		2
100779		mcg		54
100784		mcg		22
100808		mcg		18
100846		mcg		22
100877		mg		10
100881		mcg		2
100890		mcg		37

144067		mg		19
144085		mcg		7
144131		mcg		25
144131		mg		23
144152		mg		33
144175		mcg		3
144188		mcg		9
144188		mg		45
144207		mcg		23
144209		mcg		12
144260		mcg		42
144310		mg		2
144314		mcg		25
144353		mcg		183
144393		mcg		2
144413		mcg		1
144423		mcg		2
144426		mcg		14
144435		mg		158
144435		mcg		96
144442		mcg		3
144465		mcg		1
144475		mcg		1
144522		mcg		8
144564		mcg		15
144589		mcg		12
144591		mcg		21
144609		mcg		9
144631		mcg		6
144643		mcg		5
144689		mg		16
144689		mcg		1
144748		mcg		15
144766		mg		21
144766		mcg		7
144791		mcg		4
144828		mcg		1
144844		mcg		2
144912		mcg		118
144913		mcg		7
144946		mcg		1
144952		mcg		8
144957		mcg		20
144958		mcg		3
144987		mcg		15
145017		mcg		2
145069		mcg		15
145082		mg		15
145082		mcg		20
145084		mcg		1
145166		mg		9
145177		mcg		5
145207		mcg		1
145215		mcg		8
145218		mcg		4
145231		mcg		5
145234		mcg		26
145235		mcg		9
145236		mcg		60
145245		mcg		47
145250		mcg		1
145257		mcg		2
145271		mg		5
145303		mcg		27
145307		mcg		58
145

181990		mcg		19
181992		mg		12
181992		mcg		4
182000		mcg		1
182003		mcg		2
182008		mcg		21
182011		mcg		4
182019		mcg		7
182023		mg		3
182035		mg		80
182035		mcg		12
182054		mcg		2
182083		mcg		5
182113		mcg		19
182113		mg		21
182133		mg		5
182133		mcg		1
182149		mg		37
182149		mcg		8
182163		mcg		9
182164		mcg		6
182166		mcg		2
182206		mcg		107
182219		mcg		1
182221		mcg		9
182221		mg		10
182225		mg		1
182225		mcg		1
182233		mcg		6
182237		mcg		40
182239		mcg		1
182281		mcg		3
182300		mcg		5
182370		mcg		2
182380		mg		1
182393		mcg		4
182410		mcg		8
182444		mcg		1
182458		mcg		6
182481		mcg		18
182484		mcg		2
182505		mcg		4
182538		mcg		6
182546		mcg		3
182599		mcg		14
182656		mcg		1
182660		mg		1
182669		mcg		6
182669		mg		7
182690		mcg		1
182691		mcg		3
182698		mcg		2
182710		mcg		1
182711		mcg		5
182727		mcg		4
182747		mcg		11
182750		mcg		11
182793		mg		3
182830		mcg		2
182844		mcg		3
182845		mcg		3
182849		mg		20
182849		mcg		5
182862		mcg		10
182878		mcg		2
182878		mg		7
182915

IN 30089
hadm_id		amountuom	count
100096		tsp		200
100135		tsp		380
100498		tsp		49
100545		tsp		160
100585		tsp		119
100591		tsp		142
100647		tsp		10
100665		tsp		9
100752		tsp		9
100753		gm		139
101015		tsp		493
101017		tsp		5
101019		gm		320
101057		gm		88
101420		tsp		290
101423		tsp		237
101598		tsp		35
101721		tsp		123
101722		tsp		355
101805		tsp		10
101885		tsp		54
102113		tsp		68
102158		tsp		13
102251		tsp		264
102360		tsp		29
102497		gm		74
102729		tsp		390
102744		tsp		136
102762		tsp		150
102809		tsp		297
102833		gm		119
102850		gm		83
102870		gm		41
103059		gm		18
103074		gm		74
103080		tsp		41
103143		tsp		38
103304		tsp		16
103333		tsp		23
103338		tsp		173
103348		tsp		148
103366		tsp		45
103388		gm		68
103404		tsp		10
103515		tsp		241
103714		tsp		144
103749		tsp		148
103838		tsp		213
103905		tsp		363
104163		tsp		104
104207		tsp		203
104208		tsp		48
104248		tsp		470
104397		tsp		158
104438		tsp		49
104445		tsp		447
104503		tsp		222
104683		tsp		36
104716		tsp		96
1048

186525		mEQ		56
186550		mEQ		22
186834		mEQ		14
187183		mEQ		13
187207		mEQ		96
187531		mEQ		20
188355		mEQ		10
189250		mEQ		37
189747		mEQ		28
189790		mEQ		7
189794		mEQ		1
190267		mEQ		8
190659		mEQ		30
190982		mEQ		22
191400		mEQ		39
191461		mEQ		3
191956		U		112
191956		mEq		45
192282		mEQ		12
192431		mEQ		5
192791		mEQ		52
192848		mEQ		45
192992		ml		7
193138		mEQ		53
193174		mEQ		24
193871		mEQ		53
194207		mEQ		28
195337		mEQ		12
195588		mEQ		32
196050		mEQ		7
196168		mEQ		20
196570		mEQ		9
196700		mEQ		22
196728		mEQ		11
197480		mEQ		24
197727		mEQ		3
198198		mEQ		26
198274		mEQ		13
198497		mEQ		32
198710		mEQ		28
198741		mEQ		6
198753		mEQ		32
199389		mEQ		39
199681		mEQ		21
199842		mEQ		7

IN 225914
hadm_id		amountuom	count
100977		mcg		2
103584		dose		3
104127		mcg		1
111964		mcg		3
118437		pg		1
125597		mcg		1
131825		mcg		2
133622		dose		1
134366		mcg		2
135912		mcg		67
139801		mcg		2
144091		mg		1
145282		dose		1
145282		mcg		1
146974		mcg		1
150851		dose		2
150851		mg		2


In [9]:
# Reload the raw per-itemid statistics saved earlier in this notebook.
raw_stats = np.load('res/filtered_input_raw.npy', allow_pickle=True).tolist()
valid_vupairs = raw_stats['raw']

# Split every entry into its itemid and the unit of its first (unit, count)
# pair.
valid_input = []
valid_input_unit = []
for entry in valid_vupairs:
    valid_input.append(entry[0])
    valid_input_unit.append(entry[2][0][0])
print(valid_input, valid_input_unit)

# Store the two parallel lists as the output of this notebook.
np.save('res/filtered_input.npy', {'id': valid_input, 'unit': valid_input_unit})
print('saved!')

[225936, 30044, 46770, 42596, 44526, 43038, 40327, 225883, 43829, 43750, 225931, 43007, 40743, 30001, 43744, 41390, 30138, 45865, 46057, 44064, 46387, 222051, 45915, 44407, 43081, 45333, 42576, 46583, 45929, 46630, 45640, 42456, 44539, 45090, 43764, 46496, 46362, 45741, 44673, 45803, 46564, 45505, 225991, 42344, 30320, 41358, 45166, 41776, 45270, 43023, 44605, 41357, 30332, 44061, 43474, 40645, 44690, 30119, 44945, 42575, 46101, 41712, 45771, 221456, 46002, 225876, 44505, 46264, 30166, 42999, 45073, 45787, 46026, 44610, 45838, 45125, 44292, 42083, 225860, 46218, 42554, 227535, 41194, 43087, 46040, 45802, 30148, 42438, 42629, 44112, 45214, 40880, 45683, 44054, 44527, 45688, 44818, 45912, 45675, 45182, 45863, 45114, 45458, 42429, 30375, 41216, 45588, 45871, 30399, 45275, 41356, 41913, 45230, 44144, 45587, 46093, 46419, 225973, 40030, 42674, 45956, 221319, 44795, 43845, 45317, 44182, 41449, 42065, 43835, 45627, 43957, 42230, 45187, 46643, 44696, 46610, 42208, 45765, 41094, 40850, 45580, 4

In [10]:
# Sanity check: number of admission ids loaded from res/admission_ids.npy.
print(len(admission_ids))

58976
