# Filter Itemid Lab

This script is used for filtering itemids from TABLE LABEVENTS.

1. We count the units recorded for each itemid and choose the most frequent unit as the target of unit conversion.
2. In this step we get 3 kinds of features:
    - numerical features
    - categorical features
    - ratio features, which usually come from measurements such as blood pressure, e.g. "135/70".

## Output

1. itemid of observations for labevents.
2. unit of measurement for each itemid.

In [1]:
from __future__ import print_function

import psycopg2
import datetime
import sys
import re
from operator import itemgetter, attrgetter, methodcaller
import numpy as np
import itertools
import os.path
import matplotlib.pyplot as plt
import math
from multiprocessing import Pool, cpu_count

from utils import getConnection

%matplotlib inline

In [2]:
# Number of worker processes in the pool that runs the per-itemid queries.
num_workers = 4

In [3]:
# Database connection used by the non-parallel cells further down.
conn = getConnection()

# Admission ids: loaded with allow_pickle and read by key.
_adm = np.load('res/admission_ids.npy', allow_pickle=True).tolist()
admission_ids, admission_ids_txt = itemgetter(
    'admission_ids', 'admission_ids_txt')(_adm)

# Item ids, one list per key ('input', 'output', 'chart', 'lab', ...).
db = np.load('res/itemids.npy', allow_pickle=True).tolist()
(input_itemid,
 output_itemid,
 chart_itemid,
 lab_itemid,
 microbio_itemid,
 prescript_itemid) = itemgetter(
    'input', 'output', 'chart', 'lab', 'microbio', 'prescript')(db)

In [4]:
def stat_lab_unit_task(i, admission_ids_txt):
    """Collect unit and numeric-value statistics for one LABEVENTS itemid.

    The function opens its own database connection and always closes it
    before returning, so repeated calls do not pile up open connections.

    Args:
        i: itemid to inspect.
        admission_ids_txt: not used by the queries; kept so that existing
            callers keep working.

    Returns:
        A tuple ``(i, outputunits, notnum, total)``:

        * ``outputunits`` - list of ``(unit, row_count)`` rows ordered by
          descending row count; a NULL unit is reported as ``''``.
        * ``notnum`` - one-element row with the number of rows whose
          ``valuenum`` is NULL.
        * ``total`` - one-element row with the number of rows whose
          ``valuenum`` is not NULL.
    """
    # Plain int so the driver can adapt the value even if `i` is a NumPy integer.
    params = (int(i),)

    conn = getConnection()
    try:
        cur = conn.cursor()

        # Group on the coalesced unit so that NULL and '' share a single row.
        cur.execute(
            "SELECT coalesce(valueuom,''), count(*) FROM mimiciii.labevents "
            "WHERE itemid = %s and hadm_id in (select * from admission_ids) "
            "GROUP BY coalesce(valueuom,'')",
            params)
        outputunits = sorted(cur.fetchall(), key=lambda tup: tup[1])
        outputunits.reverse()

        cur.execute(
            "SELECT count(*) FROM mimiciii.labevents "
            "WHERE itemid = %s and hadm_id in (select * from admission_ids) "
            "and valuenum is null",
            params)
        notnum = cur.fetchone()

        cur.execute(
            "SELECT count(*) FROM mimiciii.labevents "
            "WHERE itemid = %s and hadm_id in (select * from admission_ids) "
            "and valuenum is not null",
            params)
        total = cur.fetchone()
    finally:
        # Closing the connection also releases the cursor.
        conn.close()

    return (i, outputunits, notnum, total)

# Run the per-itemid statistics in parallel and cache the raw results on disk.
# The context manager shuts the worker processes down once every result has
# been collected (or as soon as a task raises), so no workers are left behind.
with Pool(num_workers) as p:
    results = [p.apply_async(stat_lab_unit_task, args=(i, admission_ids_txt)) for i in lab_itemid]
    results = [x.get() for x in results]
np.save('res/filtered_lab_raw.npy', {'raw': results})
print('saved!')

saved!


In [5]:
# Decide for every lab itemid whether it is kept as a numeric feature.
valid_lab = []        # itemids that are kept
valid_lab_unit = []   # most frequent unit of each kept itemid (same order as valid_lab)
dropped_id = []       # itemids with less than 95% numeric values
multiple_units = []   # (itemid, percentage, row count, units) when >1 non-empty unit is seen
results = np.load('res/filtered_lab_raw.npy', allow_pickle=True).tolist()['raw']
for i, outputunits, notnum, total in results:
    # outputunits holds (unit, row count) rows, most frequent unit first.
    totaltemp = sum(row[1] for row in outputunits)
    # Compare by value: identity checks against a string literal are unreliable.
    unitnum = sum(1 for row in outputunits if row[0] != '')
    if totaltemp == 0:
        print("LAB "+str(i) + "\t 0 units")
        continue

    # Share of rows that carry the most frequent unit.
    percentage = float(outputunits[0][1]) / totaltemp * 100.
    print("LAB "+ str(i) + "\t" + "{:.2f}%".format(percentage) + "\t" + str(outputunits[0][1])+"/"+str(totaltemp)+" : "+ str(outputunits))
    if unitnum > 1:
        multiple_units.append((i, percentage, totaltemp, outputunits))

    # Share of rows with a numeric value (valuenum not NULL).
    percentage = float(total[0]) * 100 / (notnum[0] + total[0])
    print("LAB\t\t" + "{:.4f}".format(percentage)+ "\tNOTNUM=\t" + str(notnum[0]) + "\tNUM=\t" + str(total[0]))

    if percentage < 95:
        dropped_id.append(i)
        print('dropped\n')
        continue

    print('')
    valid_lab.append(i)
    valid_lab_unit.append(outputunits[0][0])

LAB 51029	100.00%	9/9 : [('mg/dL', 9)]
LAB		100.0000	NOTNUM=	0	NUM=	9

LAB 50903	99.97%	6668/6670 : [('Ratio', 6668), ('', 2)]
LAB		100.0000	NOTNUM=	0	NUM=	6670

LAB 50800	100.00%	398697/398697 : [('', 398697)]
LAB		0.0000	NOTNUM=	398697	NUM=	0
dropped

LAB 51438	100.00%	1176/1176 : [('#/uL', 1176)]
LAB		99.8299	NOTNUM=	2	NUM=	1174

LAB 51183	100.00%	165/165 : [('', 165)]
LAB		0.0000	NOTNUM=	165	NUM=	0
dropped

LAB 51387	100.00%	71/71 : [('%', 71)]
LAB		100.0000	NOTNUM=	0	NUM=	71

LAB 51506	100.00%	66052/66052 : [('', 66052)]
LAB		0.0000	NOTNUM=	66052	NUM=	0
dropped

LAB 50816	56.03%	71189/127046 : [('%', 71189), ('', 55857)]
LAB		99.8898	NOTNUM=	140	NUM=	126906

LAB 50935	90.80%	8491/9351 : [('mg/dL', 8491), ('MG/DL', 860)]
LAB		86.5897	NOTNUM=	1254	NUM=	8097
dropped

LAB 51132	100.00%	482/482 : [('#/uL', 482)]
LAB		100.0000	NOTNUM=	0	NUM=	482

LAB 50868	100.00%	608346/608366 : [('mEq/L', 608346), ('', 20)]
LAB		99.9988	NOTNUM=	7	NUM=	608359

LAB 51337	100.00%	130/130 : [('', 130)]
LA

In [6]:
# Show the kept itemids with their units, then store both lists in one file.
print(valid_lab, valid_lab_unit)
np.save(
    'res/filtered_lab.npy',
    dict(id=valid_lab, unit=valid_lab_unit),
)
print('saved!')

[51029, 50903, 51438, 51387, 50816, 51132, 50868, 51379, 50832, 51105, 51453, 51436, 51084, 51175, 51053, 50954, 51496, 51037, 50983, 51282, 51353, 51119, 50853, 51209, 51124, 50998, 51146, 51381, 51010, 51366, 51383, 51110, 51212, 51363, 51131, 51036, 51102, 51433, 51031, 50801, 51458, 50803, 50896, 51133, 51073, 51048, 51097, 50815, 50852, 51077, 51045, 51442, 51130, 51065, 51255, 51006, 50830, 50931, 51439, 50842, 51300, 50965, 51218, 51344, 50848, 50969, 51277, 51050, 50949, 51063, 51126, 50966, 51302, 51021, 51208, 51140, 51224, 51272, 51088, 50843, 51202, 50861, 51249, 51360, 51289, 51169, 51099, 51427, 50840, 50890, 51066, 51297, 51058, 51113, 51452, 51226, 50988, 51457, 51437, 51449, 50846, 50995, 51227, 50973, 50851, 51044, 51448, 50902, 51143, 50964, 50810, 50882, 50867, 51456, 51040, 50818, 51347, 51020, 51039, 51068, 51221, 51128, 51385, 50910, 50809, 50967, 50914, 51034, 51042, 51529, 50953, 51354, 51163, 50819, 50806, 51112, 51106, 51023, 51298, 50978, 51441, 51356, 50883

All the units are convertible, so keep all of them.

In [7]:
# List the itemids recorded in multiple_units, smallest dominant-unit share first.
for i, percentage, totaltemp, outputunits in sorted(multiple_units, key=itemgetter(1)):
    top_count = outputunits[0][1]
    print("".join([
        "LAB ", str(i), "\t", "{:.2f}%".format(percentage), "\t",
        str(top_count), "/", str(totaltemp), " : ", str(outputunits),
    ]))

LAB 50990	54.45%	104/191 : [('ug/ml', 104), ('ug/mL', 87)]
LAB 51464	60.07%	39592/65911 : [('EU/dL', 39592), ('mg/dL', 26315), ('', 4)]
LAB 51514	64.92%	44172/68043 : [('mg/dL', 44172), ('EU/dL', 23867), ('', 4)]
LAB 51105	68.65%	289/421 : [('mg/dL', 289), ('MG/DL', 132)]
LAB 50926	75.86%	66/87 : [('mIU/mL', 66), ('mIU/L', 21)]
LAB 50958	76.32%	58/76 : [('mIU/mL', 58), ('mIU/L', 18)]
LAB 50916	81.82%	9/11 : [('ug/dL', 9), ('nG/mL', 2)]
LAB 50889	83.34%	2931/3517 : [('mg/L', 2931), ('mg/dL', 565), ('MG/DL', 19), ('', 2)]
LAB 51000	85.67%	11588/13526 : [('mg/dL', 11588), ('MG/DL', 1936), ('', 2)]
LAB 51003	86.29%	57280/66384 : [('ng/mL', 57280), ('ng/ml', 9104)]
LAB 50989	86.96%	20/23 : [('pg/mL', 20), ('ng/dL', 3)]
LAB 51127	88.43%	2156/2438 : [('#/uL', 2156), ('#/CU MM', 282)]
LAB 51128	88.72%	2218/2500 : [('#/uL', 2218), ('#/CU MM', 282)]
LAB 50995	88.89%	3280/3690 : [('ng/dL', 3280), ('ng/dl', 408), ('', 2)]
LAB 50993	89.25%	13178/14766 : [('uIU/mL', 13178), ('uU/ML', 1586), ('', 2)]

In [8]:
# Show the itemids collected in dropped_id and how many there are.
print(dropped_id, len(dropped_id))

[50800, 51183, 51506, 50935, 51337, 51307, 51486, 51502, 51306, 50874, 51075, 51462, 51240, 51426, 51518, 51155, 50864, 51468, 51157, 51466, 50968, 51327, 50957, 51236, 51407, 51230, 50924, 51280, 50942, 51234, 51489, 50939, 50829, 51286, 50901, 50906, 51331, 51405, 51233, 51372, 51011, 51517, 51092, 51309, 51304, 51315, 51519, 50937, 51085, 51412, 51425, 51027, 51268, 50854, 51192, 50959, 51100, 51156, 51487, 51293, 50932, 51416, 51314, 50856, 51463, 50886, 51411, 51332, 51252, 51003, 50880, 50929, 51393, 51154, 51161, 51413, 51421, 51414, 51184, 51342, 51182, 51188, 51318, 51041, 51396, 51330, 51333, 51074, 51189, 51187, 51238, 51090, 51537, 51180, 51478, 51216, 50877, 51323, 51329, 51178, 51477, 50989, 51499, 51179, 51465, 51321, 51076, 51473, 51500, 51215, 51064, 50921, 50938, 50835, 50879, 51420, 51056, 51173, 51194, 50827, 50999, 51400, 50991, 51325, 51312, 51091, 51291, 51258, 51115, 51373, 51507, 51474, 51482, 51247, 50926, 50955, 50933, 51239, 51071, 51079, 51408, 51516, 50946

In [9]:
# valid_lab_num = [51475,50845,51280,50935,51479,50922,51501,50925,50856,50981,51213,50915,51497,51046,50835,51176,51180,51194,51196,51130,51131,51132,50906,51076,51422,51517,51229,50946,51471,50899,51515,51369,50992,50958,50926,50961,51228,50877,51494,50990,50991,51489,51488,51061,51225,50894,50989,51209,51516,51493,51476,50911,51003,51482]
# valid_lab_cate = [51519,51461,51495,51390,51096,51403,51391,51189,51405,50901,51407,51089,51153,51171,51220,51179,51468,51016,51401,51207,51472,51394,51291,51500,51161,51286,51410,51417,51389,51195,51322,51201,51142,51311,51135,51329,51396,51318,51421,50875,51485,51316,51308,51056,51537,51414,51304,51157,51079,51075,51071,51074,51092,51090,51512,50828,50933,51266,51246,51267,51137,51252,51268,51233,50955,50887,51523,51462,50979,51260,50919,51287,51296,51151,51474,51107,51103,51236,50940,50943,50941,51145,51294,50942,51240,51292,51518,50873,51505,51469,50975,51424,51134,51411,50944,50937,51197,51425,51426,51098,51243,51373,51147,51085,51216,51400,51388,51412,50872,51150,51423,51402,50938,50939,51234,51420,51338,51325,51183,51164,50948,51313,50857,51399,51239,51238,51416,51319,51230,51337,51152,51168,51184,51341,51340,51198,51326,51303,51315,50876,51261,51499,50871,51086,51192,51167,51332,51314,51342,51231,51321,51264,51374,51370,51372,51307,51235,51215,51317,51503,51335,51172,50874,51219,51305,51310,51323,51334,51320,51178,51156,51309,51306,51155,51324,51177,51158,51328,50913,51336,51159,51333,51154,51331,51262,51339,51182,51217,50918,51193,51460,51191,51510,51091,51465,51017,51160,51408,51418,51190,51187,51413,50932,51511,51350,51393,51481,51392,51490,51397,51470,51247,51312,51395,51295,51173,51502,51415,51508,51506,51466,51486,51464,51487,51463,50920,50800,50880,50879,50999,50812]
# valid_lab_set = set(valid_lab_num + valid_lab_cate)
# leftids = [d for d in dropped_id if d not in valid_lab_set]

# For every dropped itemid, fetch its (value, unit, count) rows, most frequent
# first, print the top 20 for inspection and keep all rows for the
# classification step.
# To look only at ids that are not classified yet, iterate over `leftids`
# (built by the commented-out lines above) instead of `dropped_id`.
dropped_value = []

cur = conn.cursor()
try:
    for d in dropped_id:
        print('LAB : ' + str(d))
        # int() so the driver can adapt the id even if it is a NumPy integer.
        cur.execute(
            'SELECT value, valueuom, count(*) as x FROM mimiciii.labevents as lb '
            'WHERE itemid = %s and hadm_id in (select * from admission_ids) '
            'GROUP BY value, valueuom ORDER BY x DESC',
            (int(d),))
        droped_outs = cur.fetchall()

        total = sum(row[2] for row in droped_outs)
        print("Count ",total)
        # Only the 20 most frequent rows are printed.
        for dx in droped_outs[:20]:
            print(dx[1],dx[0],"\t",dx[2])
        print('')
        dropped_value.append((d, droped_outs))
finally:
    cur.close()

# Handing the nested list straight to np.save makes NumPy infer an array from
# ragged sequences, which is deprecated and an error in recent releases.
# Store one (itemid, rows) tuple per element of an explicit 1-D object array
# instead; np.load(..., allow_pickle=True).tolist() returns the pairs again.
dropped_value_arr = np.empty(len(dropped_value), dtype=object)
for k, item in enumerate(dropped_value):
    dropped_value_arr[k] = item
np.save('res/lab_dropped_value.npy', dropped_value_arr)

LAB : 50800
Count  398697
None ART 	 353584
None MIX 	 27174
None VEN 	 17171
None CENTRAL VENOUS 	 768

LAB : 51183
Count  165
None DONE 	 133
None D 	 31
None DOME 	 1

LAB : 51506
Count  66052
None Clear 	 46682
None Hazy 	 8907
None Cloudy 	 5506
None SlHazy 	 2256
None CLEAR 	 1050
None SlCldy 	 562
None SLHAZY 	 285
None HAZY 	 273
None CLOUDY 	 235
None SlCloudy 	 134
None Slcldy 	 72
None TURBID 	 38
None   	 17
None Sl 	 13
None CLO 	 4
None COMPUTER NETWORK FAILURE. TEST NOT RESULTED. 	 2
None HA 	 2
None CLOU 	 2
None H 	 2
None RED 	 1

LAB : 50935
Count  9351
mg/dL <20 	 1098
MG/DL <20 	 105
mg/dL 125 	 35
mg/dL 44 	 34
mg/dL 195 	 34
mg/dL LESS THAN 20 	 34
mg/dL 110 	 33
mg/dL 50 	 33
mg/dL 108 	 33
mg/dL 139 	 33
mg/dL 140 	 32
mg/dL 54 	 31
mg/dL 113 	 31
mg/dL 175 	 31
mg/dL 142 	 31
mg/dL 88 	 31
mg/dL 160 	 30
mg/dL 213 	 30
mg/dL 96 	 30
mg/dL 106 	 30

LAB : 51337
Count  130
None DONE 	 106
None D 	 24

LAB : 51307
Count  91
None DONE 	 75
None D 	 16

LAB : 51486

Count  31235
None NORMAL 	 14579
None 1+ 	 8127
None OCCASIONAL 	 5592
None 2+ 	 2569
None 3+ 	 368

LAB : 50854
Count  1172
None DONE 	 1171
None 0.45 	 1

LAB : 51192
Count  113
None DONE 	 96
None D 	 17

LAB : 50959
Count  2
None PROLACTIN ELEVATED, SO SAMPLE WAS TREATED WITH PEG, WHICH SHOWED 	 2

LAB : 51100
Count  21884
mEq/L LESS THAN 10 	 1706
mEq/L 10 	 483
mEq/L 11 	 429
mEq/L 12 	 390
mEq/L 13 	 370
mEq/L 15 	 355
mEq/L 14 	 343
mEq/L 18 	 307
mEq/L 16 	 302
mEq/L 19 	 301
mEq/L 22 	 297
mEq/L 17 	 292
mEq/L <10 	 283
mEq/L 20 	 281
mEq/L 23 	 281
mEq/L 24 	 280
mEq/L 21 	 271
mEq/L 25 	 263
mEq/L 26 	 258
mEq/L 32 	 247

LAB : 51156
Count  64
None DONE 	 58
None D 	 6

LAB : 51487
Count  65904
None NEG 	 62290
None POS 	 3585
None   	 17
None Neg 	 6
None N 	 3
None COMPUTER NETWORK FAILURE. TEST NOT RESULTED. 	 2
None P 	 1

LAB : 51293
Count  2
None NEGATIVE 	 2

LAB : 50932
Count  79
None HOLD 	 79

LAB : 51416
Count  180
None DONE 	 155
None D 	 25

LAB : 51314
Count  

Count  76995
None /20 	 3081
None 20/ 	 2683
None 14/ 	 2645
None 8/ 	 2571
None 16/ 	 2365
None 12/ 	 2296
None 18/ 	 2284
None /18 	 2059
None /22 	 1984
None /24 	 1950
None /16 	 1916
None /12 	 1735
None /14 	 1685
None 22/ 	 1499
None 24/ 	 1456
None /30 	 1332
None /28 	 1276
None /10 	 1270
None 10/ 	 1216
None /25 	 1123

LAB : 50999
Count  9028
None NEG 	 8859
None POS 	 159
None NEGATIVE 	 8
None ERROR 	 2

LAB : 51400
Count  256
None DONE 	 215
None D 	 41

LAB : 50991
Count  50
ng/mL 4 	 4
ng/mL LESS THAN 0.2 	 3
ng/mL UNABLE TO REPORT DUE TO PRESENCE OF ANTIBODIES TO THYROGLOBULIN 	 3
ng/mL 8 	 2
ng/mL 3 	 2
ng/mL 11 	 2
ng/mL 12 	 1
ng/mL 6 	 1
ng/mL 55 	 1
ng/mL 1990 	 1
ng/mL 1530 	 1
ng/mL 128 	 1
ng/mL 2.9 	 1
ng/mL 447 	 1
ng/mL 90 	 1
ng/mL 324 	 1
ng/mL 29 	 1
ng/mL 834 	 1
ng/mL 27 	 1
ng/mL 13 	 1

LAB : 51325
Count  174
None DONE 	 138
None D 	 36

LAB : 51312
Count  10
None DONE 	 10

LAB : 51091
Count  267
None PRESUMPTIVELY NEGATIVE 	 205
None PRESUMPTIVELY 

Count  24626
None See Comments 	 24626

LAB : 51316
Count  2
None DONE 	 2

LAB : 51406
Count  124
None DONE 	 33
#/uL DONE 	 31
#/uL D 	 12
#/uL 24 	 2
#/uL 103 	 1
#/uL 795 	 1
#/uL 852 	 1
#/uL 1346 	 1
#/uL 5166 	 1
#/uL 1026 	 1
#/uL 1263 	 1
#/uL 33 	 1
#/uL 17 	 1
#/uL 2249 	 1
#/uL 914 	 1
#/uL 70.4 	 1
#/uL 1297 	 1
#/uL 558 	 1
#/uL 9.2 	 1
#/uL 64.4 	 1

LAB : 50958
Count  76
mIU/mL <1.0 	 16
mIU/L <1.0 	 9
mIU/mL 16 	 3
mIU/mL 15 	 3
mIU/mL 6.8 	 3
mIU/mL 2.2 	 3
mIU/L 1.4 	 2
mIU/mL 4.8 	 2
mIU/mL 4.7 	 2
mIU/mL 13 	 2
mIU/mL 3.5 	 2
mIU/L 2.8 	 2
mIU/mL 10 	 2
mIU/L 19 	 1
mIU/mL 3.9 	 1
mIU/mL 9.1 	 1
mIU/L 39 	 1
mIU/mL 8.0 	 1
mIU/L 1.1 	 1
mIU/mL 7.5 	 1

LAB : 51172
Count  92
None DONE 	 75
None D 	 17

LAB : 51394
Count  14
None DONE 	 13
None D 	 1

LAB : 51086
Count  421
None NO MONOCLONAL IMMUNOGLOBULIN SEEN 	 307
None NEGATIVE FOR BENCE-JONES PROTEIN 	 40
None MONOCLONAL FREE (BENCE-JONES) KAPPA DETECTED 	 9
None MONOCLONAL FREE (BENCE-JONES) LAMBDA DETECTED 	 4

Count  60883
#/hpf 0-2 	 11115
#/hpf 0 	 8033
#/hpf 3-5 	 5751
#/hpf >50 	 4326
#/hpf 6-10 	 3594
#/hpf 1 	 3249
#/hpf 11-20 	 2763
#/hpf 2 	 2753
#/hpf 21-50 	 2330
#/hpf 3 	 1711
#/hpf 5 	 1047
#/hpf 6 	 772
#/hpf 4 	 725
#/hpf >1000 	 656
#/hpf <1 	 623
#/hpf 8 	 593
#/hpf 9 	 459
#/hpf 7 	 454
#/hpf 11 	 400
#/hpf 12 	 333

LAB : 50981
Count  6385
mg/dL NEG 	 6024
mg/dL 4 	 50
mg/dL 5 	 20
mg/dL 6 	 15
mg/dL NEGATIVE 	 13
mg/dL 9 	 12
mg/dL 7 	 8
mg/dL 17 	 6
mg/dL 31 	 6
mg/dL 28 	 6
mg/dL 11 	 6
mg/dL 23 	 6
mg/dL 19 	 5
mg/dL 15 	 5
mg/dL 26 	 5
mg/dL 36 	 5
mg/dL 33 	 5
mg/dL 8 	 5
mg/dL 16 	 4
mg/dL 41 	 4

LAB : 51481
Count  108
None NEGATIVE 	 90
None POSITIVE 	 18

LAB : 51472
Count  4
None OCC 	 2
None MOD 	 1
None FEW 	 1

LAB : 51492
Count  72632
mg/dL NEG 	 28978
mg/dL 30 	 15859
mg/dL TR 	 14181
mg/dL 100 	 6353
mg/dL 25 	 2007
mg/dL 500 	 1685
mg/dL 75 	 1316
mg/dL >300 	 1249
mg/dL 300 	 465
mg/dL 150 	 400
mg/dL 600 	 63
mg/dL   	 37
mg/dL >600 	 16
mg/dL 15 	 12
mg

Count  7083
None 1+ 	 3088
None OCCASIONAL 	 2572
None 2+ 	 1213
None 3+ 	 194
None NORMAL 	 16

LAB : 51485
Count  2
None RARE 	 1
None FEW 	 1

LAB : 50911
Count  89879
ng/mL NotDone 	 23833
ng/mL 3 	 9027
ng/mL 2 	 7723
ng/mL 4 	 7253
ng/mL 5 	 5255
ng/mL 6 	 4094
ng/mL 7 	 3088
ng/mL 8 	 2511
ng/mL 1 	 2298
ng/mL 9 	 1928
ng/mL 10 	 1610
ng/mL 11 	 1312
ng/mL 12 	 1196
ng/mL 13 	 985
ng/mL 14 	 910
ng/mL 15 	 787
ng/mL 16 	 651
ng/mL 17 	 646
ng/mL 18 	 561
ng/mL None 	 515

LAB : 51424
Count  401
None DONE 	 303
None D 	 84
None CANCEL 	 13
None NOT DONE 	 1

LAB : 51423
Count  225
None DONE 	 192
None D 	 33

LAB : 50979
Count  11033
None HOLD 	 11023
None DONE 	 8
None DONE-NC 	 1
None DISCARDED 	 1

LAB : 51150
Count  404
None NEGATIVE 	 184
None NEG 	 96
None POSITIVE 	 57
None POS 	 25
None NEGATIVE THIN AND THICK SMEAR REVIEWED 	 3
None NEGATIVE THICK SMEAR 	 3
None NEGATIVE FOR THICK AND THIN SMEARS 	 2
None NEG FOR THICK SMEAR 	 2
None THICK SMEAR NEGATIVE 	 2
None NEGATIV

Count  67847
mg/dL NEG 	 49136
mg/dL TR 	 9581
mg/dL 15 	 3406
mg/dL 50 	 1528
mg/dL 10 	 1517
mg/dL 150 	 1383
mg/dL 40 	 894
mg/dL 80 	 213
mg/dL >80 	 156
mg/dL   	 19
None NEG 	 4
mg/dL Neg 	 4
mg/dL Tr 	 2
mg/dL COMPUTER NETWORK FAILURE. TEST NOT RESULTED. 	 2
mg/dL T 	 1
mg/dL N 	 1

LAB : 51475
Count  2
#/lpf <1 	 2

LAB : 50837
Count  39
mEq/L 20 	 4
mEq/L 18 	 3
mEq/L 29 	 3
mEq/L 22 	 3
mEq/L 16 	 2
mEq/L 21 	 2
mEq/L 24 	 2
mEq/L 19 	 2
mEq/L 27 	 2
mEq/L 13 	 2
mEq/L 23 	 2
mEq/L LESS THAN 5 	 2
mEq/L 8 	 1
mEq/L 26 	 1
mEq/L 10 	 1
mEq/L 14 	 1
mEq/L LESS THAN 10 	 1
mEq/L 37 	 1
mEq/L 25 	 1
mEq/L 15 	 1

LAB : 51213
Count  4821
ug/mL 10-40 	 2052
ug/mL 0-10 	 1051
ug/mL 40-80 	 751
ug/mL 80-160 	 453
ug/mL 160-320 	 262
ug/mL 320-640 	 125
ug/mL 640-1280 	 71
ug/mL >1280 	 55
ug/mL None 	 1

LAB : 51494
Count  46
#/lpf 0-2 	 17
#/lpf <1 	 16
#/lpf 3 	 5
#/lpf 1 	 3
#/lpf 3-5 	 2
#/lpf 127 	 1
#/lpf 13 	 1
#/lpf 5 	 1

LAB : 50876
Count  246
None NEGATIVE 	 163
None POSIT

In [10]:
# Classify each dropped itemid from the look of its stored rows. Every row
# counts once; the per-row counts are not used as weights.
#   * categorical - at least half of the values contain no number,
#   * ratio       - otherwise, at least half look like "<number>/<number>",
#   * numeric     - everything else; the unit of the first row is kept.
dropped_value = np.load('res/lab_dropped_value.npy', allow_pickle=True).tolist()
valid_lab_num = []
valid_lab_num_unit = []
valid_lab_cate = []
valid_lab_ratio = []

# Patterns are compiled once instead of being rebuilt for every value.
_has_number = re.compile(r'(\d+\.\d*)|(\d*\.\d+)|(\d+)')
_is_ratio = re.compile(r'{0}\/{0}'.format(r'((\d+\.\d*)|(\d*\.\d+)|(\d+))'))

for d, droped_outs in dropped_value:
    if len(droped_outs) == 0:
        # Nothing to classify, and the shares below would divide by zero.
        print('LAB ' + str(d) + ' has no values, skipped')
        continue

    # str() turns a NULL value into 'None', which counts as non-numeric text.
    values = [str(row[0]) for row in droped_outs]
    ascnum = sum(1 for v in values if _has_number.search(v) is None)
    rationum = sum(1 for v in values if _is_ratio.fullmatch(v) is not None)

    if ascnum / len(droped_outs) >= 0.5:
        valid_lab_cate.append(d)
    elif rationum / len(droped_outs) >= 0.5:
        valid_lab_ratio.append(d)
        print(droped_outs)
    else:
        valid_lab_num.append(d)
        unit = droped_outs[0][1]
        valid_lab_num_unit.append('' if unit is None else unit)

print(len(valid_lab_num), len(valid_lab_cate), len(valid_lab_ratio))

[('/20', None, 3081), ('20/', None, 2683), ('14/', None, 2645), ('8/', None, 2571), ('16/', None, 2365), ('12/', None, 2296), ('18/', None, 2284), ('/18', None, 2059), ('/22', None, 1984), ('/24', None, 1950), ('/16', None, 1916), ('/12', None, 1735), ('/14', None, 1685), ('22/', None, 1499), ('24/', None, 1456), ('/30', None, 1332), ('/28', None, 1276), ('/10', None, 1270), ('10/', None, 1216), ('/25', None, 1123), ('30/', None, 1111), ('/26', None, 1072), ('26/', None, 1056), ('/15', None, 1017), ('28/', None, 1000), ('/8', None, 913), ('/23', None, 824), ('/17', None, 802), ('/21', None, 790), ('/19', None, 679), ('25/', None, 645), ('/32', None, 552), ('15/', None, 529), ('14/0', None, 523), ('/27', None, 511), ('32/', None, 488), ('20/0', None, 480), ('16/0', None, 458), ('/13', None, 447), ('/11', None, 408), ('35/', None, 403), ('12/0', None, 399), ('18/0', None, 381), ('34/', None, 329), ('/40', None, 315), ('14/2', None, 302), ('/9', None, 300), ('24/0', None, 297), ('/29', No

In [11]:
# Here we directly do some manual selection to get the lists valid_lab_num and valid_lab_cate.

In [12]:
# Show the current contents of valid_lab_cate.
print(valid_lab_cate)

[50800, 51183, 51506, 51337, 51307, 51486, 51502, 51306, 51075, 51462, 51426, 51518, 51155, 51468, 51157, 51466, 51327, 51236, 51407, 51230, 50942, 51234, 50939, 50829, 51286, 50901, 51331, 51405, 51372, 51092, 51309, 51304, 51315, 51519, 50937, 51085, 51412, 51425, 50854, 51192, 50959, 51156, 51487, 51293, 50932, 51416, 51314, 51463, 50886, 51411, 51332, 50880, 51393, 51154, 51161, 51413, 51421, 51414, 51342, 51182, 51318, 51396, 51330, 51333, 51074, 51189, 51187, 51238, 51090, 51537, 51323, 51329, 51178, 51477, 51499, 51179, 51465, 51321, 51473, 51500, 51215, 50938, 50879, 51420, 51173, 50999, 51400, 51325, 51312, 51091, 51258, 51373, 51474, 51247, 50955, 50933, 51239, 51071, 51079, 51408, 51158, 51470, 51350, 51523, 51278, 51520, 51220, 51339, 51490, 51512, 51460, 50873, 51391, 51511, 51135, 51308, 50871, 51370, 51171, 50920, 51316, 51172, 51394, 51086, 50828, 51399, 51193, 51335, 51098, 51402, 51303, 50948, 50855, 51311, 51392, 50919, 51319, 51326, 51403, 51219, 51334, 51242, 51159

In [13]:
# Report the list sizes, then write the numeric, categorical and ratio itemid
# groups to separate files; only the numeric group is stored with units.
print(len(valid_lab_num), len(valid_lab_num_unit), len(valid_lab_cate))
for path, ids, units in (
        ('res/filtered_lab_num', valid_lab_num, valid_lab_num_unit),
        ('res/filtered_lab_cate', valid_lab_cate, None),
        ('res/filtered_lab_ratio', valid_lab_ratio, None),
):
    np.save(path, {'id': ids, 'unit': units})

113 113 224
