# Filter Itemid Chart

This notebook filters the itemids that appear in the CHARTEVENTS table.

1. For each itemid we count how many records use each unit of measurement and choose the most frequent (major) unit as the target of unit conversion.
2. In this step we get 3 kinds of features:
    - numerical features
    - categorical features
    - ratio features, which usually come from blood pressure measurements such as "135/70".

## Output

1. itemid of observations for chartevents.
2. unit of measurement for each itemid.

In [1]:
from __future__ import print_function

import psycopg2
import datetime
import sys
from operator import itemgetter, attrgetter, methodcaller
import numpy as np
import itertools
import os.path
import matplotlib.pyplot as plt
import math
from multiprocessing import Pool, cpu_count
import re
from tqdm import tqdm_notebook as tqdm
from utils import getConnection

%matplotlib inline

In [2]:
# Connection for the notebook's main process; the worker functions below
# open their own connections.
conn = getConnection()

# Each file is loaded with pickling enabled and unwrapped with .tolist();
# the resulting object is then indexed by string key.
_adm = np.load('res/admission_ids.npy', allow_pickle=True).tolist()
admission_ids, admission_ids_txt = itemgetter(
    'admission_ids', 'admission_ids_txt')(_adm)

db = np.load('res/itemids.npy', allow_pickle=True).tolist()
(input_itemid, output_itemid, chart_itemid,
 lab_itemid, microbio_itemid, prescript_itemid) = itemgetter(
    'input', 'output', 'chart', 'lab', 'microbio', 'prescript')(db)

In [3]:
def stat_chart_unit_task(ilist, admission_ids_txt):
    """Collect unit and numeric-value statistics per itemid from chartevents.

    For every itemid in ``ilist`` three counts are queried, each restricted to
    rows whose ``hadm_id`` appears in the ``admission_ids`` relation:

    * rows per unit of measurement (``valueuom``, NULL reported as ``''``),
      ordered by descending row count,
    * rows whose ``valuenum`` is NULL,
    * all rows.

    Args:
        ilist: iterable of itemids to process.
        admission_ids_txt: unused; kept so existing callers keep working.

    Returns:
        A list with one ``(itemid, chartunits, notnum, total)`` tuple per
        itemid, where ``chartunits`` is a list of ``(unit, count)`` rows.
    """
    # The itemid is passed as a bound parameter (%s placeholder, the style
    # psycopg2 uses) instead of being concatenated into the SQL text.
    row_filter = (' FROM mimiciii.chartevents WHERE itemid = %s'
                  ' and hadm_id in (select * from admission_ids)')
    units_sql = ('SELECT coalesce(valueuom, \'\'), count(*)' + row_filter
                 + ' group by valueuom')
    notnum_sql = 'SELECT count(*)' + row_filter + ' and valuenum is null'
    total_sql = 'SELECT count(*)' + row_filter

    subresults = []
    tconn = getConnection()
    try:
        tcur = tconn.cursor()
        try:
            for i in tqdm(ilist):
                # int() turns numpy integers (e.g. from np.array_split) into
                # plain ints that the driver can bind.
                params = (int(i),)

                # number of rows per unit of measurement, most frequent first
                tcur.execute(units_sql, params)
                chartunits = sorted(tcur.fetchall(), key=lambda tup: tup[1])
                # sort ascending then reverse, so ties keep the same relative
                # order the original implementation produced
                chartunits.reverse()

                # number of observations without a numeric value
                tcur.execute(notnum_sql, params)
                notnum = tcur.fetchone()[0]

                # total number of observations
                tcur.execute(total_sql, params)
                total = tcur.fetchone()[0]

                subresults.append((i, chartunits, notnum, total))
        finally:
            tcur.close()
    finally:
        # release the connection even when a query fails
        tconn.close()
    return subresults

# Split the chart itemids into one chunk per worker process and gather the
# statistics in parallel; every worker opens its own database connection.
# (cpu_count() // 2 is an alternative worker count.)
# To debug without multiprocessing, call
# stat_chart_unit_task(chart_itemid, admission_ids_txt) directly.
numworkers = 4
p = Pool(numworkers)
ilists = np.array_split(chart_itemid, numworkers)
results = [p.apply_async(stat_chart_unit_task, args=(ilist, admission_ids_txt)) for ilist in ilists]
p.close()
p.join()
results = [x.get() for x in results]
# Materialize the flattened results: np.save pickles the dict, and a bare
# itertools.chain object can only be consumed once after loading and is no
# longer picklable on recent Python versions.
results = list(itertools.chain.from_iterable(results))
np.save('res/filtered_chart_raw.npy', {'raw': results})
print('saved!')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=1616.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1615.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1616.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1616.0), HTML(value='')))





saved!


## First filtering of categorical features

Any itemid whose share of numeric values is below 80% of its records is a possible categorical feature. In this step we set those itemids aside for later analysis.

In [4]:
# Reload the raw per-itemid statistics written by the previous cell.
results = np.load('res/filtered_chart_raw.npy', allow_pickle=True).tolist()['raw']
valid_chart = []
valid_chart_unit = []
valid_chart_cate = []
valid_chart_num = []
dropped = []
multiple_units = []
for record in results:
    i, chartunits, notnum, total = record[:4]

    # Rows summed over all units, and the number of non-empty unit labels.
    total2 = sum(row[1] for row in chartunits)
    unitnum = sum(1 for row in chartunits if row[0] != '')
    if total2 == 0:
        # nothing recorded for this itemid
        continue

    # Share of the most frequent unit among all observations.
    percentage = float(chartunits[0][1]) / total2 * 100.
    if unitnum > 1:
        multiple_units.append((i, chartunits, percentage))
    print("".join(["CHART ", str(i), "\t", "{:.2f}".format(percentage), '\t', str(chartunits)]))

    # Share of observations that carry a numeric value; itemids below 80%
    # are set aside as possible categorical features.
    percentage = float(total - notnum) * 100 / total
    print("".join(["Numeric observation :", "{:.4f}%".format(percentage),
                   " ( NOTNUM= ", str(notnum), " / ALL= ", str(total), " ) "]))
    if percentage < 80:
        print('dropped\n')
        dropped.append(i)
    else:
        print('')
        valid_chart.append(i)
        valid_chart_unit.append(chartunits[0][0])

CHART 2848	100.00	[('', 32)]
Numeric observation :0.0000% ( NOTNUM= 32 / ALL= 32 ) 
dropped

CHART 3565	100.00	[('', 59508)]
Numeric observation :0.0000% ( NOTNUM= 59508 / ALL= 59508 ) 
dropped

CHART 2026	100.00	[('', 158)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 158 ) 

CHART 6158	100.00	[('', 1)]
Numeric observation :0.0000% ( NOTNUM= 1 / ALL= 1 ) 
dropped

CHART 3028	100.00	[('', 1)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 1 ) 

CHART 2409	100.00	[('', 48)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 48 ) 

CHART 225771	100.00	[('', 519)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 519 ) 

CHART 4349	100.00	[('', 36)]
Numeric observation :0.0000% ( NOTNUM= 36 / ALL= 36 ) 
dropped

CHART 4718	100.00	[('', 53)]
Numeric observation :0.0000% ( NOTNUM= 53 / ALL= 53 ) 
dropped

CHART 3370	100.00	[('', 38505)]
Numeric observation :0.0000% ( NOTNUM= 38505 / ALL= 38505 ) 
dropped

CHART 4466	100.00	[('', 11)]
Numeric observation :0.0000% ( NOTNUM= 11 / ALL= 1


CHART 981	100.00	[('', 26)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 26 ) 

CHART 7081	100.00	[('', 9)]
Numeric observation :0.0000% ( NOTNUM= 9 / ALL= 9 ) 
dropped

CHART 224392	100.00	[('', 501)]
Numeric observation :0.0000% ( NOTNUM= 501 / ALL= 501 ) 
dropped

CHART 224940	100.00	[('', 31637)]
Numeric observation :0.0000% ( NOTNUM= 31637 / ALL= 31637 ) 
dropped

CHART 3584	100.00	[('', 454)]
Numeric observation :0.0000% ( NOTNUM= 454 / ALL= 454 ) 
dropped

CHART 4187	100.00	[('cm', 148995)]
Numeric observation :99.9644% ( NOTNUM= 53 / ALL= 148995 ) 

CHART 7263	100.00	[('', 5)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 5 ) 

CHART 668	100.00	[('', 8498)]
Numeric observation :0.0000% ( NOTNUM= 8498 / ALL= 8498 ) 
dropped

CHART 4326	100.00	[('', 65)]
Numeric observation :0.0000% ( NOTNUM= 65 / ALL= 65 ) 
dropped

CHART 227586	100.00	[('', 863)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 863 ) 

CHART 227619	100.00	[('', 1288)]
Numeric observation :100.0000% (

CHART 199	100.00	[('', 8129)]
Numeric observation :0.0000% ( NOTNUM= 8129 / ALL= 8129 ) 
dropped

CHART 5942	100.00	[('', 7)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 7 ) 

CHART 2861	100.00	[('', 10)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 10 ) 

CHART 3643	100.00	[('', 12)]
Numeric observation :0.0000% ( NOTNUM= 12 / ALL= 12 ) 
dropped

CHART 224892	100.00	[('', 12957)]
Numeric observation :0.0000% ( NOTNUM= 12957 / ALL= 12957 ) 
dropped

CHART 3100	100.00	[('', 4)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 4 ) 

CHART 6481	100.00	[('', 1)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 1 ) 

CHART 6294	100.00	[('', 2)]
Numeric observation :0.0000% ( NOTNUM= 2 / ALL= 2 ) 
dropped

CHART 2106	100.00	[('', 76)]
Numeric observation :97.3684% ( NOTNUM= 2 / ALL= 76 ) 

CHART 2094	100.00	[('', 14)]
Numeric observation :92.8571% ( NOTNUM= 1 / ALL= 14 ) 

CHART 671	96.70	[('sec', 10551), ('', 360)]
Numeric observation :96.6548% ( NOTNUM= 365 / ALL= 10911 ) 

CH

dropped

CHART 224153	100.00	[('ml/hr', 60126)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 60126 ) 

CHART 224327	100.00	[('', 23657)]
Numeric observation :0.0000% ( NOTNUM= 23657 / ALL= 23657 ) 
dropped

CHART 82	100.00	[('', 344145)]
Numeric observation :0.0000% ( NOTNUM= 344145 / ALL= 344145 ) 
dropped

CHART 2672	100.00	[('', 36)]
Numeric observation :97.2222% ( NOTNUM= 1 / ALL= 36 ) 

CHART 224643	100.00	[('mmHg', 671)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 671 ) 

CHART 1043	100.00	[('', 255459)]
Numeric observation :0.0000% ( NOTNUM= 255459 / ALL= 255459 ) 
dropped

CHART 1048	100.00	[('', 106)]
Numeric observation :0.0000% ( NOTNUM= 106 / ALL= 106 ) 
dropped

CHART 2308	100.00	[('', 35)]
Numeric observation :5.7143% ( NOTNUM= 33 / ALL= 35 ) 
dropped

CHART 5940	100.00	[('', 3)]
Numeric observation :0.0000% ( NOTNUM= 3 / ALL= 3 ) 
dropped

CHART 225323	100.00	[('', 38910)]
Numeric observation :0.0000% ( NOTNUM= 38910 / ALL= 38910 ) 
dropped

CHART 213	100.00	[

Numeric observation :99.7222% ( NOTNUM= 1 / ALL= 360 ) 

CHART 5950	100.00	[('', 123)]
Numeric observation :99.1870% ( NOTNUM= 1 / ALL= 123 ) 

CHART 8058	100.00	[('', 33)]
Numeric observation :0.0000% ( NOTNUM= 33 / ALL= 33 ) 
dropped

CHART 290	100.00	[('', 462)]
Numeric observation :0.0000% ( NOTNUM= 462 / ALL= 462 ) 
dropped

CHART 4944	100.00	[('', 2)]
Numeric observation :0.0000% ( NOTNUM= 2 / ALL= 2 ) 
dropped

CHART 6750	100.00	[('', 8)]
Numeric observation :0.0000% ( NOTNUM= 8 / ALL= 8 ) 
dropped

CHART 7026	100.00	[('', 2)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 2 ) 

CHART 793	100.00	[('', 2164)]
Numeric observation :57.5323% ( NOTNUM= 919 / ALL= 2164 ) 
dropped

CHART 224831	100.00	[('', 44244)]
Numeric observation :0.0000% ( NOTNUM= 44244 / ALL= 44244 ) 
dropped

CHART 225634	99.91	[('IU/L', 26536), ('', 25)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 26561 ) 

CHART 228211	100.00	[('', 92)]
Numeric observation :0.0000% ( NOTNUM= 92 / ALL= 92 ) 
dropped



dropped

CHART 7667	100.00	[('', 52)]
Numeric observation :0.0000% ( NOTNUM= 52 / ALL= 52 ) 
dropped

CHART 1674	100.00	[('', 41)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 41 ) 

CHART 2932	100.00	[('', 11)]
Numeric observation :90.9091% ( NOTNUM= 1 / ALL= 11 ) 

CHART 4990	100.00	[('', 57)]
Numeric observation :0.0000% ( NOTNUM= 57 / ALL= 57 ) 
dropped

CHART 8265	100.00	[('', 21)]
Numeric observation :0.0000% ( NOTNUM= 21 / ALL= 21 ) 
dropped

CHART 1042	100.00	[('', 1)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 1 ) 

CHART 1495	100.00	[('', 82)]
Numeric observation :97.5610% ( NOTNUM= 2 / ALL= 82 ) 

CHART 5781	100.00	[('', 2)]
Numeric observation :0.0000% ( NOTNUM= 2 / ALL= 2 ) 
dropped

CHART 8298	100.00	[('', 47)]
Numeric observation :97.8723% ( NOTNUM= 1 / ALL= 47 ) 

CHART 228290	100.00	[('', 97)]
Numeric observation :0.0000% ( NOTNUM= 97 / ALL= 97 ) 
dropped

CHART 4193	100.00	[('', 57)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 57 ) 

CHART 4676	100.0

Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 2 ) 

CHART 6076	100.00	[('', 4)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 4 ) 

CHART 7204	100.00	[('', 13)]
Numeric observation :0.0000% ( NOTNUM= 13 / ALL= 13 ) 
dropped

CHART 6848	100.00	[('', 22)]
Numeric observation :0.0000% ( NOTNUM= 22 / ALL= 22 ) 
dropped

CHART 224711	100.00	[('sec', 383)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 383 ) 

CHART 226180	100.00	[('', 27621)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 27621 ) 

CHART 6065	100.00	[('', 92)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 92 ) 

CHART 7687	100.00	[('', 31)]
Numeric observation :0.0000% ( NOTNUM= 31 / ALL= 31 ) 
dropped

CHART 224850	100.00	[('', 112435)]
Numeric observation :0.0000% ( NOTNUM= 112435 / ALL= 112435 ) 
dropped

CHART 228185	100.00	[('dynes.sec.cm-5/m2', 1506)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 1506 ) 

CHART 1067	100.00	[('', 3)]
Numeric observation :0.0000% ( NOTNUM= 3 / ALL= 3 ) 
dropped

CH

CHART 3607	100.00	[('', 440057)]
Numeric observation :0.0000% ( NOTNUM= 440057 / ALL= 440057 ) 
dropped

CHART 4844	100.00	[('', 232)]
Numeric observation :0.0000% ( NOTNUM= 232 / ALL= 232 ) 
dropped

CHART 224370	100.00	[('', 173768)]
Numeric observation :0.0000% ( NOTNUM= 173768 / ALL= 173768 ) 
dropped

CHART 6489	100.00	[('', 5)]
Numeric observation :80.0000% ( NOTNUM= 1 / ALL= 5 ) 

CHART 3657	60.46	[('', 367), ('%', 240)]
Numeric observation :39.2092% ( NOTNUM= 369 / ALL= 607 ) 
dropped

CHART 227293	100.00	[('', 144018)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 144018 ) 

CHART 224891	100.00	[('', 20078)]
Numeric observation :0.0000% ( NOTNUM= 20078 / ALL= 20078 ) 
dropped

CHART 1969	100.00	[('', 11)]
Numeric observation :0.0000% ( NOTNUM= 11 / ALL= 11 ) 
dropped

CHART 7809	100.00	[('', 1)]
Numeric observation :0.0000% ( NOTNUM= 1 / ALL= 1 ) 
dropped

CHART 224088	100.00	[('', 405832)]
Numeric observation :0.0000% ( NOTNUM= 405832 / ALL= 405832 ) 
dropped

CHART 4214	


CHART 227591	100.00	[('', 9615)]
Numeric observation :0.0000% ( NOTNUM= 9615 / ALL= 9615 ) 
dropped

CHART 1032	100.00	[('', 179)]
Numeric observation :93.8547% ( NOTNUM= 11 / ALL= 179 ) 

CHART 224922	100.00	[('cm', 2414)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 2414 ) 

CHART 2492	100.00	[('', 95)]
Numeric observation :0.0000% ( NOTNUM= 95 / ALL= 95 ) 
dropped

CHART 7840	100.00	[('', 19)]
Numeric observation :0.0000% ( NOTNUM= 19 / ALL= 19 ) 
dropped

CHART 2482	100.00	[('', 25)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 25 ) 

CHART 5794	100.00	[('', 1)]
Numeric observation :0.0000% ( NOTNUM= 1 / ALL= 1 ) 
dropped

CHART 5282	100.00	[('', 77)]
Numeric observation :0.0000% ( NOTNUM= 77 / ALL= 77 ) 
dropped

CHART 1635	100.00	[('', 8)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 8 ) 

CHART 477	100.00	[('', 456538)]
Numeric observation :0.0000% ( NOTNUM= 456538 / ALL= 456538 ) 
dropped

CHART 2161	100.00	[('', 101)]
Numeric observation :98.0198% ( NOTNUM= 2 /

Numeric observation :0.0000% ( NOTNUM= 18 / ALL= 18 ) 
dropped

CHART 5714	100.00	[('', 23)]
Numeric observation :0.0000% ( NOTNUM= 23 / ALL= 23 ) 
dropped

CHART 4267	100.00	[('', 1051)]
Numeric observation :0.1903% ( NOTNUM= 1049 / ALL= 1051 ) 
dropped

CHART 75	97.26	[('mA', 10341), ('', 291)]
Numeric observation :96.7645% ( NOTNUM= 344 / ALL= 10632 ) 

CHART 227566	100.00	[('', 82115)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 82115 ) 

CHART 3651	93.97	[('mmHg', 265), ('', 17)]
Numeric observation :12.4113% ( NOTNUM= 247 / ALL= 282 ) 
dropped

CHART 561	100.00	[('', 8166)]
Numeric observation :0.0000% ( NOTNUM= 8166 / ALL= 8166 ) 
dropped

CHART 8425	100.00	[('', 5501)]
Numeric observation :0.0000% ( NOTNUM= 5501 / ALL= 5501 ) 
dropped

CHART 224366	100.00	[('ml/hr', 7891)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 7891 ) 

CHART 1044	100.00	[('', 142416)]
Numeric observation :0.0000% ( NOTNUM= 142416 / ALL= 142416 ) 
dropped

CHART 6055	100.00	[('', 1)]
Numeric ob

Numeric observation :0.0000% ( NOTNUM= 5998 / ALL= 5998 ) 
dropped

CHART 227620	100.00	[('', 2763)]
Numeric observation :0.0000% ( NOTNUM= 2763 / ALL= 2763 ) 
dropped

CHART 6631	100.00	[('', 10)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 10 ) 

CHART 2649	100.00	[('', 15)]
Numeric observation :0.0000% ( NOTNUM= 15 / ALL= 15 ) 
dropped

CHART 1747	100.00	[('', 1)]
Numeric observation :0.0000% ( NOTNUM= 1 / ALL= 1 ) 
dropped

CHART 1201	100.00	[('', 7)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 7 ) 

CHART 946	100.00	[('', 262)]
Numeric observation :94.6565% ( NOTNUM= 14 / ALL= 262 ) 

CHART 226117	100.00	[('', 2273)]
Numeric observation :100.0000% ( NOTNUM= 0 / ALL= 2273 ) 

CHART 1647	100.00	[('', 8)]
Numeric observation :0.0000% ( NOTNUM= 8 / ALL= 8 ) 
dropped

CHART 4880	100.00	[('', 19)]
Numeric observation :0.0000% ( NOTNUM= 19 / ALL= 19 ) 
dropped

CHART 1414	100.00	[('', 2)]
Numeric observation :0.0000% ( NOTNUM= 2 / ALL= 2 ) 
dropped

CHART 2072	100.00	[('', 23

## Unit inconsistency

Here are itemids having two or more different units.

For [211, 505], they have the same unit in fact. Keep them.

For [3451, 578, 113], the major unit covers > 90% of all records. Keep them.

For [3723], it is just a typo and we keep all.

In [5]:
# List the itemids that use more than one unit, ordered by the share of
# their most frequent unit (ascending).
for i, chartunits, percentage in sorted(multiple_units, key=itemgetter(2)):
    total2 = sum(row[1] for row in chartunits)
    percentage = float(chartunits[0][1]) / total2 * 100.
    print("".join(["CHART ", str(i), "\t", "{:.4f}".format(percentage), '\t', str(chartunits)]))

CHART 3723	59.3440	[('cm', 258375), ('kg', 177010)]
CHART 211	67.2600	[('BPM', 3484614), ('bpm', 1696195)]
CHART 578	94.0624	[('.', 277532), ('cmH20', 12941), ('', 4578)]
CHART 3451	94.3170	[('kg', 179206), ('cm', 10793), ('', 5)]
CHART 113	97.4122	[('mmHg', 1167662), ('%', 23460), ('', 7559)]
CHART 227441	99.3007	[('mg/dL', 142), ('units', 1)]
CHART 505	99.4428	[('cmH20', 350365), ('', 1958), ('cmH2O', 5)]


In [6]:
# Expose the itemids rejected by the numeric filter under the name the
# following cells use, and show them together with their count.
dropped_id = dropped
print(dropped, len(dropped))

[2848, 3565, 6158, 4349, 4718, 3370, 4466, 223837, 224890, 496, 8394, 6122, 4888, 2311, 6967, 599, 3323, 3071, 2769, 4321, 6169, 225099, 1820, 4875, 224854, 7477, 2627, 5729, 359, 225003, 3322, 228307, 227837, 1478, 3367, 1543, 7003, 1229, 227677, 6312, 2150, 3483, 7981, 4650, 224889, 3666, 4722, 224692, 8519, 1116, 224960, 4388, 8087, 5142, 7697, 7818, 5796, 2452, 227570, 6960, 6068, 5605, 5080, 537, 5099, 2300, 7214, 224450, 8372, 8178, 227782, 4475, 6680, 325, 223991, 223947, 1999, 227834, 1333, 6782, 4986, 3124, 5561, 5048, 225958, 3510, 7300, 7088, 5329, 4910, 227683, 3335, 6309, 227808, 228153, 8399, 227752, 1389, 1364, 227783, 655, 224084, 167, 6163, 224991, 5656, 478, 8523, 224988, 458, 4414, 629, 223782, 3564, 1606, 2761, 4355, 2693, 227240, 227929, 4360, 6210, 3492, 5347, 224633, 1000, 227474, 227961, 2542, 1963, 5210, 6118, 228448, 77, 225743, 140, 4976, 56, 6790, 3384, 4291, 8466, 224101, 228449, 4449, 5762, 7934, 224976, 227963, 224005, 223840, 712, 228446, 1910, 224935, 8

In [7]:
def numerical_ratio(units):
    """Return the fraction of strings in ``units`` that start with a number.

    A string counts as numeric when it begins with an integer or decimal
    literal (``12``, ``12.``, ``.5``, ``12.5``). The pattern is only anchored
    at the start, so any trailing text after that prefix is ignored.

    Args:
        units: iterable of strings to inspect.

    Returns:
        A float between 0.0 and 1.0; 0.0 when ``units`` is empty.
    """
    number_prefix = re.compile(r'(\d+\.\d*)|(\d*\.\d+)|(\d+)')
    units = list(units)
    if not units:
        # avoid dividing by zero: no values means no numeric values
        return 0.0
    hits = sum(1 for unit in units if number_prefix.match(unit) is not None)
    return 1.0 * hits / len(units)

In [8]:
def dropped_value_list_unit_task(dropped_id):
    """Fetch the recorded values of each dropped chartevents itemid.

    For every itemid the rows of ``mimiciii.chartevents`` whose ``hadm_id``
    appears in the ``admission_ids`` relation are grouped by
    ``(value, valueuom)`` and counted.

    Args:
        dropped_id: iterable of itemids to query.

    Returns:
        A list with one ``(itemid, rows)`` tuple per itemid, where ``rows`` is
        a list of ``(value, valueuom, count)`` ordered by descending count.
    """
    # The itemid is passed as a bound parameter (%s placeholder, the style
    # psycopg2 uses) instead of being concatenated into the SQL text.
    query = ('SELECT value, valueuom, count(*) as x FROM mimiciii.chartevents as lb '
             'WHERE itemid = %s and hadm_id in (select * from admission_ids) '
             'GROUP BY value, valueuom ORDER BY x DESC')
    dropped_value = []
    conn = getConnection()
    try:
        cur = conn.cursor()
        try:
            for d in tqdm(dropped_id):
                # int() turns numpy integers (e.g. from np.array_split) into
                # plain ints that the driver can bind.
                cur.execute(query, (int(d),))
                dropped_value.append((d, cur.fetchall()))
        finally:
            cur.close()
    finally:
        # release the connection even when a query fails
        conn.close()
    return dropped_value

# Query the values of all dropped itemids in parallel, one chunk of ids per
# worker process.
numworkers = 4
p = Pool(numworkers)
dropped_id_units = np.array_split(dropped_id, numworkers)
dropped_value_list = [p.apply_async(dropped_value_list_unit_task, args=(dropped_id_unit,)) for dropped_id_unit in dropped_id_units]
# Shut the pool down once all tasks are submitted so the worker processes
# do not outlive this cell.
p.close()
p.join()
dropped_value_list = [x.get() for x in dropped_value_list]
dropped_value = list(itertools.chain.from_iterable(dropped_value_list))
# The (itemid, rows) pairs are ragged, so request an object array explicitly;
# recent NumPy versions refuse to build one implicitly.
np.save('res/chart_dropped_value.npy', np.array(dropped_value, dtype=object))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=955.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=954.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=955.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=955.0), HTML(value='')))







In [9]:
# Summary of the first filtering: number of dropped itemids, sizes of the
# kept id/unit lists, then the lists themselves.
print(len(dropped_id))
n_valid, n_valid_unit = len(valid_chart), len(valid_chart_unit)
print(n_valid, n_valid_unit)
print(valid_chart, valid_chart_unit, sep='\n')

3819
2644 2644
[2026, 3028, 2409, 225771, 6873, 2961, 2265, 1101, 227597, 8533, 6810, 225118, 224144, 227366, 3652, 228344, 227056, 5587, 6466, 1060, 26, 8549, 227346, 220739, 227021, 5565, 220277, 224178, 1726, 2640, 1487, 786, 220580, 226730, 227916, 7245, 2755, 3142, 541, 1582, 4198, 226746, 6003, 227429, 142, 226753, 6585, 227537, 226892, 3045, 7027, 2019, 6465, 6987, 8446, 227000, 226137, 6170, 1982, 220587, 2572, 8305, 71, 624, 626, 3242, 823, 1528, 8484, 8491, 1374, 1981, 6286, 2925, 2363, 228099, 225070, 228374, 2522, 211, 7632, 7493, 6964, 225112, 224953, 228146, 226772, 1966, 8048, 2373, 5865, 6129, 224054, 8301, 224674, 2112, 227543, 2339, 3082, 8362, 6657, 7534, 227872, 2992, 2783, 1542, 8517, 1551, 226980, 2717, 224367, 7353, 227448, 6058, 223765, 429, 224754, 226113, 3029, 2480, 2687, 6906, 7650, 2692, 3161, 6768, 226993, 6643, 530, 228336, 1475, 224684, 224675, 8061, 1978, 223763, 228163, 3663, 6409, 226775, 2353, 227039, 224332, 5558, 223848, 7262, 803, 7982, 494, 2856,

## Store selected features in first filtering

These features are all numerical features.

In [10]:
# Persist the itemids kept by the first filtering together with the unit
# chosen for each of them.
filtered_chart = {'id': valid_chart, 'unit': valid_chart_unit}
np.save('res/filtered_chart.npy', filtered_chart)
# np.save('res/filtered_chart_cate',{'id':[223758],'unit':None})
print('saved!')

saved!


## Divide dropped features in first filtering

- Features where at least half of the distinct recorded values contain no number at all (they cannot pass the numeric parser): categorical features
- Otherwise, features where at least half of the distinct recorded values are ratios such as "135/70": ratio features
- otherwise: (possible) numerical features, we will parse them later

In [11]:
dropped_value = np.load('res/chart_dropped_value.npy', allow_pickle=True).tolist()
valid_chart_num = []
valid_chart_num_unit = []
valid_chart_cate = []
valid_chart_ratio = []

# Patterns compiled once: a number anywhere in the text, and a text that is
# exactly "<number>/<number>".
any_number = re.compile(r'(\d+\.\d*)|(\d*\.\d+)|(\d+)')
ratio_value = re.compile(r'{0}\/{0}'.format(r'((\d+\.\d*)|(\d*\.\d+)|(\d+))'))

for itemid, rows in dropped_value:
    # Each row is one distinct (value, valueuom) pair with its row count;
    # the shares below are taken over distinct pairs, not over row counts.
    texts = [str(value) for value, _uom, _cnt in rows]
    n_text = sum(1 for text in texts if any_number.search(text) is None)
    n_ratio = sum(1 for text in texts if ratio_value.fullmatch(text) is not None)
    n_rows = len(rows)

    if n_text / n_rows >= 0.5:
        valid_chart_cate.append(itemid)
    elif n_ratio / n_rows >= 0.5:
        valid_chart_ratio.append(itemid)
        print(rows)
    else:
        valid_chart_num.append(itemid)
        # unit of the first row; a missing unit is stored as ''
        first_unit = rows[0][1]
        valid_chart_num_unit.append('' if first_unit is None else first_unit)

print(len(valid_chart_num), len(valid_chart_cate), len(valid_chart_ratio))
print(valid_chart_num, valid_chart_num_unit, valid_chart_ratio)

[('127/58', None, 3), ('132/61', None, 1), ('137/63', None, 1), ('128/53', None, 1), ('135/60', None, 1), ('140/68', None, 1), ('124/58', None, 1), ('130/59', None, 1), ('129/62', None, 1), ('157/67', None, 1), ('134/68', None, 1), ('215/91', None, 1), ('116/56', None, 1), ('110/57', None, 1), ('124/59', None, 1), ('144/60', None, 1), ('185/81', None, 1), ('122/58', None, 1), ('156/62', None, 1), ('129/58', None, 1), ('143/69', None, 1), ('138/61', None, 1), ('140/66', None, 1), ('122/54', None, 1), ('115/55', None, 1), ('133/56', None, 1), ('139/62', None, 1), ('120/56', None, 1), ('120/58', None, 1), ('143/62', None, 1), ('117/53', None, 1), ('124/54', None, 1), ('156/63', None, 1), ('120/57', None, 1), ('127/57', None, 1), ('130/60', None, 1), ('121/60', None, 1), ('123/57', None, 1), ('138/68', None, 1)]
[('110/46', None, 1), ('112/45', None, 1), ('119/52', None, 1), ('127/56', None, 1), ('126/56', None, 1), ('105/45', None, 1), ('111/42', None, 1), ('119/54', None, 1), ('129/59', 

[(None, None, 1), ('111/64', None, 1), ('98/60', None, 1), ('119/94', None, 1), ('119/74', None, 1), ('126/74', None, 1), ('121/72', None, 1), ('114/64', None, 1), ('89/59', None, 1), ('122/72', None, 1), ('123/72', None, 1), ('130/80', None, 1), ('97/65', None, 1), ('97/57', None, 1), ('109/63', None, 1), ('120/69', None, 1), ('93/61', None, 1), ('107/71', None, 1)]
[('151/63', None, 2), ('154/63', None, 2), ('143/56', None, 1), ('154/62', None, 1), ('130/54', None, 1), ('148/62', None, 1), ('140/55', None, 1), ('111/50', None, 1), ('139/58', None, 1), ('144/66', None, 1), ('170/66', None, 1), ('181/75', None, 1), ('158/65', None, 1), ('163/66', None, 1), ('131/61', None, 1), ('108/49', None, 1), ('154/64', None, 1), ('153/56', None, 1), ('115/52', None, 1), ('156/65', None, 1), ('145/50', None, 1), ('136/60', None, 1), ('172/77', None, 1), ('144/55', None, 1), ('158/67', None, 1), ('142/34', None, 1), ('122/55', None, 1), ('161/64', None, 1), ('141/61', None, 1), ('168/75', None, 1),

## Store 3 kinds of features

In [12]:
# Show what is about to be stored, then write one file per feature kind.
print(len(valid_chart_num), len(valid_chart_num_unit), len(valid_chart_cate))
print(valid_chart_num, valid_chart_num_unit, valid_chart_cate)
for path, payload in (
    ('res/filtered_chart_num', {'id': valid_chart_num, 'unit': valid_chart_num_unit}),
    ('res/filtered_chart_cate', {'id': valid_chart_cate, 'unit': None}),
    ('res/filtered_chart_ratio', {'id': valid_chart_ratio, 'unit': None}),
):
    np.save(path, payload)

314 314 3405
[223837, 496, 2311, 3323, 3322, 3483, 8087, 227570, 225958, 227808, 3564, 1000, 227961, 712, 8410, 653, 8377, 65, 804, 1233, 224181, 5931, 2445, 5943, 1057, 5668, 417, 639, 1891, 225977, 228156, 222, 562, 2131, 1886, 8383, 7081, 3751, 228016, 405, 228007, 228009, 3359, 224413, 6402, 563, 479, 8384, 1771, 2145, 1451, 1394, 459, 3109, 851, 228013, 7045, 224716, 7372, 5, 227984, 5952, 669, 708, 534, 223907, 228372, 223925, 3111, 368, 2226, 372, 2945, 224069, 1837, 228008, 606, 373, 523, 225409, 228018, 228017, 1734, 404, 556, 3594, 2730, 6114, 2349, 1412, 690, 112, 7018, 1561, 225413, 6537, 227378, 7988, 8465, 1052, 224436, 483, 1342, 28, 585, 762, 1730, 97, 3321, 369, 226152, 1413, 826, 4186, 3591, 607, 1560, 858, 2774, 2062, 857, 8400, 2519, 793, 224831, 4183, 3618, 3601, 482, 6283, 3206, 1471, 2382, 1334, 1036, 6640, 227805, 308, 1456, 3299, 8403, 223, 3320, 2056, 852, 1627, 3622, 2185, 8413, 555, 225953, 8464, 39, 228442, 1469, 5321, 225979, 102, 2, 227131, 4603, 6131, 73