# Valid Dataset

Here we keep only admissions that are a patient's first admission and for which the patient is at least 15 years old.

In [1]:
from __future__ import print_function

import psycopg2
import datetime
import sys
from operator import itemgetter, attrgetter, methodcaller
import numpy as np
import itertools
import os.path
import matplotlib.pyplot as plt
import math
from multiprocessing import Pool, cpu_count
import re
import traceback
import shutil

from utils import getConnection
from utils import parseUnitsMap
from utils import parseNum
from utils import sparsify

%matplotlib inline

In [2]:
# Load the saved record of first-admission ids. The file holds a dict, so
# .tolist() on the loaded 0-d object array returns that dict.
# NOTE(review): NumPy >= 1.16.3 refuses to unpickle object arrays unless
# allow_pickle=True is passed to np.load -- confirm the NumPy version in use.
_adm_first = np.load('res/admission_first_ids.npy').tolist()
admission_first_ids_list = _adm_first['admission_ids']

# Collect the ids of all admissions that have a data file in admdata/.
# re.match only anchors the start of the name, so the pattern ends with '$'
# to reject names with trailing characters (e.g. 'adm-123.npy.bak'), which
# would otherwise be counted as admissions.
admission_ids = [re.match(r'adm\-(\d+)\.npy$', x) for x in os.listdir('admdata/')]
admission_ids = [int(x.group(1)) for x in admission_ids if x is not None]
print(len(admission_ids), admission_ids[:10])

# Restrict to the ids listed as first admissions; a set gives O(1) membership
# tests for the filter below.
admission_first_ids_set = set(admission_first_ids_list)
admission_first_ids = [x for x in admission_ids if x in admission_first_ids_set]
print(len(admission_first_ids), admission_first_ids[:10])

58576 [194627, 137030, 152528, 147541, 146480, 196830, 181257, 187851, 129808, 162786]
46283 [194627, 137030, 152528, 147541, 146480, 196830, 181257, 187851, 152565, 170719]


In [3]:
# Destination folder for the filtered admission files; created on first run.
TARGETDIR = '../../Data/admdata_valid'
_target_missing = not os.path.exists(TARGETDIR)
if _target_missing:
    os.makedirs(TARGETDIR)

## Store valid data

We store all data files belonging to valid admission ids in a dedicated folder (`../../Data/admdata_valid`).

In [4]:
def copy_valid_admissions(aid):
    """Re-save one admission's data file into TARGETDIR if it is valid.

    An admission is kept when its id is in ``admission_first_ids_set`` and
    the value at index 2 of its ``'general'`` record is at least
    ``15 * 365.25`` (presumably the age in days, i.e. >= 15 years -- verify
    against the code that writes the adm-*.npy files).

    aid: integer admission id; the source file is admdata/adm-<aid>.npy,
        with the id zero-padded to six digits.
    """
    # Ids that are not first admissions are skipped without touching disk.
    if aid not in admission_first_ids_set:
        return
    filename = 'adm-%.6d.npy' % aid
    record = np.load(os.path.join('admdata', filename)).tolist()
    min_age = 15 * 365.25
    if record['general'][2] >= min_age:
        # The loaded object is written back with np.save rather than
        # copying the file.
        np.save(os.path.join(TARGETDIR, filename), record)
        
# Run the filter over all admission ids in parallel worker processes
# (Pool() with no argument uses one worker per CPU).
p = Pool()
try:
    p.map(copy_valid_admissions, admission_ids)
finally:
    # Always shut the pool down, even if a worker raised inside map();
    # otherwise the worker processes would be left running.
    p.close()
    p.join()