In [2]:
import boto3
import pandas as pd
from sagemaker import get_execution_role
role = get_execution_role()
s3 = boto3.resource('s3')

# ICUsICS DB tutorial

ICUsICS is an anonymized database built from the data stored into the Clinical Information System (CIS) database of 6 Intensive Care Units (ICUs) from the Catalan Institute of Health (ICS). Actually, it is a database of databases, because each ICU belongs to a different hospital and each CIS presents its own particularities. However, the tables architecture of each database is identical between hospitals, which facilitates data search and extraction.

ICUsICS is not hosted as a database 'per se', but as a directory of folders (tables) with csv files inside (registries). Inside icuics-db, there are 6 folders, 1 for each hospital, and inside them there are 10 folders:  

patients: it contains patient-level info (id, hospital, demografics and admission and discharge time and wards)  
diagnoses: table with the diagnoses  
insertions: table with the insertions  
variables_ref: it contains info of the variables present in the database (id, hospital, name, type). Key info: vartype 1, 2, 4 and 8 mean v_monitored, v_labres, v_observed and v_derived respectivelly (the table where the variable is stored). Key info: datatype 0, 1 and 2 mean numeric, categoric and checkbox respectivelly.  
v_monitored: table with registries for vartype 1 variables  
v_labres: table with registries for vartype 2 variables  
v_observed: table with registries for vartype 4 variables  
v_derived: table with registries for vartype 8 variables  
drugs_ref: it contains info of the drugs present in the database (id, hospital, name, formunit, unit, etc.)  
drugs: table with registries for drugs 

## Example of data search and extraction [Invasive Mechanical Ventilation (IMV) example]

### Import variables_ref tables

In [17]:
# set db and hospital

db='icusics-db-demo'
h='h3'

# import data into pandas dfs

variables_ref = pd.read_csv(f's3://{db}/{h}_db/variables_ref/{h}_variables_ref.csv')

### Look for the variable

In [47]:
# define key characters (remember that strings in this db can be in english, catalan or spanish language) to start a blind search

key_chars = 'vent|mec|inv'

result = variables_ref[(variables_ref['name'].str.contains(key_chars, case=False, na=False)) | (
    variables_ref['description'].str.contains(key_chars, case=False, na=False)) | (
    variables_ref['choicestringvalue'].str.contains(key_chars, case=False, na=False))]

print(queryset.shape)

print("To many results, so you decide to ask to the mentors and they say that for that hospital, this feature is a categorical (datatype=1) and observed (vartype=4) \
feature called 'Teràpia real O2' with the option 'Vent Mecànica'")

result2 = result[(result['datatype']==1) & (result['vartype']==4) & (result['name'].str.contains('Teràpia real O2', case=False, na=False)) & (
    result['choicestringvalue'].str.contains('Vent Mecànica', case=False, na=False))]

print('So you finally get your result:')
result2

(333, 9)
To many results, so you decide to ask to the mentors and they say that for that hospital, this feature is a categorical (datatype=1) and observed (vartype=4) feature called 'Teràpia real O2' with the option 'Vent Mecànica'
So you finally get your result:


Unnamed: 0,a_variableid,hospital_coded,vartype,datatype,name,abbreviation,description,choicecode,choicestringvalue
1949,3015002262,3,4,1,Teràpia real O2,O2 Teràpia,DI 21.CONTROL RESPIRATORI.\nVariable utilitzad...,12.0,Vent Mecànica


### Get all registries of a variable

In [33]:
%%time

bucket = s3.Bucket('icusics-db-demo')
imv_registries = pd.DataFrame()

for my_bucket_object in bucket.objects.all():
    
    if all(x in my_bucket_object.key for x in ['h3', 'v_observed']):
            
        chunk = pd.read_csv(f's3://{db}/{my_bucket_object.key}')
        df_chunk = chunk[(chunk['a_variableid']==3015002262) & (chunk['value']==12)]
        imv_registries = pd.concat([imv_registries, df_chunk])

In [34]:
imv_registries.head()

Unnamed: 0,a_patientid,a_variableid,time,value
80974,3961761,3015002262,3867,12.0
80977,3961761,3015002262,5307,12.0
80978,3961761,3015002262,4827,12.0
80979,3961761,3015002262,4347,12.0
80980,3961761,3015002262,6687,12.0


In [36]:
print('In hospital 3 there are', imv_registries['a_patientid'].nunique(), 'patients that received IMV')

In hospital 3 there are 3223 patients that received IMV


### Let's explore the percentage of patients with IMV from the total and the mortality of each group

In [45]:
patients = pd.read_csv(f's3://{db}/{h}_db/patients/{h}_patients_ref.csv')
print('The percentage of patients that received IMV from the total is',imv_registries['a_patientid'].nunique()/patients['a_patientid'].nunique()*100)
print('And the mortality according to each group (IMV vs no IMV) is, IMV:')
print(patients[patients['a_patientid'].isin(tuple(set(imv_registries['a_patientid'])))]['hospital_outcome'].value_counts(normalize=True))
print('no IMV:')
print(patients[~patients['a_patientid'].isin(tuple(set(imv_registries['a_patientid'])))]['hospital_outcome'].value_counts(normalize=True))
print('We can see how the mortality of those who received IMV is higher (31%) among those who do not (7%), and BTW, the overall mortality in hospital 3 is:')
print(patients['hospital_outcome'].value_counts(normalize=True))

The percentage of patients that received IMV from the total is 46.52136258660508
And the mortality according to each group (IMV vs no IMV) is, IMV:
ALIVE     0.687558
EXITUS    0.312442
Name: hospital_outcome, dtype: float64
no IMV:
ALIVE     0.926316
EXITUS    0.073684
Name: hospital_outcome, dtype: float64
We can see how the mortality of those who received IMV is higher (31%) among those who do not (7%), and BTW, the overall mortality in hospital 3 is:
ALIVE     0.815242
EXITUS    0.184758
Name: hospital_outcome, dtype: float64


Let's create a cohort that accomplishes the following inclusion/exclusion criteria:  
Patients 

In [49]:
patients.head(2)

Unnamed: 0,a_patientid,hospital_coded,patientsex,age,height,weight,bmi,hospadmtime,admwardname,distime,diswardname,hospdistime,hospital_outcome
0,3861270,3,M,70,160,80,31,-683,UCI GENERAL,14722,UCI GENERAL,34887,ALIVE
1,3477137,3,M,70,170,50,17,-1603,UCIM-UCO Joan XXIII,4227,UROLOGIA,11007,ALIVE


In [55]:
# Check k=5 anonymization:

patients.groupby(['patientsex','age','height','weight','hospital_outcome'], as_index=False).agg({'a_patientid':'nunique'})['a_patientid'].min()

5

In [59]:
variables_ref = pd.read_csv(f's3://{db}/{h}_db/variables_ref/{h}_variables_ref.csv')

In [71]:
key_chars = 'apache 2 validado'

result = variables_ref[(variables_ref['name'].str.contains(key_chars, case=False, na=False)) | (
    variables_ref['abbreviation'].str.contains(key_chars, case=False, na=False)) | (
    variables_ref['description'].str.contains(key_chars, case=False, na=False)) | (
    variables_ref['choicestringvalue'].str.contains(key_chars, case=False, na=False))]

result

Unnamed: 0,a_variableid,hospital_coded,vartype,datatype,name,abbreviation,description,choicecode,choicestringvalue
5790,3030000350,3,8,0,APACHE 2 validado,APACHE 2 man,Validated APACHE II score,,


In [None]:
key_chars = 'apache 2 validado|o2 teràpia'

result = variables_ref[(variables_ref['name'].str.contains(key_chars, case=False, na=False)) | (
    variables_ref['description'].str.contains(key_chars, case=False, na=False)) | (
    variables_ref['choicestringvalue'].str.contains(key_chars, case=False, na=False))]

result

In [72]:
%%time

bucket = s3.Bucket('icusics-db-demo')
a2_registries = pd.DataFrame()

for my_bucket_object in bucket.objects.all():
    
    if all(x in my_bucket_object.key for x in ['h3', 'v_derived']):
            
        chunk = pd.read_csv(f's3://{db}/{my_bucket_object.key}')
        df_chunk = chunk[(chunk['a_variableid']==3030000350) & (chunk['value']>20)]
        a2_registries = pd.concat([a2_registries, df_chunk])

CPU times: user 2min 8s, sys: 12.9 s, total: 2min 21s
Wall time: 6min 33s


In [73]:
%%time

bucket = s3.Bucket('icusics-db-demo')
imv_registries = pd.DataFrame()

for my_bucket_object in bucket.objects.all():
    
    if all(x in my_bucket_object.key for x in ['h3', 'v_observed']):
            
        chunk = pd.read_csv(f's3://{db}/{my_bucket_object.key}')
        df_chunk = chunk[(chunk['a_variableid']==3015002262) & (chunk['value']==12)]
        imv_registries = pd.concat([imv_registries, df_chunk])

CPU times: user 15.3 s, sys: 1.39 s, total: 16.7 s
Wall time: 46.5 s


### Create a cohort of patients with:  
1- ICU_LOS > 2 days  
2- Primary diagnose of pneumonia (any type) and secondary diagnose of myopathy (any type)  
3- Central Venous Cateter (CVC)  
4- Invasive Mechanical Ventilation (IMV)  
5- APACHE2 > 20  
6- Lactate > 2mmol/L  
7- HR > 200  
9- Vaso-Active Drugs (VAD)  

patients: it contains patient-level info (id, hospital, demografics and admission and discharge time and wards)  
diagnoses: table with the diagnoses  
insertions: table with the insertions  
variables_ref: it contains info of the variables present in the database (id, hospital, name, type). Key info: vartype 1, 2, 4 and 8 mean v_monitored, v_labres, v_observed and v_derived respectivelly (the table where the variable is stored). Key info: datatype 0, 1 and 2 mean numeric, categoric and checkbox respectivelly.  
v_monitored: table with registries for vartype 1 variables  
v_labres: table with registries for vartype 2 variables  
v_observed: table with registries for vartype 4 variables  
v_derived: table with registries for vartype 8 variables  
drugs_ref: it contains info of the drugs present in the database (id, hospital, name, formunit, unit, etc.)  
drugs: table with registries for drugs 

In [3]:
db='icusics-db-demo'
h='h3'

In [4]:
patients = pd.read_csv(f's3://{db}/{h}_db/patients/{h}_patients_ref.csv')

In [5]:
patients.head(1)

Unnamed: 0,a_patientid,hospital_coded,patientsex,age,height,weight,bmi,hospadmtime,admwardname,distime,diswardname,hospdistime,hospital_outcome
0,3861270,3,M,70,160,80,31,-683,UCI GENERAL,14722,UCI GENERAL,34887,ALIVE


In [6]:
los2d = patients[patients['distime']>2880] # 2 days = 2880 minutes

In [7]:
diags = pd.read_csv(f's3://{db}/{h}_db/diagnoses/{h}_diagnoses.csv')

In [8]:
diags.head(1)

Unnamed: 0,a_patientid,hospital_coded,diag_type,referencecode,referencecodename
0,3070757,3,secondary,976.0/2,farmacs antiinfecciosos i antiinflamatoris lo...


In [27]:
dp_pneumo_codes = tuple(set(diags[diags['referencecodename'].str.contains('pneum', case=False)]['referencecode']))
ds_miopat_codes = tuple(set(diags[diags['referencecodename'].str.contains('miopat', case=False)]['referencecode']))

In [37]:
dp_patlist = tuple(set(diags[(diags['diag_type']=='primary') & (diags['referencecode'].isin(dp_pneumo_codes))]['a_patientid']))
ds_patlist = tuple(set(diags[(diags['diag_type']=='secondary') & (diags['referencecode'].isin(ds_miopat_codes))]['a_patientid']))
d_patlist = list(set(dp_patlist).intersection(ds_patlist))

In [44]:
los2d_diags = los2d[los2d['a_patientid'].isin(d_patlist)]

In [43]:
los2d_diags['hospital_outcome'].value_counts()

ALIVE     71
EXITUS    10
Name: hospital_outcome, dtype: int64

In [45]:
insertions = pd.read_csv(f's3://{db}/{h}_db/insertions/{h}_insertions.csv')

In [46]:
insertions.head(1)

Unnamed: 0,a_patientid,a_insertionid,hospital_coded,starttime,endtime,insertionname,insertionplacename
0,3861270,3000000484,3,253,5798,TUB ENDOTRAQUEAL,BOCA


In [47]:
insertions['insertionname'].value_counts()

CATÈTER VENÓS PERIFÈRIC           21519
SONDA VESICAL                      5260
CATÈTER VENÓS CENTRAL              4542
CATÈTER ARTERIAL                   4438
SONDA DIGESTIVA                    3870
                                  ...  
DRENATGE PUNT INSERCIÓ PLEUREV        1
TUB ENDOTRAQUEAL 7,5                  1
CATÈTER NEFROS/URETEROSTOMIA I        1
DRENATGE 1 / CABEZA D                 1
DRENATGE  4 ABD IZQ                   1
Name: insertionname, Length: 1302, dtype: int64

In [50]:
insertions[insertions['insertionname'].str.contains('central')].value_counts()

a_patientid  a_insertionid  hospital_coded  starttime  endtime  insertionname                   insertionplacename
3345927      3000000315     3               5          777      CATÈTER central PERIFÈRIC       BRAÇ E                1
3792535      3000000318     3               615        3871     CATÈTER VENÓS central periféri  BASILICA D            1
dtype: int64

In [None]:
cvc_patlist = tuple(set(insertions[insertions['insertionname'].str.contains('central')]))

In [48]:
variables_ref = pd.read_csv(f's3://{db}/{h}_db/variables_ref/{h}_variables_ref.csv')

In [None]:
insertions = pd.read_csv(f's3://{db}/{h}_db/insertions/{h}_insertions.csv')