# Script to clean the data extracted from MIMIC, and save clean version for future use.

- Remove clearly anomalous values.
- Sum GCS components, remove individual components, and introduce GCS_total.
- Convert units for consisitency with GICU (UK data).
- Introduce 'airway' variable (remove PEEP and AIRWAY)...this is a reduced version of current AIRWAY

In [1]:
import graphlab
import graphlab.aggregate as agg

*Note: in the following mapping we exclude venous po2 and pco2, and diastolic bp.

In [2]:
variable_mapping = dict()
variable_mapping['fio2'] = [226754, 227009, 227010,223835]
variable_mapping['resp'] = [220210, 224688, 224689, 224690]
variable_mapping['po2'] = [226770,227039,220224] 
variable_mapping['pco2'] = [220235,227036]  
variable_mapping['temp'] = [223761, 223762, 224027] 
variable_mapping['hr'] = [220045]
variable_mapping['bp'] = [220050, 220059, 220179, 224167, 225309, 227243, 226850, 226852]
variable_mapping['k'] = [220640, 227464, 227442, 226772, 226535]
variable_mapping['na'] = [220645, 226534, 226776]
variable_mapping['hco3'] = [224826, 226759, 227443]
variable_mapping['spo2'] = [220227, 220277, 226860,226861,226862,226863,226865,228232]
variable_mapping['bun'] = [225624, 227000, 227001]
variable_mapping['airway'] = [223838, 224832, 224391, 227810,223837, 224829]
variable_mapping['gcs'] = [220739, 223900, 223901, 226755, 226756, 226757, 226758, 227011, 227012, 227013, 227014,228112]
variable_mapping['creatinine'] = [220615, 226752, 227005]
variable_mapping['pain'] = [223791, 227881]
variable_mapping['urine'] = [227519, 227059]
variable_mapping['haemoglobin'] = [220228]
variable_mapping['peep'] = [220339, 224699, 224700]

We will use the following dictionary of tuples to check for non-physical variable values:

In [3]:
physical_limit = dict()
physical_limit['creatinine'] = (0.0, 50.0)
physical_limit['fio2'] = (0.0, 100.0)     
physical_limit['resp'] = (0.0,100.0)      
physical_limit['po2'] = (0.0,500.0)
physical_limit['pco2'] = (0.0,500.0) 
physical_limit['temp'] = (80.0, 120.0)    
physical_limit['hr'] = (0.0, 250.0)  
physical_limit['bp'] = (0.0, 500.0) 
physical_limit['k'] = (0.0, 50.0)   
physical_limit['na'] = (0.0, 500.0)  
physical_limit['hco3'] = (0,100.0)    
physical_limit['spo2'] = (10.0, 100.0) 
physical_limit['bun'] = (0.0,300.0)  
physical_limit['gcs'] = (0.0, 16.0)  
physical_limit['pain'] = (0.0, 10.0) 
physical_limit['peep'] = (0.0, 50.0)          
physical_limit['haemoglobin'] = (0.0, 100.0)  
physical_limit['airway'] = (None,None)

In [4]:
def _remove_anomalies(data, physical_limit, variable_dict, verbose=False):
    
    ## define which ITEMIDs correspond to which variables.  
    get_var = lambda it: [variable for variable,items in variable_dict.iteritems() if it in items][0]
    all_itemids = [item for sublist in variable_dict.values() for item in sublist]
    var_id = {item: get_var(item) for item in all_itemids}
    
    data['remove_col'] = data.apply(lambda row: 1 if row['C.VALUENUM']>physical_limit[var_id[row['C.ITEMID']]][1] or row['C.VALUENUM']<physical_limit[var_id[row['C.ITEMID']]][0] else 0)
    
    ## now filter the data frame:
    print "%d rows to remove:" %sum(data['remove_col']==1)
    
    if verbose:
        print "For example:"
        removed = data[data['remove_col']==1].sample(0.001)
        removed.print_rows(num_rows=20)
    
    data = data[data['remove_col']==0]
    data = data.remove_column('remove_col')
    print "Anomalies removed."
    return data

#### Load data and remove anomalies: 

In [5]:
all_data = graphlab.SFrame('mimic_all_data/')
all_data = all_data.filter_by(column_name='C.ITEMID', values=[226062,226063,227516,228151], exclude=True)
all_data = _remove_anomalies(all_data, physical_limit, variable_mapping)

This non-commercial license of GraphLab Create for academic use is assigned to cm1788@bristol.ac.uk and will expire on October 04, 2019.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1553265369.log


250927 rows to remove:
Anomalies removed.


We add column to data that says what type of variable it is (i.e. the variable name):

In [6]:
get_var = lambda it: [variable for variable,items in variable_mapping.iteritems() if it in items][0]
all_data['VARIABLE'] = all_data['C.ITEMID'].apply(lambda itemid: get_var(itemid))
all_data.materialize()

#### We now sum GCS components:  
(And remove those that do not have all 3 cpts measured)

In [13]:
gcs_summed = all_data.filter_by(
                        column_name='C.ITEMID', 
                        values=variable_mapping['gcs']
                                ).groupby(
                                        key_columns=['C.ICUSTAY_ID', 'C.CHARTTIME'], 
                                        operations={'C.VALUENUM':agg.SUM('C.VALUENUM'), 
                                                     'ncpts':agg.COUNT('C.VALUENUM'),
                                                     'hrs_bd':agg.SELECT_ONE('hrs_bd'),
                                                     'C.HADM_ID':agg.SELECT_ONE('C.HADM_ID'),
                                                     'C.SUBJECT_ID':agg.SELECT_ONE('C.SUBJECT_ID'),
                                                     'C.ITEMID':agg.MIN('C.ITEMID'),
                                                     'C.VALUE':agg.SELECT_ONE('C.VALUE',),
                                                     'C.VALUEUOM':agg.SELECT_ONE('C.VALUEUOM'),
                                                     'D.LABEL':agg.SELECT_ONE('D.LABEL'),
                                                     'D.UNITNAME':agg.SELECT_ONE('D.UNITNAME'),
                                                     'II.INTIME':agg.SELECT_ONE('II.INTIME'),
                                                     'II.LOS':agg.SELECT_ONE('II.LOS'),
                                                     'II.OUTTIME':agg.SELECT_ONE('II.OUTTIME'),
                                                     'final_4hr':agg.SELECT_ONE('final_4hr'),
                                                     'final_24hr':agg.SELECT_ONE('final_24hr'),
                                                     'cohort':agg.SELECT_ONE('cohort'),
                                                     'in_h_death':agg.SELECT_ONE('in_h_death'),
                                                     'in_icu_death':agg.SELECT_ONE('in_icu_death'),
                                                     'readmit':agg.SELECT_ONE('readmit'),
                                                     'outcome':agg.SELECT_ONE('outcome'),
                                                     'VARIABLE':agg.SELECT_ONE('VARIABLE')
                                                    })

About 1% of GCS measures are not 'full' (i.e. one or more cpt missing)
We - remove these because it would be unfair to test on two components without altering the test.

In [14]:
_frac_incomplete = sum(gcs_summed['ncpts']<3)/float(len(gcs_summed))
print "%.4f of gcs measures not complete." %_frac_incomplete
gcs_summed = gcs_summed[gcs_summed['ncpts']==3]

0.0127 of gcs measures not complete.


#### Remove all single GCS components and append the complete GCS values to the data:

In [15]:
all_data = all_data.filter_by(column_name='C.ITEMID', values=variable_mapping['gcs'], exclude=True)
gcs_summed = gcs_summed.remove_column('ncpts')
all_data = all_data.append(gcs_summed)
## VARIABLE='GCS' is now GCS total

#### And convert all required units to be consistent with our UK data:

In [16]:
units_to_convert = ['creatinine', 'temp', 'po2', 'pco2', 'bun']

conv_crea = lambda crea: 88.42 * crea  ## convert from mg/dL (MIMIC) to umol/L (GICU)
conv_temp = lambda temp: (temp-32)/1.8 ## convert from Farenhiet (MIMIC) to Celcius (GICU)
conv_bg = lambda gas: 0.1333 * gas   ## convert blod gas from mmHg (MIMIC) to kPa (GICU)
conv_bun = lambda bun: 0.3571 * bun  ## convert mg/dL (MIMC) to mmol/L (GICU)

def _convert(var, val):
    
    new_val = None
    if var=='creatinine':
        new_val = conv_crea(val)
    elif var=='temp':
        if val==None:
            new_val=None
        else:
            new_val = conv_temp(val)
    elif var=='po2' or var=='pco2':
        new_val = conv_bg(val)
    elif var=='bun':
        new_val = conv_bun(val)
        
    return new_val

In [17]:
all_data['C.VALUENUM'] = all_data.apply(lambda row: _convert(row['VARIABLE'], row['C.VALUENUM']) if (row['VARIABLE'] in units_to_convert and row['C.VALUENUM']!=None) else row['C.VALUENUM']) 
all_data.materialize()

#### We now work out if airway is patent...

- We simply use presence of ETT as proxy for non-patent airway. 
- remove PEEP (it would be possible to stipulate PEEP + ETT -> non-patent, but simultaneity calculation is an unecessary complication).

In [18]:
airway_reduced = all_data.filter_by(
                            column_name='C.ITEMID', 
                            values=variable_mapping['airway']
                            ).groupby(key_columns=['C.ICUSTAY_ID', 'C.CHARTTIME'], 
                                      operations={'hrs_bd':agg.SELECT_ONE('hrs_bd'),
                                                 'C.HADM_ID':agg.SELECT_ONE('C.HADM_ID'),
                                                 'C.SUBJECT_ID':agg.SELECT_ONE('C.SUBJECT_ID'),
                                                 'C.ITEMID':agg.MIN('C.ITEMID'),
                                                 'C.VALUE':agg.SELECT_ONE('C.VALUE',),
                                                 'C.VALUEUOM':agg.SELECT_ONE('C.VALUEUOM'),
                                                 'D.LABEL':agg.SELECT_ONE('D.LABEL'),
                                                 'D.UNITNAME':agg.SELECT_ONE('D.UNITNAME'),
                                                 'II.INTIME':agg.SELECT_ONE('II.INTIME'),
                                                 'II.LOS':agg.SELECT_ONE('II.LOS'),
                                                 'II.OUTTIME':agg.SELECT_ONE('II.OUTTIME'),
                                                 'final_4hr':agg.SELECT_ONE('final_4hr'),
                                                 'final_24hr':agg.SELECT_ONE('final_24hr'),
                                                 'cohort':agg.SELECT_ONE('cohort'),
                                                 'in_h_death':agg.SELECT_ONE('in_h_death'),
                                                 'in_icu_death':agg.SELECT_ONE('in_icu_death'),
                                                 'readmit':agg.SELECT_ONE('readmit'),
                                                 'outcome':agg.SELECT_ONE('outcome'),
                                                 'VARIABLE':agg.SELECT_ONE('VARIABLE')
                                                })

airway_reduced['C.VALUENUM'] = airway_reduced['C.ICUSTAY_ID'].apply(lambda x: 1.0)

#### Remove all PEEP and AIRWAY:

In [19]:
all_data = all_data.filter_by(column_name='C.ITEMID', values=variable_mapping['peep'], exclude=True)
all_data = all_data.filter_by(column_name='C.ITEMID', values=variable_mapping['airway'], exclude=True)

In [20]:
airway_reduced['C.VALUENUM'] = airway_reduced['C.VALUENUM'].apply(lambda x: 1.0)

#### Add reduced airway variable back in and save this 'clean' version of the data:

In [21]:
all_data = all_data.append(airway_reduced)
all_data.save('mimic_all_data_CLEANED')

#### We now extract the CALLOUT times (RFD flags) for each icustay and add these to the data: 

For each ICUSTAY want to get the corresponding successful CALLOUTs (only those with outcomes marked as 'Discharged'). There are a small number of stays with mutliple succesful discharges, we remove these instances from the data to avoid confusion (they correspond the patients being transfered between different ICUs).

In [22]:
## Reloading: all_data = graphlab.SFrame('mimic_all_data_CLEANED/') 

In [None]:
%load_ext sql
%sql mysql://root:mysql2016@localhost/MIMIC?unix_socket=/run/mysqld/mysqld.sock
%sql USE MIMIC

In [24]:
callouts = %sql SELECT * FROM CALLOUT
callouts = graphlab.SFrame(callouts.DataFrame())
_stays = all_data.groupby(key_columns='C.ICUSTAY_ID', operations={'HADM_ID':agg.SELECT_ONE('C.HADM_ID'), 'IN':agg.SELECT_ONE('II.INTIME'), 'OUT':agg.SELECT_ONE('II.OUTTIME')})
_stays_join = _stays.join(callouts, how='inner', on='HADM_ID')
_stays_join['RFD'] = _stays_join['CREATETIME'].apply(lambda ti: ti.replace(tzinfo=None))
_stays_join['callout_match'] = _stays_join.apply(lambda row: 1 if (row['RFD']>=row['IN'] and row['RFD']<=row['OUT']) else 0)
counts = _stays_join[(_stays_join['callout_match']==1) * (_stays_join['CALLOUT_OUTCOME']=='Discharged')].groupby(key_columns='C.ICUSTAY_ID', operations={'count':agg.COUNT('RFD')})
print "There are %d stays with more than one successful CALLOUT (i.e. transfers)." %sum(counts['count']>1)

34499 rows affected.
There are 208 stays with more than one successful CALLOUT (i.e. transfers).


#### Remove these stays: 

In [25]:
remove_stays = counts[counts['count']>1]['C.ICUSTAY_ID']
_stays_join = _stays_join[(_stays_join['callout_match']==1) * (_stays_join['CALLOUT_OUTCOME']=='Discharged')]
_stays_join = _stays_join.filter_by(column_name='C.ICUSTAY_ID', values=remove_stays, exclude=True)

In [26]:
print "Of the original %d icu stays," %len(all_data['C.ICUSTAY_ID'].unique())
print "%d remain (have successful single callouts)" %len(_stays_join['C.ICUSTAY_ID'].unique())

Of the original 14430 icu stays,
11286 remain (have successful single callouts)


Note: successful discharge here is different to "positive outcome" follwoing discharge. Successful simply means the patient did actually leave the icu (i.e. the callout was acted upon) 

#### We now join the CALLOUTS (RFD flags) to the original data:

In [27]:
_stays_join = _stays_join['C.ICUSTAY_ID', 'RFD']
all_data = all_data.join(_stays_join, how='inner', on='C.ICUSTAY_ID')

#### We now add a column 'hrs_bRFD':

This is similar to the column 'hrs_bd' which we added previoulsy. We will filter on this column later to construct the feature matrix.

In [28]:
all_data['hrs_bRFD'] = all_data.apply(lambda row: (row['RFD'] - row['C.CHARTTIME']).total_seconds() / float(60**2))
all_data.save('mimic_all_data_CLEANED')

In [29]:
print "There are a total of %d rows in the cleaned data." %len(all_data)

There are a total of 5495513 rows in the cleaned data.
