## Feature Engineering

This notebook takes the raw clinical data and builds the corresponding features. The clinical data contain a mix of continuous and categorical variables, and some contain more missing values than others.

The general strategy is to window the data into 10 hour blocks, with a one hour prediction of sepsis/no sepsis. For each window, the following variables are retained as time series:
<br />-HR
<br />-MAP
<br />-O2Sat
<br />-SBP
<br />-Resp

The remainder of the variables are summarized as a single value, the median of the ten values in that window. This is a strategy to deal with the fact that there may be > 90% missing data for some variables.




Import libraries

In [14]:
import pandas as pd
import numpy as np
import pdb
import os
import shutil
import warnings

# from google_drive_downloader import GoogleDriveDownloader as gdd

Create an empty feats folder, removing any existing one first

In [15]:
# Start every run from an empty 'feats' directory: an existing one is deleted
# and a fresh one is created. Errors are deliberately not caught here -- if the
# directory cannot be reset, the notebook should stop instead of carrying on
# with a stale or missing output folder.
if os.path.exists('feats'):
    shutil.rmtree('feats')
os.makedirs('feats')

Load the data (the Google Drive download is commented out, so combined.pkl must already be in the working directory)

In [16]:
# Optional download step, left disabled: it relies on the google_drive_downloader
# import, which is also commented out in the import cell.
# #the link to download combined.pkl
# file_id = '1AmIJQ2oo7Cy1w32T8d1v-rXiJKM0wZE-'

# #load in the data and labels
# gdd.download_file_from_google_drive(file_id=file_id, dest_path='./combined.pkl')

# Load the combined table from combined.pkl in the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load a combined.pkl
# that comes from a trusted source.
df = pd.read_pickle('combined.pkl')

In [17]:
# Read one pipe-separated patient file and display it -- presumably to compare the
# raw per-patient format with the combined table (TODO confirm).
# NOTE(review): this is an absolute path on one machine, so the cell fails anywhere
# else; consider a path relative to the notebook or a configurable data directory.
aa_df = pd.read_csv('/Users/zhaoxiuheng/Desktop/课程/Cognitive AI/project/code/training_setA/p000001.psv',sep='|')
aa_df

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,,,,,,,,,,,...,,,,83.14,0,,,-0.03,1,0
1,97.0,95.0,,98.0,75.33,,19.0,,,,...,,,,83.14,0,,,-0.03,2,0
2,89.0,99.0,,122.0,86.0,,22.0,,,,...,,,,83.14,0,,,-0.03,3,0
3,90.0,95.0,,,,,30.0,,24.0,,...,,,,83.14,0,,,-0.03,4,0
4,103.0,88.5,,122.0,91.33,,24.5,,,,...,,,,83.14,0,,,-0.03,5,0
5,110.0,91.0,,,,,22.0,,,,...,,,,83.14,0,,,-0.03,6,0
6,108.0,92.0,36.11,123.0,77.0,,29.0,,,,...,,,,83.14,0,,,-0.03,7,0
7,106.0,90.5,,93.0,76.33,,29.0,,,,...,,,,83.14,0,,,-0.03,8,0
8,104.0,95.0,,133.0,88.33,,26.0,,,,...,,,,83.14,0,,,-0.03,9,0
9,102.0,91.0,,134.0,87.33,,30.0,,,,...,,,,83.14,0,,,-0.03,10,0


In [18]:
# Display the combined table loaded from combined.pkl (pandas shows a truncated view).
df

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,patient
0,,,,,,,,,,,...,,,83.14,0,,,-0.03,1,0,p000001
1,97.0,95.0,,98.0,75.33,,19.0,,,,...,,,83.14,0,,,-0.03,2,0,p000001
2,89.0,99.0,,122.0,86.00,,22.0,,,,...,,,83.14,0,,,-0.03,3,0,p000001
3,90.0,95.0,,,,,30.0,,24.0,,...,,,83.14,0,,,-0.03,4,0,p000001
4,103.0,88.5,,122.0,91.33,,24.5,,,,...,,,83.14,0,,,-0.03,5,0,p000001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552205,80.0,96.0,,115.0,87.00,65.0,15.0,,,,...,,,62.00,0,,,0.00,31,0,p120000
1552206,74.0,97.0,,114.0,83.00,67.0,15.0,,,,...,,,62.00,0,,,0.00,32,0,p120000
1552207,78.0,98.0,,110.0,83.00,69.0,15.0,,,,...,,,62.00,0,,,0.00,33,0,p120000
1552208,82.0,99.0,36.6,124.0,91.00,71.0,16.0,,,,...,,,62.00,0,,,0.00,34,0,p120000


Set up the columns

In [19]:
#get the fraction of missing values for each column (0.0 = none missing, 1.0 = all missing)
print('Percentage Missing:')
print(df.isna().sum()/len(df))

#columns to drop
#drop Unit2 because Unit1 and Unit2 are mutually exclusive
#drop ICULOS as it's basically just an index
cols_to_drop = ['Unit2', 'ICULOS']
#errors='ignore' keeps this cell re-runnable: once the columns are gone, running it
#again is a no-op instead of a KeyError
df = df.drop(columns=cols_to_drop, errors='ignore')

#continuous columns with roughly 15% or less missing data (Resp is the highest at ~15.4%).
#these will be retained as time series
cols_cont = ['HR', 'MAP', 'O2Sat', 'SBP', 'Resp']

#the other feature columns. each of these is reduced to a single value per window
#(its median) in the windowing cell, which copes with the columns that are > 90% missing
cols_to_bin = ['Unit1', 'Gender', 'HospAdmTime', 'Age', 'DBP', 'Temp', 'Glucose', 'Potassium', 'Hct', 'FiO2', 'Hgb', 'pH', 'BUN', 'WBC', 'Magnesium', 'Creatinine', 'Platelets', 'Calcium', 'PaCO2', 'BaseExcess', 'Chloride', 'HCO3', 'Phosphate', 'EtCO2', 'SaO2', 'PTT', 'Lactate', 'AST', 'Alkalinephos', 'Bilirubin_total', 'TroponinI', 'Fibrinogen', 'Bilirubin_direct']


Percentage Missing:
HR                  0.098826
O2Sat               0.130611
Temp                0.661627
SBP                 0.145770
MAP                 0.124513
DBP                 0.313459
Resp                0.153546
EtCO2               0.962868
BaseExcess          0.945790
HCO3                0.958106
FiO2                0.916658
pH                  0.930697
PaCO2               0.944401
SaO2                0.965494
AST                 0.983776
BUN                 0.931344
Alkalinephos        0.983932
Calcium             0.941161
Chloride            0.954603
Creatinine          0.939044
Bilirubin_direct    0.998074
Glucose             0.828943
Lactate             0.973299
Magnesium           0.936896
Phosphate           0.959863
Potassium           0.906891
Bilirubin_total     0.985092
TroponinI           0.990477
Hct                 0.911460
Hgb                 0.926176
PTT                 0.970559
WBC                 0.935932
Fibrinogen          0.993402
Platelets           0.9

Calculate the mean/std for standardization for each variable. Leave out a random 6000 patients as the test set. In other words, don't include those 6000 patients when calculating the mean/std scaling parameters.

In [20]:
# Hold out N_TEST_PATIENTS randomly chosen patients as the test set; the remaining
# patients are the training patients used to fit the scaling parameters.
# The shuffle uses its own seeded generator so that the split -- and with it the
# saved mean/std and the train/test files written later -- is identical on every run.
SPLIT_SEED = 0
N_TEST_PATIENTS = 6000

patients_training_data = df['patient'].unique()
np.random.RandomState(SPLIT_SEED).shuffle(patients_training_data)
patients_training_data = patients_training_data[0:-N_TEST_PATIENTS]

# Mean and standard deviation of every numeric column, computed on the training
# patients only, and saved to mean_std_scaling.pkl.
df_mean_std = df[df['patient'].isin(patients_training_data)].describe().loc[['mean', 'std']]
df_mean_std.to_pickle('mean_std_scaling.pkl')

In [21]:
# Display the per-column mean and std computed on the training patients.
df_mean_std

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Hgb,PTT,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,HospAdmTime,SepsisLabel
mean,84.574709,97.192092,36.97504,123.809814,82.474044,63.875421,18.733432,32.859093,-0.697153,24.085834,...,10.431477,41.306352,11.410357,285.792167,196.106729,61.98254,0.558197,0.497994,-57.297624,0.017888
std,17.325632,2.94882,0.769263,23.250175,16.354169,13.971458,5.094273,8.005832,4.300789,4.371377,...,1.969614,26.306286,7.261413,153.098824,103.765474,16.407827,0.496602,0.499996,168.30993,0.132543


In [22]:
# True if any scaling parameter is NaN; the windowing cell divides by these values,
# so a NaN here would propagate into the standardized features.
df_mean_std.isnull().values.any()

False

In [23]:
# (number of training patients, total number of distinct patients in df)
len(patients_training_data), len(df['patient'].unique())

(34336, 40336)

In [24]:
print('Number of positive/negative training examples:')
# Counts are over the rows of df that belong to training patients (not over windows).
# The label column is selected once and counted with the vectorised Series.sum()
# instead of building the patient mask twice and summing row by row in Python.
train_labels = df.loc[df['patient'].isin(patients_training_data), 'SepsisLabel']
(train_labels == 1).sum(), (train_labels == 0).sum()

Number of positive/negative training examples:


(23641, 1298001)

Loop through each subject and grab a window of 10 hours, with an output label associated with the 11th hour (i.e. predict one hour ahead). The results are written to the "feats" directory, which is created by the cell near the top of this notebook, so that cell must be run first.

In [25]:
#windowing configuration: each example is win_len consecutive rows of one patient,
#labelled from the pred_len rows that immediately follow
win_len = 10
pred_len = 1


def _fill_and_standardize(group):
    """Fill gaps in one patient's rows and z-score the time-series columns.

    Every column in cols_cont and cols_to_bin is forward-filled and then
    back-filled within the patient. Afterwards only the cols_cont columns are
    standardized with the mean/std stored in df_mean_std; the cols_to_bin
    columns are standardized later, after the per-window median is taken.

    NOTE(review): the fill runs over the whole patient before windowing, so the
    back-fill can copy a later observation into rows that precede it.
    """
    fill_cols = cols_cont + cols_to_bin
    group = group.assign(**{col: group[col].ffill().bfill() for col in fill_cols})
    return group.assign(**{
        col: (group[col] - df_mean_std[col]['mean']) / df_mean_std[col]['std']
        for col in cols_cont
    })


def _window_features(group, patient):
    """Build one feature dict per sliding window of a single patient.

    The window start advances one row at a time. Each dict holds, in this order:
    one standardized median per cols_to_bin column (Gender and Unit1 are kept as
    the raw median), 'X_cont' (the win_len x len(cols_cont) array of time-series
    values), 'label' (1 if any SepsisLabel in the following pred_len rows is
    truthy, else 0) and 'patient'. Windows whose X_cont contains a NaN are skipped.
    """
    windows = []
    n_rows = len(group)
    for start in range(n_rows - win_len - pred_len + 1):
        history = group.iloc[start:start + win_len]
        future = group.iloc[start + win_len:start + win_len + pred_len]

        #a NaN can only remain if the patient had no value at all to fill from
        X_cont = history[cols_cont].values
        if np.isnan(X_cont).any():
            continue

        feats = {}
        for col in cols_to_bin:
            #the median of an all-NaN window is NaN; silence the warning it may raise
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                value = history[col].median()
            if col not in ['Gender', 'Unit1']:
                value = (value - df_mean_std[col]['mean']) / df_mean_std[col]['std']
            feats[col] = value

        feats['X_cont'] = X_cont
        feats['label'] = int(any(future['SepsisLabel']))
        feats['patient'] = patient
        windows.append(feats)
    return windows


def _save_chunk(frames, chunk_idx):
    """Split the accumulated windows into train/test and pickle them.

    Writes feats/train_<chunk_idx>.pkl and feats/test_<chunk_idx>.pkl, with the
    'patient' column dropped. Returns True if files were written and False if
    the chunk contained no windows at all (nothing is written in that case,
    which avoids a KeyError on the missing 'patient' column).
    """
    windowed_df = pd.concat(frames).reset_index(drop=True)
    if len(windowed_df) == 0:
        return False

    is_train = windowed_df['patient'].isin(patients_training_data)
    windowed_df[is_train].drop('patient', axis=1).to_pickle('feats/train_%i.pkl' % chunk_idx)
    windowed_df[~is_train].drop('patient', axis=1).to_pickle('feats/test_%i.pkl' % chunk_idx)
    return True


#loop through each patient at a time
save_count = 0
windowed_df_list = []
grouped_by_patient = df.groupby('patient')
for patient, group in grouped_by_patient:
    group = _fill_and_standardize(group.reset_index(drop=True))

    #append this patient's windows (possibly none) to the current chunk
    windowed_df_list.append(pd.DataFrame(_window_features(group, patient)))

    #periodically save every 500 patients. only the last five digits of the id are
    #used, so an id such as p120000 is reported as 20000
    if (int(patient[-5:]) % 500) == 0:
        print('patient %i' % int(patient[-5:]))
        if _save_chunk(windowed_df_list, save_count):
            save_count = save_count+1
        windowed_df_list = []

#save any remaining data
if len(windowed_df_list) > 0:
    print('*******')
    print(len(windowed_df_list))
    _save_chunk(windowed_df_list, save_count)



patient 500
patient 1000
patient 1500
patient 2000
patient 2500
patient 3000
patient 3500
patient 4000
patient 4500
patient 5000
patient 5500
patient 6000
patient 6500
patient 7000
patient 7500
patient 8000
patient 8500
patient 9000
patient 9500
patient 10000
patient 10500
patient 11000
patient 11500
patient 12000
patient 12500
patient 13000
patient 13500
patient 14000
patient 14500
patient 15000
patient 15500
patient 16000
patient 16500
patient 17000
patient 17500
patient 18000
patient 18500
patient 19000
patient 19500
patient 20000
patient 20500
patient 500
patient 1000
patient 1500
patient 2000
patient 2500
patient 3000
patient 3500
patient 4000
patient 4500
patient 5000
patient 5500
patient 6000
patient 6500
patient 7000
patient 7500
patient 8000
patient 8500
patient 9000
patient 9500
patient 10000
patient 10500
patient 11000
patient 11500
patient 12000
patient 12500
patient 13000
patient 13500
patient 14000
patient 14500
patient 15000
patient 15500
patient 16000
patient 16500
pati