In [1]:

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import Masking
from keras.optimizers import RMSprop
from keras import backend as k
from sklearn.preprocessing import normalize


import os
import pandas as pd
import numpy as np
from sys import platform
from IPython.display import display, HTML
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import time
from contextlib import contextmanager
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
    


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
if platform == "linux":
    # linux
    INPUT_DIR='C:/Users/soufiane.chami/Desktop/PhysioNet/PhysioNet 2019/training/'
    FILES_DIR= 'C:/Users/soufiane.chami/Desktop/PhysioNet/PhysioNet 2019/'
elif platform == "darwin":
        # OS X
        INPUT_DIR='/Users/macbook/Desktop/PhysioNet/2019/training/'
        FILES_DIR= '/Users/macbook/Desktop/PhysioNet/2019/'
elif platform == "win32":
        # Windows...
        INPUT_DIR='C:/Users/soufiane.chami/Desktop/PhysioNet/PhysioNet 2019/training/'
        FILES_DIR= 'C:/Users/soufiane.chami/Desktop/PhysioNet/PhysioNet 2019/'


def get_subjects():
    cols =['subject_id','Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS', 'SepsisLabel']  #list(patient.columns[-7:])
    subjects= pd.DataFrame([], columns=cols)
    i=0
    # os.chdir('C:/Users/soufiane.chami/Desktop/PhysioNet 2019/training')
    for file in os.listdir():
        subject = pd.read_csv(file, sep = "|")
        subject_details = list(subject[cols[1:]].max().values)
        subject_details.insert(0,file.split('.')[0])
        subjects.loc[i]=subject_details
        i+=1
    subjects.rename(columns={'ICULOS':'nb_samples'}, inplace=True)
    return subjects

with timer("get genral data about subjects"):
    os.chdir(INPUT_DIR)
    subjects = get_subjects()
    os.chdir(FILES_DIR)
    subjects.to_csv("Subjects_ID_with_Labels.csv", index=False)
    
    # IDs of subjects wih Sepsis 
    Sepsis_subjects_id= subjects.loc[subjects.SepsisLabel.isin([1])].subject_id.values
    pd.DataFrame(Sepsis_subjects_id, columns=['Sepsis_subjects_id']).to_csv("Sepsis_subjects_id.csv", index=False)
    
    # IDs of subjects wihout Sepsis 
    wihoutSepsis_subjects_id= subjects.loc[subjects.SepsisLabel.isin([0])].subject_id.values
    pd.DataFrame(wihoutSepsis_subjects_id,
             columns=['wihoutSepsis_subjects_id']).to_csv("wihoutSepsis_subjects_id.csv", index=False)

    os.chdir(FILES_DIR)
    
with timer("Create Time To Failure columns"):
    list_of_subjects= pd.read_csv('Sepsis_subjects_id.csv')
    df = pd.DataFrame()
    for subject_id in list_of_subjects.Sepsis_subjects_id.values:
        subject= pd.read_csv('training/'+subject_id+'.psv', sep = "|")
        TTF= [f for f in subject.SepsisLabel.values if f==0 ]
        ttf_len= len(TTF)
#         subject = subject.loc[1:ttf_len]
        subject['Subject_ID']= subject_id
        subject['TTF']= [f for f in range(ttf_len-1, -1,-1)] + [0]*(len(subject) - ttf_len)
        df= df.append(subject)

    cols= [ 'Subject_ID', 'TTF','HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2','Age', 'Gender',
           'HospAdmTime', 'ICULOS', 'SepsisLabel']
    data =df[cols].set_index(['Subject_ID', 'TTF']).copy()

    
with timer("handling missing values"):
    y_cols= ["TTF", 'SepsisLabel']
    features = [f for f in data.columns if f not in y_cols+ ['Subject_ID', 'ICULOS']]
    df= data.reset_index().copy()
    df= df[features].interpolate()
    df = df.fillna(0)
    dat1= data.reset_index()[['Subject_ID', 'ICULOS']+y_cols]
    
with timer("Normalize the Data"):
    from sklearn import preprocessing
    x = df.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled, index=data.index, columns=features)
    df = pd.concat([dat1, df.reset_index()[features]], axis=1) 

get genral data about subjects - done in 48s
Create Time To Failure columns - done in 4s
handling missing values - done in 0s
Normalize the Data - done in 0s


In [3]:
df.head()

Unnamed: 0,Subject_ID,ICULOS,TTF,SepsisLabel,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,Age,Gender,HospAdmTime
0,p02236,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320513,0.0,1.0
1,p02236,2,0,1,0.360825,1.0,0.0,0.561873,0.315068,0.224561,0.21875,0.0,0.320513,0.0,1.0
2,p02236,3,0,1,0.371134,0.94,0.86715,0.494983,0.287671,0.196491,0.166667,0.0,0.320513,0.0,1.0
3,p02236,4,0,1,0.402062,0.97,0.868961,0.51505,0.308219,0.210526,0.1875,0.0,0.320513,0.0,1.0
4,p02236,5,0,1,0.443299,0.98,0.870773,0.461538,0.335616,0.245614,0.125,0.0,0.320513,0.0,1.0


In [4]:
df[df.Subject_ID=='p04445']

Unnamed: 0,Subject_ID,ICULOS,TTF,SepsisLabel,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,Age,Gender,HospAdmTime
9,p04445,1,102,0,0.369845,0.96,0.884058,0.425585,0.276541,0.203509,0.143229,0.0,0.679487,0.0,0.999759
10,p04445,2,101,0,0.31701,1.0,0.887279,0.329431,0.244863,0.182456,0.140625,0.0,0.679487,0.0,0.999759
11,p04445,3,100,0,0.340206,1.0,0.890499,0.294314,0.222603,0.182456,0.145833,0.0,0.679487,0.0,0.999759
12,p04445,4,99,0,0.278351,0.97,0.89372,0.401338,0.277397,0.196491,0.1875,0.0,0.679487,0.0,0.999759
13,p04445,5,98,0,0.283505,1.0,0.894928,0.38796,0.263699,0.185965,0.166667,0.0,0.679487,0.0,0.999759
14,p04445,6,97,0,0.252577,0.99,0.896135,0.341137,0.25,0.189474,0.145833,0.0,0.679487,0.0,0.999759
15,p04445,7,96,0,0.262887,1.0,0.8907,0.337793,0.256849,0.189474,0.135417,0.0,0.679487,0.0,0.999759
16,p04445,8,95,0,0.268041,1.0,0.885266,0.351171,0.25,0.178947,0.145833,0.0,0.679487,0.0,0.999759
17,p04445,9,94,0,0.329897,0.97,0.879831,0.377926,0.260274,0.196491,0.1875,0.0,0.679487,0.0,0.999759
18,p04445,10,93,0,0.293814,0.99,0.874396,0.301003,0.222603,0.17193,0.1875,0.0,0.679487,0.0,0.999759
