In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from pandas.plotting import register_matplotlib_converters

  from ._conv import register_converters as _register_converters


In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 16, 10

In [3]:
print(tf.__version__)

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

1.15.2


In [4]:
df = pd.read_csv('../../eICU/training/finalData.csv')
df

Unnamed: 0,patientunitstayid,observationoffset,temperature,heartrate,respiration,systemicsystolic,creatinine,wbcx1000,lactate,urineoutputbyweight,diagnosis
0,141227,-1893.0,38.088274,112.0,49.0,122.146981,1.40,48.20,4.300000,2.433090,0
1,141227,-1773.0,38.088274,112.0,49.0,122.146981,1.40,48.20,4.300000,2.433090,0
2,141227,-1663.0,38.088274,112.0,49.0,122.146981,1.40,48.20,4.300000,2.433090,0
3,141227,-1566.0,38.088274,112.0,49.0,122.146981,1.40,48.20,4.300000,2.433090,0
4,141227,-1351.0,38.088274,112.0,49.0,122.146981,1.40,47.95,4.300000,2.433090,0
...,...,...,...,...,...,...,...,...,...,...,...
3543020,3353254,5326.0,38.088274,82.0,12.0,122.146981,1.65,11.32,2.584809,4.767580,0
3543021,3353254,5491.0,38.088274,82.0,12.0,122.146981,1.66,11.51,2.584809,2.383790,0
3543022,3353254,5558.0,38.088274,82.0,12.0,122.146981,1.67,11.70,2.584809,4.767580,0
3543023,3353254,5926.0,38.088274,82.0,12.0,122.146981,1.67,11.70,2.584809,7.151371,0


In [5]:
df = df.astype({'diagnosis': int})
df['diagnosis'].value_counts()

0    3534769
1       8256
Name: diagnosis, dtype: int64

In [6]:
len(df['patientunitstayid'].unique())

18271

In [7]:
train_size = int(len(df) * 0.8)
test_size = len(df) - train_size
train, test = df.iloc[0:train_size], df.iloc[train_size:len(df)]
print(len(train), len(test))

2834420 708605


# Preprocessing

In [8]:
from sklearn.preprocessing import RobustScaler

f_columns = ['patientunitstayid','observationoffset','temperature', 'heartrate', 'respiration', 'systemicsystolic', 'creatinine', 'wbcx1000', 'lactate', 'urineoutputbyweight']

f_transformer = RobustScaler()
cnt_transformer = RobustScaler()

f_transformer = f_transformer.fit(train[f_columns].to_numpy())
cnt_transformer = cnt_transformer.fit(train[['observationoffset']])

train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['observationoffset'] = cnt_transformer.transform(train[['observationoffset']])

test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['observationoffset'] = cnt_transformer.transform(test[['observationoffset']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [9]:
def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)        
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

In [13]:
time_steps = 10

# reshape to [samples, time_steps, n_features]

X_train, y_train = create_dataset(train, train.observationoffset, time_steps)
X_test, y_test = create_dataset(test, test.observationoffset, time_steps)

print(X_train.shape, y_train.shape)

(2834410, 10, 11) (2834410,)


In [14]:
model = keras.Sequential()
model.add(
  keras.layers.Bidirectional(
    keras.layers.LSTM(
      units=128, 
      input_shape=(X_train.shape[1], X_train.shape[2])
    )
  )
)
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [15]:
history = model.fit(
    X_train, y_train, 
    epochs=100, 
    batch_size=128, 
    validation_split=0.2,
    shuffle=False
)

Train on 2267528 samples, validate on 566882 samples
Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 

In [None]:
model.save('m1_checkpoint')

In [None]:
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend();

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_train_inv = cnt_transformer.inverse_transform(y_train.reshape(1, -1))
y_test_inv = cnt_transformer.inverse_transform(y_test.reshape(1, -1))
y_pred_inv = cnt_transformer.inverse_transform(y_pred)

In [None]:
plt.plot(np.arange(0, len(y_train)), y_train_inv.flatten(), 'g', label="history")
plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), y_test_inv.flatten(), marker='.', label="true")
plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), y_pred_inv.flatten(), 'r', label="prediction")
plt.ylabel('Bike Count')
plt.xlabel('Time Step')
plt.legend()
plt.show();

In [None]:
plt.plot(y_test_inv.flatten(), marker='.', label="true")
plt.plot(y_pred_inv.flatten(), 'r', label="prediction")
plt.ylabel('Bike Count')
plt.xlabel('Time Step')
plt.legend()
plt.show();

In [None]:
df