# Google Brain - Ventilator Pressure Prediction 
### 1D-Unet with Deep Supervision Approach

Vladimir Simões da Luz Junior

[LinkedIn](https://www.linkedin.com/in/vladimir-simoes-da-luz-junior/)

[GitHub](https://www.linkedin.com/in/vladimir-simoes-da-luz-junior/)


This solution builds on [PPG2ABP](https://arxiv.org/abs/2005.01669), which used a one-dimensional U-Net with deep supervision to predict the arterial blood pressure waveform from the photoplethysmography signal.

We treat each individual breath from the Google Brain - VPP dataset as a single 1D input array containing the *u_in* time series. The model architecture encodes the *u_in* breath signal into a feature map and decodes that feature map into the ventilator's *pressure* signal.

## Libraries

In [None]:
#!pip install h5py==2.9.0 numpy==1.17.0 tqdm==4.19.5 matplotlib==2.2.3 seaborn==0.9.0 scipy==1.4.1 scikit-learn==0.19.2 tensorflow-gpu==1.15.4 Keras==2.2.4 Keras-Applications==1.0.8 Keras-Preprocessing==1.1.0 --force


In [None]:
import pandas as pd
import numpy as np
#import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import time

## Models 
### Approximation Network - UNetDS64

In [None]:
"""
    Models used in experiments
"""
from tensorflow import keras
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, UpSampling1D, concatenate, BatchNormalization, Activation, add
from tensorflow.keras.models import Model, model_from_json
#from keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau



def _double_conv(tensor, filters):
    """Apply two consecutive (Conv1D kernel 3, relu, same padding -> BatchNormalization) pairs."""
    for _ in range(2):
        tensor = Conv1D(filters, 3, activation='relu', padding='same')(tensor)
        tensor = BatchNormalization()(tensor)
    return tensor


def UNetDS64(length, n_channel=1):
    """
        Deeply supervised 1D U-Net whose filter counts are multiples of 64

    The encoder has four stages (double convolution followed by 2x max
    pooling), then a bottleneck, then four decoder stages (2x upsampling,
    concatenation with the matching encoder stage, double convolution).
    A 1x1 convolution head is attached to the bottleneck and to every
    decoder stage, which gives the five supervised outputs.

    Arguments:
        length {int} -- length of the input signal; must be a positive
                        multiple of 16 (four 2x poolings) so that the
                        upsampled tensors line up with the skip connections.
                        None is passed through to Input unchanged.

    Keyword Arguments:
        n_channel {int} -- number of channels in the input signal (default: {1})

    Returns:
        keras.model -- uncompiled model whose outputs are, in order,
                       [out, level1, level2, level3, level4]; each has one
                       channel and the temporal resolution is halved at
                       every level (out is at full resolution)

    Raises:
        ValueError -- if length is given and is not a positive multiple of 16
    """

    # Fail early with a readable message instead of a shape mismatch raised
    # from inside the concatenate layers.
    if length is not None and (length <= 0 or length % 16 != 0):
        raise ValueError(
            "length must be a positive multiple of 16, got {}".format(length)
        )

    base_filters = 64

    inputs = Input((length, n_channel))

    # Encoder: remember each stage's output for the skip connections.
    skips = []
    tensor = inputs
    for multiplier in (1, 2, 4, 8):
        tensor = _double_conv(tensor, base_filters * multiplier)
        skips.append(tensor)
        tensor = MaxPooling1D(pool_size=2)(tensor)

    # Bottleneck and its (coarsest) supervision head.
    tensor = _double_conv(tensor, base_filters * 16)
    level4 = Conv1D(1, 1, name="level4")(tensor)

    # Decoder: deepest skip first; every stage gets its own 1x1 head.
    heads = []
    stages = zip((8, 4, 2, 1), reversed(skips), ("level3", "level2", "level1", "out"))
    for multiplier, skip, head_name in stages:
        tensor = concatenate([UpSampling1D(size=2)(tensor), skip], axis=2)
        tensor = _double_conv(tensor, base_filters * multiplier)
        heads.append(Conv1D(1, 1, name=head_name)(tensor))

    level3, level2, level1, out = heads

    model = Model(inputs=[inputs], outputs=[out, level1, level2, level3, level4])

    return model




## Load training data and first exploratory data analysis

In [None]:
df_train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')

In [None]:
df_train.shape

In [None]:
df_train.head()

In [None]:
df_train.time_step.value_counts()

In [None]:
unique_breaths = df_train['breath_id'].unique()
num_breaths = len(unique_breaths)
print(num_breaths)

In [None]:
df_train['breath_id'][:500].plot();

In [None]:
breath_lengths = df_train[['id','breath_id']].groupby('breath_id').count()['id']
breath_lengths.unique()

In [None]:
BREATH_LENGTH = breath_lengths.unique()[0]

## R and C
R and C values are constant within each breath (having zero standard deviation)



In [None]:
r_c_std_in_breaths = df_train[['breath_id','R','C']].groupby('breath_id').std()
print(r_c_std_in_breaths['R'].unique())
print(r_c_std_in_breaths['C'].unique())

R has only three distinct values:

In [None]:
r_values = df_train[['breath_id', 'R']].groupby('breath_id').mean()['R']
print(r_values)
print()
print('Unique values:')
print(r_values.value_counts())

r_unique = np.sort(r_values.unique()).astype(int)

So does C:


In [None]:
c_values = df_train[['breath_id', 'C']].groupby('breath_id').mean()['C']
print(c_values)
print()
print('Unique values:')
print(c_values.value_counts())

c_unique = np.sort(c_values.unique()).astype(int)

There is about a factor two scatter in the various R/C combinations.

For R = 20 we see C = 50 most often, for R = 5, 50 we see C = 10 most often.



## Time steps in individual breaths
Take a look at time sampling for the first two breaths. Looks like pretty uniform sampling in time.



In [None]:
rc_values = np.array([
    [r, c, len(df_train[(df_train['R'] == r) & (df_train['C'] == c)])//BREATH_LENGTH] 
    for r in r_unique 
    for c in c_unique
])

x = range(len(rc_values))
plt.bar(x, rc_values[:,2])
plt.xticks(x, [str(r) + '_' + str(c) for r, c in rc_values[:,:2] ])
plt.xlabel('R_C')
plt.ylabel('Number counts')
plt.show()

In [None]:
first_breath  = df_train[df_train['breath_id'] == 1]
second_breath = df_train[df_train['breath_id'] == 2]

x = range(BREATH_LENGTH)
t1 = first_breath['time_step']
t2 = second_breath['time_step']
plt.plot(x, t1)
plt.plot(x, t2, ls = '--')

One time step seems to correspond to about



In [None]:
# BREATH_LENGTH samples are separated by BREATH_LENGTH - 1 intervals, so the
# mean step is the covered time span divided by the number of intervals.
(t1.max() - t1.min()) / (BREATH_LENGTH - 1)

The two time series for the first two breaths are not perfectly aligned

In [None]:
plt.plot(t1.values - t2.values);

## What about the target vector "pressure"

In [None]:
# Pressure over the first 800 rows of the training frame. This is more than
# one breath if a breath is 80 rows, as the n = 80 chunking used later in
# this notebook assumes.
plt.plot(df_train.pressure[:800])

In [None]:
plt.plot(df_train.pressure[:800])

We note a strong correlation between the columns *pressure* and *u_in*

In [None]:
plt.plot(df_train.u_in[:80])

In [None]:
plt.plot(df_train.u_in[:1000])

In [None]:
plt.plot(df_train.u_out[:1000])

Are there outliers in the dataset?

In [None]:
plt.boxplot(df_train.u_in);

In [None]:
pressao = plt.boxplot(df_train.pressure);

In [None]:
percentiles = [item.get_ydata()[1] for item in pressao['whiskers']]

In [None]:
percentiles

## From that we can train our 1D-Unet based on the *u_in* and *pressure* time series

### First we will prepare our X input *u_in* array

In [None]:
list1 = df_train.u_in.tolist()

In [None]:
#list1

Normalize input

In [None]:
#input_minima = np.min(list1)
#input_maxima = np.max(list1)
#print("Minimum value of X",input_minima)
#print("Maximum values of X",input_maxima)

In [None]:
#list1 = pd.Series(list1)
#list1 -= input_minima                       # normalizing
#list1 /= (input_maxima-input_minima)

In [None]:
plt.plot(list1[:160])

Select each individual breath from input signal

In [None]:
n = 80
X = [list1[i:i + n] for i in range(0, len(list1), n)]

In [None]:
#print("Minimum value of X Normalized",np.min(X))
#print("Maximum values of X Normalized",np.max(X))
plt.plot(X[0])

In [None]:
#X

### Now preparing our Y *pressure* array

In [None]:
list2 = df_train.pressure.tolist()

In [None]:
#list2

Normalize output

In [None]:
#output_minima = np.min(list2)
#output_maxima = np.max(list2)
#print("Minimum value of Y",output_minima)
#print("Maximum values of Y",output_maxima)

In [None]:
#list2 = pd.Series(list2)
#list2 -= output_minima                       # normalizing
#list2 /= (output_maxima-output_minima)

In [None]:
plt.plot(list2[:160])

Select each individual breath from output signal

In [None]:
# Cut the flat pressure series into consecutive chunks of n samples.
# The comprehension has its own loop variable, so no counter needs to be
# initialised beforehand.
# NOTE(review): this yields one chunk per breath only if the rows are grouped
# by breath and every breath has exactly n rows -- confirm against the data.
Y = [list2[start:start + n] for start in range(0, len(list2), n)]


In [None]:
#print("Minimum value of Y Normalized",np.min(Y))
#print("Maximum values of Y Normalized",np.max(Y))
plt.plot(Y[0])

In [None]:
#Y

Train and Validation data split

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X,Y)

In [None]:
X_train = np.array(X_train)
X_val = np.array(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

In [None]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

## Train Approximation Network

### GPU accelerator config

In [None]:
from tqdm import tqdm
import tensorflow as tf
# Accelerator selection; only the 'GPU' branch is handled below.
ACCELERATOR_TYPE = 'GPU'

if ACCELERATOR_TYPE == 'GPU':
    # NOTE(review): `strategy` is not referenced again in the visible part of
    # this notebook -- the model is neither built nor compiled inside
    # `strategy.scope()`, so this object presumably has no effect on
    # training. Confirm, and either use the scope or drop the strategy.
    strategy = tf.distribute.MirroredStrategy()
    print("GPU")

### Deep Supervision configs

In [None]:
def prepareLabel(Y):

    """
    Prepare label for deep supervised pipeline

    Every ground-truth signal is flattened to 1-D and stored at full
    resolution under 'out'. It is also stored as mean-pooled approximations
    with window lengths 2, 4, 8 and 16 under 'level1' .. 'level4'. The
    signal length is taken from each sample itself, so the function does not
    depend on any module-level variable and also accepts plain Python lists.

    Arguments:
        Y {iterable} -- ground truth signals, one entry per sample

    Returns:
        dictionary -- dictionary containing the 5 level ground truth outputs
                      of the network; each value is a numpy array built from
                      per-sample column vectors of shape (n_points, 1)
    """

    def approximate(inp,w_len):
        """
        Downsamples by taking the mean over consecutive windows

        Arguments:
            inp {array} -- signal
            w_len {int} -- length of window

        Returns:
            array -- downsampled signal (the last window may be shorter
                     when len(inp) is not a multiple of w_len)
        """

        return np.array([np.mean(inp[i:i+w_len]) for i in range(0,len(inp),w_len)])

    # output name -> averaging window used to build that level's label
    windows = (('level1', 2), ('level2', 4), ('level3', 8), ('level4', 16))

    out = {'out': []}
    for name, _ in windows:
        out[name] = []

    for y in tqdm(Y,desc='Preparing Label for DS'):

        signal = np.asarray(y).reshape(-1)                  # works for lists and arrays alike

        out['out'].append(signal.reshape(-1, 1))

        for name, w_len in windows:                         # computing approximations
            out[name].append(approximate(signal, w_len).reshape(-1, 1))

    return {name: np.array(values) for name, values in out.items()}   # converting to numpy array

### Fitting loop

In [None]:
import os
import pickle 

length = 80
model = UNetDS64(length, n_channel=1)
mdlName1 = 'UNetDS64'

# Create the directories for the model checkpoints and the training history.
# exist_ok=True makes reruns harmless without hiding real failures (for
# example a permission error), which the bare `except: pass` used to swallow.
os.makedirs('models', exist_ok=True)
os.makedirs('History', exist_ok=True)

def train_approximation_network(model,X_train, X_val, y_train, y_val):
    """
    Train the deeply supervised approximation network for 10 rounds ("folds")

    Each round compiles the model with an MAE loss weighted per output, fits
    it for up to 100 epochs, writes the best checkpoint (lowest
    'val_out_loss') to models/<name>_model1_fold<k>.h5 and pickles the Keras
    history to History/<name>_model1_fold<k>.p.

    Arguments:
        model {keras.model} -- network with outputs named out, level1..level4
        X_train {array} -- training inputs
        X_val {array} -- validation inputs
        y_train {array} -- full resolution training targets
        y_val {array} -- full resolution validation targets

    Returns:
        None
    """
    output_names = ('out', 'level1', 'level2', 'level3', 'level4')

    # The labels are identical in every round, so build them once instead of
    # once per round.
    Y_train = prepareLabel(y_train)                         # prepare labels for training deep supervision
    Y_val = prepareLabel(y_val)                             # prepare labels for validating deep supervision
    train_targets = {name: Y_train[name] for name in output_names}
    val_targets = {name: Y_val[name] for name in output_names}

    for foldname in range(10):

        print('----------------')
        print('Training Fold {}'.format(foldname+1))
        print('----------------')

        # NOTE(review): the same `model` instance and the same train/val
        # split are used in every round, so weights carry over from one
        # round to the next; the rounds are not independent folds. Confirm
        # whether a freshly initialised network per round was intended.

        # loss = mean absolute error, with deep supervision weights
        model.compile(loss='mean_absolute_error',optimizer='adam',metrics=['mean_squared_error'], loss_weights=[1., 0.9, 0.8, 0.7, 0.6])

        # Reduce learning rate when the full resolution validation loss stalls
        lr = ReduceLROnPlateau(monitor="val_out_loss", factor=0.85,
                               patience=7, verbose=1)
        # Checkpoint callback: keep only the best weights of this round
        checkpoint1_ = ModelCheckpoint(os.path.join('models','{}_model1_fold{}.h5'.format(mdlName1,foldname)), verbose=1, monitor='val_out_loss',save_best_only=True, mode='auto')

        # Early stopping to avoid overfitting
        es = EarlyStopping(monitor="val_out_loss", patience=30,
                           verbose=1, mode="min",
                           restore_best_weights=True)

        # train approximation network for at most 100 epochs
        history1 = model.fit(X_train, train_targets, epochs=100, batch_size=512,
                             validation_data=(X_val, val_targets),
                             callbacks=[lr, checkpoint1_, es], verbose=1)

        # save training history; the context manager closes the file handle
        with open('History/{}_model1_fold{}.p'.format(mdlName1,foldname),'wb') as history_file:
            pickle.dump(history1.history, history_file)
    


In [None]:
train_approximation_network(model, X_train, X_val, y_train, y_val)

## Approximation Model 1 Training/Validation History

In [None]:
fold_val_loss = []
import pickle
# NOTE(review): 'val_loss' is the total (weighted) loss over all five outputs,
# while the checkpoints of each fold are selected on 'val_out_loss' -- confirm
# which one should drive the fold selection.
# for fold in training history
for fold in range(10):
    # load the pickled history; the context manager closes the file handle
    with open('./History/UNetDS64_model1_fold{}.p'.format(fold), 'rb') as file:
        history = pickle.load(file)

    best_val_loss = np.min(history['val_loss'])
    best_epoch = np.argmin(history['val_loss'])

    # print best score and epoch for each fold
    print("Fold: ", fold)
    print("Best model scored: {} \nin epoch: {}".format(best_val_loss,best_epoch),"\n")

    # append validation loss to select best weights
    fold_val_loss.append((best_val_loss,best_epoch))


    # Plot training history loss and val_loss
    plt.plot(history['loss'][:])
    plt.plot(history['val_loss'][:])
    plt.title('model loss')
    plt.ylabel('MAE loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
    


### Which fold's weights lead to the lowest error metric?

In [None]:

fold_loss = [loss for loss, _ in fold_val_loss]          # list of val_loss per fold
min_loss_fold = int(np.argmin(fold_loss))                # 0-based index of the fold with the lowest val_loss
# Read loss and epoch from the same entry. Taking np.min over the
# (loss, epoch) pairs would also consider the epoch numbers, and indexing
# with min_loss_fold-1 would report the epoch of the wrong fold.
min_loss, min_loss_epoch = fold_val_loss[min_loss_fold]
print("Best combination in fold {}, val loss: {}, epoch: {}".format(min_loss_fold,min_loss,min_loss_epoch))    # Print best model information for prediction

## Predict on test set

Load test set

In [None]:
df_test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')

Select each individual breath

In [None]:
list1 = df_test.u_in.tolist()
n = 80
X_test = [list1[i:i + n] for i in range(0, len(list1), n)]
plt.plot(X_test[0])

Instantiate model architecture with best fold weights and predict over test dataset

In [None]:
import os
mdl1 = UNetDS64(80)                                             # creating approximation network
path = "./models"
mdl1.load_weights(os.path.join(path,'UNetDS64_model1_fold{}.h5'.format(min_loss_fold)))   # loading weights
# Feed the network an explicit (breaths, 80, 1) array, as is done for the
# training data, instead of a nested Python list (Keras normally reads a list
# as "one entry per model input").
X_test_array = np.asarray(X_test).reshape(-1, 80, 1)
Y_test_pred_approximate = mdl1.predict(X_test_array,verbose=1)      # predicting the pressure waveform at all five output levels

Concatenate results into a single list

In [None]:
Y_test_pred_approximate = np.array(Y_test_pred_approximate[0])

In [None]:
# Flatten the (samples, points, channels) prediction into one long list,
# breath after breath, keeping the order of the points inside every breath.
samples, _, _ = Y_test_pred_approximate.shape
pressure = []
for breath_prediction in Y_test_pred_approximate[:samples]:
    # reshape(-1) walks the points first and the channels second, which is
    # the order the nested loops over rows and values would produce
    pressure.extend(breath_prediction.reshape(-1))

### Create submission file

Load submission csv

In [None]:
sub = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

Assign predicted values to *pressure* column

In [None]:
sub.pressure = pressure
sub.head()

Save submission csv

In [None]:
sub.to_csv('approximation_submission.csv', index=False)