# Google Brain - Ventilator Pressure Prediction
![](https://storage.googleapis.com/kaggle-competitions/kaggle/29594/logos/header.png?t=2021-07-29-12-44-09&quot)

![](https://raw.githubusercontent.com/google/deluca-lung/main/assets/2020-10-02%20Ventilator%20diagram.svg)

# Data description
id - globally-unique time step identifier across an entire file

breath_id - globally-unique identifier for each breath (all time steps of one breath share the same breath_id)

R - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.

C - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.

time_step - the actual time stamp.

u_in - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.

u_out - the control input for the expiratory solenoid valve. Either 0 or 1.

pressure - the airway pressure measured in the respiratory circuit, measured in cmH2O.

# Import packages

In [None]:
# Install packages
# !pip install boostaroota

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode() # Set notebook mode to work in offline
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from sklearn import model_selection as sk_model_selection
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score
from sklearn import metrics
import optuna
# from boostaroota import BoostARoota
from sklearn.metrics import log_loss
from optuna.samplers import TPESampler
import functools
from functools import partial
import xgboost as xgb
import joblib
import torch
from torch import nn
from torch.utils import data as torch_data
from sklearn import model_selection as sk_model_selection
from torch.nn import functional as torch_functional
import warnings 
warnings.filterwarnings('ignore')
import torch.optim as optim
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler, normalize
import gc

SEED = 42

In [None]:
# Load the competition files from the Kaggle input directory: the labelled
# training rows, the submission template and the unlabelled test rows.
train_data = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
sample_submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')
test_data = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')

# EDA

In [None]:
# u_in - 0 is completely closed and no air is let in and 100 is completely open
# u_out - the expiratory valve is open (1) or closed (0) to let air out.
# Print the (rows, columns) shape of the training data and show its first rows.
print(train_data.shape)
train_data.head()

In [None]:
# Count the distinct breaths in the training data, then show every row of breath 1.
print('# Breath IDs in train data:', train_data['breath_id'].nunique())
train_data[train_data['breath_id']==1]

In [None]:
# Same breath count as above; show every row of breath 2.
print(train_data['breath_id'].nunique())
train_data[train_data['breath_id']==2]

In [None]:
# Same breath count as above; show every row of breath 3.
print(train_data['breath_id'].nunique())
train_data[train_data['breath_id']==3]

In [None]:
# For each training breath count the distinct R and C values, then report how
# many breaths carry more than one value of either attribute.
temp = train_data.groupby(['breath_id']).agg({'R':'nunique','C':'nunique'}).reset_index()
print('# Breath ids with >1 R or >1 C:', temp[(temp['R']>1) | (temp['C']>1)].shape[0])
temp.head()

In [None]:
# Distinct R and C values present in the training data.
train_data['R'].unique(), train_data['C'].unique()

In [None]:
# Number of rows recorded for each training breath, plus the distinct row
# counts that occur across breaths.
temp = train_data.groupby(['breath_id']).size().reset_index(name = '# Entries')
print(temp['# Entries'].unique())
temp

In [None]:
# Compare the number of distinct ids with the frame's shape, then show the row whose id is 1.
print(train_data['id'].nunique(), train_data.shape)
train_data[train_data['id']==1]

In [None]:
# Summary statistics for every numeric training column.
train_data.describe()

In [None]:
# Shape and first rows of the submission template.
print(sample_submission.shape)
sample_submission.head()

In [None]:
# Shape, number of distinct breaths and first rows of the test data.
print(test_data.shape)
print('# Breath IDs in test data:', test_data['breath_id'].nunique())
test_data.head()

In [None]:
# For each test breath count the distinct R and C values, then report how many
# breaths carry more than one value of either attribute.
temp = test_data.groupby(['breath_id']).agg({'R':'nunique','C':'nunique'}).reset_index()
print('# Breath ids with >1 R or >1 C:', temp[(temp['R']>1) | (temp['C']>1)].shape[0])
temp.head()

In [None]:
# Distinct R and C values present in the test data.
test_data['R'].unique(), test_data['C'].unique()

In [None]:
# Number of rows recorded for each test breath, plus the distinct row counts
# that occur across breaths.
temp = test_data.groupby(['breath_id']).size().reset_index(name = '# Entries')
print(temp['# Entries'].unique())
temp

## Data summaries

In [None]:
def interactive_line_chart(Breath_ID):
    """Plot pressure, u_in and u_out against time_step for one training breath.

    Breath_ID is matched against ``train_data['breath_id']``; the figure is
    shown as a side effect and nothing is returned.
    """
    breath_rows = train_data[train_data['breath_id']==Breath_ID]

    fig = go.Figure()
    # One line per signal, all drawn over the breath's own time axis.
    for signal in ("pressure", "u_in", "u_out"):
        fig.add_trace(go.Scatter(y=breath_rows[signal],
                                 x=breath_rows['time_step'],
                                 mode='lines',
                                 name=signal))

    fig.update_layout(title='Variation by time step',
                      xaxis_title='Time step',
                      yaxis_title='Value')
    fig.show()

# Widget whose Breath_ID choices are all distinct training breath ids.
breath_choices = train_data['breath_id'].unique().tolist()
w = widgets.interactive(interactive_line_chart, Breath_ID = breath_choices)
display(w)

In [None]:
# Histogram (20 bins) of the mean pressure of each training breath.
fig = px.histogram(train_data.groupby(['breath_id']).agg({'pressure':'mean'}).reset_index(), x="pressure", nbins=20)
fig.show()

In [None]:
# Distinct training pressures in ascending order. The smallest value, the
# largest value and the gap between the two smallest values are kept as plain
# Python numbers; they are used later to round and clip the predictions.
all_pressure = np.sort(train_data.pressure.unique())
lowest, second_lowest, highest = all_pressure[0], all_pressure[1], all_pressure[-1]
PRESSURE_MIN = lowest.item()
PRESSURE_MAX = highest.item()
PRESSURE_STEP = (second_lowest - lowest).item()

Insights from EDA:

There are 80 entries for every breath_id. Hence we can feed each breath into an LSTM model as a sequence of 80 steps, or we can build tree-based models, perceptrons or other models by flattening each breath into 80 * features inputs.

R and C are constant for a breath id. Also, there are only 3 unique entries for R and C for all breath ids in train and test datasets.

# Pre-processing data for modelling

In [None]:
# Hold out 20% of the breaths for validation. The split is done on breath ids,
# so every row of a breath lands in the same subset.
breath_id_list = train_data['breath_id'].unique().tolist()
train_ids, valid_ids = sk_model_selection.train_test_split(
    breath_id_list,
    test_size=0.2,
    random_state=SEED,
)

in_train = train_data['breath_id'].isin(train_ids)
in_valid = train_data['breath_id'].isin(valid_ids)
df_train = train_data[in_train].reset_index(drop = True)
df_valid = train_data[in_valid].reset_index(drop = True)

In [None]:
# Fit a min-max scaler on the training split only. The column order used here
# (R, C, time_step, u_in, u_out, pressure) is the order every later
# transform / inverse_transform call in this notebook must follow.
scaler = MinMaxScaler()
scaler.fit(df_train[['R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']])

In [None]:
# From https://www.kaggle.com/dlaststark/gb-vpp-pulp-fiction
def add_features(df):
    """Return a copy of ``df`` extended with per-breath engineered features.

    ``df`` must contain the columns breath_id, R, C, time_step, u_in and u_out
    and have a unique index (group results are aligned back by index).
    Features are added in a fixed order: interaction and cumulative terms,
    lags/leads of u_in and u_out (1-4 steps, missing values filled with 0),
    per-breath u_in statistics, lag differences, running mean, shift-based
    lags, exponentially weighted and 15-step rolling statistics, lead
    differences, and finally one-hot encodings of R, C and their combination.

    The input frame is not modified; progress is printed after each step.
    """
    # Work on a copy so the caller's frame is not left half-extended.
    df = df.copy()

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['time_step_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    print("Step-1...Completed")

    # Lags ("lagN") and leads ("lag_backN") within each breath, created in the
    # order u_in_lagN, u_out_lagN, u_in_lag_backN, u_out_lag_backN.
    for lag in range(1, 5):
        for suffix, periods in ((f'lag{lag}', lag), (f'lag_back{lag}', -lag)):
            for col in ('u_in', 'u_out'):
                df[f'{col}_{suffix}'] = df.groupby('breath_id')[col].shift(periods)
    # Positions shifted past a breath boundary are NaN; treat them as 0.
    df = df.fillna(0)
    print("Step-2...Completed")

    u_in_max = df.groupby('breath_id')['u_in'].transform('max')
    u_in_mean = df.groupby('breath_id')['u_in'].transform('mean')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_in__mean'] = u_in_mean
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_mean - df['u_in']
    print("Step-3...Completed")

    for lag in range(1, 5):
        for col in ('u_in', 'u_out'):
            df[f'{col}_diff{lag}'] = df[col] - df[f'{col}_lag{lag}']
    print("Step-4...Completed")

    # 'count' is the 1-based position of the row inside its breath.
    df['one'] = 1
    df['count'] = df.groupby('breath_id')['one'].cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / df['count']

    # Frame-wide shifts, masked to 0 wherever the shifted row belongs to a
    # different breath.
    df['breath_id_lag'] = df['breath_id'].shift(1).fillna(0)
    df['breath_id_lag2'] = df['breath_id'].shift(2).fillna(0)
    df['breath_id_lagsame'] = np.select([df['breath_id_lag'] == df['breath_id']], [1], 0)
    df['breath_id_lag2same'] = np.select([df['breath_id_lag2'] == df['breath_id']], [1], 0)
    df['breath_id__u_in_lag'] = df['u_in'].shift(1).fillna(0)
    df['breath_id__u_in_lag'] = df['breath_id__u_in_lag'] * df['breath_id_lagsame']
    df['breath_id__u_in_lag2'] = df['u_in'].shift(2).fillna(0)
    df['breath_id__u_in_lag2'] = df['breath_id__u_in_lag2'] * df['breath_id_lag2same']
    print("Step-5...Completed")

    df['time_step_diff'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    # Grouped ewm/rolling results carry a (breath_id, original index) index;
    # dropping level 0 restores the original index so assignment aligns.
    df['ewm_u_in_mean'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=9)
                           .mean()
                           .reset_index(level=0, drop=True))
    rolling_u_in = df.groupby('breath_id')['u_in'].rolling(window=15, min_periods=1)
    # Each statistic is computed explicitly rather than through a
    # dict-renaming agg, which is not reliable across pandas versions.
    df['15_in_sum'] = rolling_u_in.sum().reset_index(level=0, drop=True)
    df['15_in_min'] = rolling_u_in.min().reset_index(level=0, drop=True)
    df['15_in_max'] = rolling_u_in.max().reset_index(level=0, drop=True)
    df['15_in_mean'] = rolling_u_in.mean().reset_index(level=0, drop=True)
    print("Step-6...Completed")

    for lag in range(1, 3):
        for col in ('u_in', 'u_out'):
            df[f'{col}_lagback_diff{lag}'] = df[col] - df[f'{col}_lag_back{lag}']
    print("Step-7...Completed")

    # R and C become categorical strings so get_dummies one-hot encodes them
    # (together with their combination R__C).
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    df = pd.get_dummies(df)
    print("Step-8...Completed")

    return df


# print("Train data...\n")
# train = add_features(train_data)

# print("\nTest data...\n")
# test = add_features(test_data)

# del train_data
# del test_data
# gc.collect()

In [None]:
# targets = train[['pressure']].to_numpy()

# train.drop(['id','one','count','pressure',
#             'breath_id_lag','breath_id_lag2','breath_id_lagsame',
#             'breath_id_lag2same'], axis=1, inplace=True)

# test = test.drop(['id','one','count','breath_id_lag',
#                   'breath_id_lag2','breath_id_lagsame',
#                   'breath_id_lag2same'], axis=1)

# print(f"train: {train.shape} \ntest: {test.shape}")

# np.save('x_train.npy', train)
# np.save('y_train.npy', targets)

# np.save('x_test.npy', test)

In [None]:
# train = np.load('../input/ventilator_pressure_prediction_x_train/x_train.npy')
# targets = np.load('../input/ventilator_pressure_prediction_y_train/y_train.npy')
# test = np.load('../input/ventilator_pressure_prediction_x_test/x_test.npy')

# scaler = RobustScaler()

# cols = [x for x in train.columns if x!='breath_id']
# train = scaler.fit_transform(train)
# test = scaler.transform(test)

# print(f"train: {train.shape} \ntest: {test.shape} \ntargets: {targets.shape}")

In [None]:
# print(train.shape)
# train[0][0].shape

In [None]:
class DataRetriever(torch_data.Dataset):
    """Dataset that yields one breath per item with unscaled features.

    Rows are read from the module-level ``train_data`` (``train_flag`` true)
    or ``test_data`` (``train_flag`` false) frame and ordered by time_step.

    Each item is a dict holding
      * "X": float tensor of shape (steps, 5) with the columns
        time_step, R, C, u_in, u_out;
      * "y": float tensor of shape (steps,) with the pressures (training), or
      * "id": the breath id (test).
    """

    # Column order of the feature tensor.
    FEATURE_COLUMNS = ['time_step', 'R', 'C', 'u_in', 'u_out']

    def __init__(self, breath_id_list, train_flag):
        self.breath_id_list = breath_id_list
        self.train_flag = train_flag

    def __len__(self):
        return len(self.breath_id_list)

    def __getitem__(self, index):
        breath_id = self.breath_id_list[index]

        # Take the breath's rows from the full frame. The previous code
        # selected the feature columns from a frame that only held
        # 'breath_id', which raised KeyError on every call.
        source = train_data if self.train_flag else test_data
        breath_rows = source[source['breath_id'] == breath_id]
        breath_rows = breath_rows.sort_values(by=['time_step'], ascending=True)

        X = torch.tensor(breath_rows[self.FEATURE_COLUMNS].to_numpy()).float()

        if self.train_flag:
            y = torch.tensor(breath_rows['pressure'].to_numpy()).float()
            return {"X": X, "y": y}
        return {"X": X, "id": breath_id}

In [None]:
class DataRetriever_LSTM(torch_data.Dataset):
    """Dataset that yields one breath per item, min-max scaled with ``scaler``.

    Rows come from the module-level ``train_data`` (``train_flag`` true) or
    ``test_data`` (``train_flag`` false) frame, ordered by time_step. Test
    breaths get a placeholder pressure of 0 so the six-column scaler applies.

    Each item is a dict with "X" (float tensor, columns time_step, R, C,
    u_in, u_out) plus either "y" (scaled pressures, training) or "id"
    (the breath id, test).
    """

    def __init__(self, breath_id_list, train_flag):
        self.breath_id_list = breath_id_list
        self.train_flag = train_flag

    def __len__(self):
        return len(self.breath_id_list)

    def __getitem__(self, index):
        breath_id = self.breath_id_list[index]
        # Column order the scaler was fitted with.
        scaled_columns = ['R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']

        if self.train_flag:
            source = train_data
            wanted = ['breath_id'] + scaled_columns
        else:
            source = test_data
            wanted = ['breath_id'] + scaled_columns[:-1]

        rows = source[source['breath_id']==breath_id]
        rows = rows.sort_values(by = ['time_step'], ascending = True)[wanted].reset_index(drop = True)
        if not self.train_flag:
            rows['pressure'] = 0

        # Scaling
        scaled = pd.DataFrame(scaler.transform(rows[scaled_columns]), columns = scaled_columns)

        # Model input order differs from the scaler order: time_step comes first.
        feature_order = ['time_step', 'R', 'C', 'u_in', 'u_out']
        X = torch.tensor(np.stack([scaled[name] for name in feature_order], axis = 1)).float()

        if not self.train_flag:
            return {"X": X, "id": breath_id}
        return {"X": X, "y": torch.tensor(scaled['pressure']).float()}

In [None]:
# Build a wide table for the first 10 training breaths: one row per breath,
# where the six values of step i are stored in columns suffixed "_i" (i = 1..80).
breath_id_list = train_data['breath_id'].unique().tolist()[:10]
step_columns = ['R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']
formatted_train_data = pd.DataFrame(data = None)

for breath_id in tqdm(breath_id_list):
    breath_rows = train_data[train_data['breath_id']==breath_id]
    # First piece is the breath id; the remaining pieces are the single-row
    # slices of each step, all re-indexed to 0 so they line up side by side.
    pieces = [breath_rows[['breath_id']].iloc[0:1,:].reset_index(drop = True)]
    for i in range(0, 80):
        temp = breath_rows[step_columns].iloc[i:i+1,:].reset_index(drop = True)
        temp.columns = [name + '_' + str(i+1) for name in temp.columns]
        pieces.append(temp)
    formatted_data = pd.concat(pieces, axis = 1).reset_index(drop = True)
    formatted_train_data = pd.concat([formatted_train_data,formatted_data], axis = 0).reset_index(drop = True)

In [None]:
# Move every column whose name contains 'pressure' to the right-hand side,
# keeping the relative order inside both groups.
non_pressure_cols = [c for c in formatted_train_data.columns if 'pressure' not in c]
pressure_cols = [c for c in formatted_train_data.columns if 'pressure' in c]
formatted_train_data = formatted_train_data[non_pressure_cols + pressure_cols]
formatted_train_data.head()

# Tree-based model

In [None]:
# TBU

# Perceptron

In [None]:
# TBU

# ML models - Logistic regression

# LSTM

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM whose last time-step output feeds a linear head.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        layer_dim: number of stacked LSTM layers.
        output_dim: size of the vector produced for each sequence.
        dropout_prob: dropout applied between LSTM layers.
        device: kept as ``self.device`` for callers; the forward pass follows
            the device of its input.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob, device):
        super().__init__()

        self.device = device
        # Number of layers and nodes per layer, needed to build initial states.
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        # LSTM layers (inputs are batch-first: (batch, seq, feature)).
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
        )

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_dim) to (batch, output_dim)."""
        # Zero initial hidden and cell states, created directly on the input's
        # device and dtype. The previous version built them on the CPU, called
        # ``.to(device)`` without keeping the result, and only moved them in
        # the LSTM call itself.
        h0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)
        c0 = x.new_zeros(self.layer_dim, x.size(0), self.hidden_dim)

        out, _ = self.lstm(x, (h0, c0))

        # Keep only the last time step: (batch, seq, hidden) -> (batch, hidden).
        last_step = out[:, -1, :]

        # Project to the desired output shape (batch, output_dim).
        return self.fc(last_step)

In [None]:
class Trainer:
    """Training loop with best-checkpoint saving and early stopping.

    Args:
        model: the ``nn.Module`` to optimise.
        device: device every batch is moved to.
        optimizer: optimiser already bound to ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, targets)``.

    Loaders must yield dicts with the keys "X" (inputs) and "y" (targets).
    """

    def __init__(
        self,
        model,
        device,
        optimizer,
        criterion
    ):
        self.model = model
        self.device = device
        self.optimizer = optimizer
        self.criterion = criterion

        self.best_valid_score = np.inf  # lowest validation loss seen so far
        self.n_patience = 0             # epochs since the last improvement
        self.lastmodel = None           # path of the last saved checkpoint

    def fit(self, epochs, train_loader, valid_loader, save_path, patience):
        """Train for up to ``epochs`` epochs, stopping early after ``patience``
        consecutive epochs without a lower validation loss.

        A checkpoint is written to ``save_path`` whenever the validation loss
        improves. Returns a dict with the per-epoch lists 'train_loss',
        'val_loss', 'train_mae', 'val_mae' and the last epoch number 'n_epoch'.
        """
        train_loss_list = []
        val_loss_list = []
        train_mae_list = []
        val_mae_list = []

        n_epoch = 0  # keeps the return value defined when epochs == 0
        for n_epoch in range(1, epochs + 1):
            self.info_message("EPOCH: {}", n_epoch)

            train_loss, train_mae, train_mse, train_time = self.train_epoch(train_loader)
            valid_loss, valid_mae, valid_mse, valid_time = self.valid_epoch(valid_loader)

            self.info_message(
                "[Epoch Train: {}] loss: {:.4f}, mae: {:.4f}, time: {:.4f} s            ",
                n_epoch, train_loss, train_mae, train_time
            )

            self.info_message(
                "[Epoch Valid: {}] loss: {:.4f}, mae: {:.4f}, time: {:.4f} s",
                n_epoch, valid_loss, valid_mae, valid_time
            )

            if self.best_valid_score > valid_loss:
                previous_best = self.best_valid_score
                self.best_valid_score = valid_loss
                # Save first so the message reports the path actually written.
                self.save_model(n_epoch, save_path, valid_loss)
                self.info_message(
                    "Validation loss improved from {:.4f} to {:.4f}. Saved model to '{}'",
                    previous_best, valid_loss, self.lastmodel
                )
                self.n_patience = 0
            else:
                self.n_patience += 1

            train_loss_list.append(train_loss)
            val_loss_list.append(valid_loss)
            train_mae_list.append(train_mae)
            val_mae_list.append(valid_mae)

            if self.n_patience >= patience:
                self.info_message("\nValidation loss didn't improve last {} epochs.", patience)
                break

        return {'train_loss':train_loss_list, 'val_loss':val_loss_list, 'train_mae':train_mae_list, 'val_mae':val_mae_list,'n_epoch':n_epoch}

    @staticmethod
    def _batch_errors(outputs, targets):
        """Return (mae, mse) of one batch as detached 0-dim tensors.

        Errors are averaged over axis 1 for each sample, then over the batch.
        """
        diff = (outputs - targets).detach()
        n_samples, n_outputs = outputs.shape[0], outputs.shape[1]
        mae = (torch.abs(diff).sum(axis = 1)/n_outputs).sum()/n_samples
        mse = ((diff*diff).sum(axis = 1)/n_outputs).sum()/n_samples
        return mae, mse

    def train_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``.

        Returns (mean loss, mean MAE, mean MSE, elapsed whole seconds), each
        mean taken over the batches of the loader.
        """
        self.model.train()
        t = time.time()
        sum_loss = 0
        running_mae = 0
        running_mse = 0

        for step, batch in enumerate(train_loader, 1):
            X = batch["X"].to(self.device)
            targets = batch["y"].to(self.device)
            self.optimizer.zero_grad()

            outputs = self.model(X)
            loss = self.criterion(outputs, targets)
            loss.backward()

            sum_loss += loss.detach().item()

            self.optimizer.step()

            # Metrics use the outputs computed before this optimiser step.
            error, squared_error = self._batch_errors(outputs, targets)
            running_mae += error
            running_mse += squared_error

            message = 'Train Step {}/{}, train_loss: {:.4f}, train_mae: {:.4f}'
            self.info_message(message, step, len(train_loader), sum_loss/step, running_mae/step, end="\r")

        return sum_loss/len(train_loader), running_mae/len(train_loader), running_mse/len(train_loader), int(time.time() - t)

    def valid_epoch(self, valid_loader):
        """Evaluate the model on ``valid_loader`` without gradient tracking.

        Returns (mean loss, mean MAE, mean MSE, elapsed whole seconds), each
        mean taken over the batches of ``valid_loader``.
        """
        self.model.eval()
        t = time.time()
        sum_loss = 0
        running_mae = 0
        running_mse = 0

        with torch.no_grad():
            for step, batch in enumerate(valid_loader, 1):
                X = batch["X"].to(self.device)
                targets = batch["y"].to(self.device)

                outputs = self.model(X)
                loss = self.criterion(outputs, targets)

                sum_loss += loss.detach().item()

                error, squared_error = self._batch_errors(outputs, targets)
                running_mae += error
                running_mse += squared_error

                message = 'Valid Step {}/{}, valid_loss: {:.4f}, valid_mae: {:.4f}'
                self.info_message(message, step, len(valid_loader), sum_loss/step, running_mae/step, end="\r")

        # Average over the validation loader; the earlier code divided by
        # len(train_loader), a name that is not defined in this method.
        n_batches = len(valid_loader)
        return sum_loss/n_batches, running_mae/n_batches, running_mse/n_batches, int(time.time() - t)

    def save_model(self, n_epoch, save_path, loss):
        """Write model and optimiser state, best score and epoch to ``save_path``.

        ``loss`` is accepted for interface compatibility and is not stored.
        """
        self.lastmodel = f"{save_path}"
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "best_valid_score": self.best_valid_score,
                "n_epoch": n_epoch,
            },
            self.lastmodel,
        )

    @staticmethod
    def info_message(message, *args, end="\n"):
        """Print ``message`` formatted with ``args``."""
        print(message.format(*args), end=end)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network shape: 5 input features per time step, one output per step of an
# 80-step breath.
input_dim, output_dim = 5, 80
hidden_dim, layer_dim = 128, 3
dropout = 0.02

# Optimisation settings.
batch_size = 128
n_epochs, patient_epochs = 30, 20
learning_rate, weight_decay = 1e-3, 1e-6

model_params = dict(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    layer_dim=layer_dim,
    output_dim=output_dim,
    dropout_prob=dropout,
    device=device,
)

model = LSTMModel(**model_params)
model.to(device)

# Adam with weight decay, trained on the mean absolute error.
criterion = nn.L1Loss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# train_data_retriever = DataRetriever_LSTM(
#     df_train["breath_id"].unique().tolist(),
#     train_flag = True)

# train_loader = torch_data.DataLoader(
#     train_data_retriever,
#     batch_size=batch_size,
#     shuffle=True,
#     num_workers=8,
# )

# valid_data_retriever = DataRetriever_LSTM(
#     df_valid["breath_id"].unique().tolist(),
#     train_flag = True)

# valid_loader = torch_data.DataLoader(
#     valid_data_retriever,
#     batch_size=batch_size,
#     shuffle=False,
#     num_workers=8,
# )

# trainer = Trainer(
#     model, 
#     device, 
#     optimizer, 
#     criterion
# )

# history = trainer.fit(
#     n_epochs, 
#     train_loader,
#     valid_loader, 
#     f"lstm_model.pth",
#     patient_epochs,
# )

In [None]:
# temp = pd.DataFrame(data = {'Train loss':history['train_loss'],'Validation loss':history['val_loss']}, columns = ['Train loss', 'Validation loss'])
# temp['epoch'] = temp.index + 1

# # Create traces
# fig = go.Figure()
# fig.add_trace(go.Scatter(y=temp["Train loss"], 
#                          x = temp['epoch'],
#                     mode='lines',
#                     name='Train loss'))
# fig.add_trace(go.Scatter(y=temp["Validation loss"], 
#                          x = temp['epoch'],
#                     mode='lines',
#                     name='Validation loss'))
# # Edit the layout
# fig.update_layout(title='Model loss',
#                    xaxis_title='Epoch',
#                    yaxis_title='Loss')
# fig.show()

In [None]:
# temp = pd.DataFrame(data = {'Train MAE':[x.cpu().numpy().item() for x in history['train_mae']],'Validation MAE':[x.cpu().numpy().item() for x in history['val_mae']]}, columns = ['Train MAE', 'Validation MAE'])
# temp['epoch'] = temp.index + 1

# # Create traces
# fig = go.Figure()
# fig.add_trace(go.Scatter(y=temp["Train MAE"], 
#                          x = temp['epoch'],
#                     mode='lines',
#                     name='Train MAE'))
# fig.add_trace(go.Scatter(y=temp["Validation MAE"], 
#                          x = temp['epoch'],
#                     mode='lines',
#                     name='Validation MAE'))
# # Edit the layout
# fig.update_layout(title='Model accuracy',
#                    xaxis_title='Epoch',
#                    yaxis_title='MAE')
# fig.show()

# Predictions

In [None]:
# Load model

# Rebuild the network with the same hyper-parameters it was trained with so
# the saved state dict fits.
model_params = {'input_dim': input_dim,
                'hidden_dim' : hidden_dim,
                'layer_dim' : layer_dim,
                'output_dim' : output_dim,
                'dropout_prob' : dropout,
                'device' : device}

model = LSTMModel(**model_params)

# map_location places the stored tensors on the device of this session, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
checkpoint = torch.load(f"../input/ventilatorpressurepredictionlstmmodel/lstm_model.pth", map_location=device)
print(checkpoint['best_valid_score'])
model.load_state_dict(checkpoint["model_state_dict"])
# Inference mode: disables dropout between the LSTM layers.
model.eval()
model.to(device)

In [None]:
# Predictions on validation data
# y_pred / y_true collect one list of un-scaled pressures per batch.
y_pred = []
y_true = []

valid_data_retriever = DataRetriever_LSTM(
    df_valid["breath_id"].unique().tolist(),
    train_flag = True)

valid_loader = torch_data.DataLoader(
    valid_data_retriever,
    batch_size=batch_size*5,
    shuffle=False,
    num_workers=8,
)

for e, batch in enumerate(valid_loader):
    print(f"{e}/{len(valid_loader)}", end="\r")
    with torch.no_grad():
        # Model output: one row of per-step predictions per breath.
        tmp_res = (model(batch["X"].to(device))).cpu().numpy()
        # Stack the breaths of the batch into one long table of time steps and
        # flatten predictions / targets the same way so the rows line up.
        temp = pd.DataFrame(np.vstack(batch["X"].cpu().numpy()))
        temp['predicted pressure'] = np.concatenate(tmp_res)
        temp['actual pressure'] = np.concatenate(batch["y"].numpy().tolist())
        # Feature columns follow the order used when X was built.
        temp.columns = ['time_step','R', 'C','u_in', 'u_out','predicted pressure','actual pressure']
        # The scaler expects R, C, time_step, u_in, u_out, pressure; column 5 of
        # the inverse transform is therefore the pressure in original units.
        y_pred.append(pd.DataFrame(scaler.inverse_transform(temp[['R', 'C', 'time_step', 'u_in', 'u_out', 'predicted pressure']])).iloc[:,5].tolist())
        y_true.append(pd.DataFrame(scaler.inverse_transform(temp[['R', 'C', 'time_step', 'u_in', 'u_out', 'actual pressure']])).iloc[:,5].tolist())

In [None]:
# y_pred and y_true each hold one list of pressures per batch, and the last
# batch can be shorter. Both must be flattened to 1-D arrays before they can
# be compared element by element.
y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

# From https://www.kaggle.com/cdeotte/ensemble-folds-with-median-0-153
# y_pred = np.round((y_pred - PRESSURE_MIN)/PRESSURE_STEP ) * PRESSURE_STEP + PRESSURE_MIN
# y_pred = np.clip(y_pred, PRESSURE_MIN, PRESSURE_MAX)

# Mean absolute error over every validation time step.
print('MAE for validation data:', np.mean(np.abs(y_pred - y_true)))

In [None]:
def interactive_line_chart_for_validation(Breath_ID):
    """Plot actual vs. predicted pressure for one labelled breath.

    The breath is loaded through ``DataRetriever_LSTM`` (scaled), run through
    the module-level ``model``, and both pressure curves and the time axis
    are mapped back to original units with ``scaler`` before plotting. The
    figure is shown as a side effect; nothing is returned.
    """
    data_retriever = DataRetriever_LSTM(
        [Breath_ID],
        train_flag = True)
    # Fetch the breath once; every access filters the full training frame.
    sample = data_retriever[0]

    with torch.no_grad():
        batch = sample["X"].unsqueeze(0)  # add the batch dimension
        tmp_res = model(batch.float().to(device)).cpu().numpy()

    temp = pd.DataFrame(sample["X"].numpy(),
                        columns = ['time_step', 'R', 'C', 'u_in', 'u_out'])
    temp['predicted pressure'] = np.concatenate(tmp_res)
    temp['actual pressure'] = sample["y"].numpy()

    # The scaler was fitted on R, C, time_step, u_in, u_out, pressure, so after
    # the inverse transform column 2 is the time step and column 5 the pressure.
    scaler_inputs = ['R', 'C', 'time_step', 'u_in', 'u_out']
    predicted = scaler.inverse_transform(temp[scaler_inputs + ['predicted pressure']].to_numpy())
    actual = scaler.inverse_transform(temp[scaler_inputs + ['actual pressure']].to_numpy())
    time_axis = actual[:, 2]

    # Create traces
    fig = go.Figure()
    fig.add_trace(go.Scatter(y=actual[:, 5],
                             x=time_axis,
                             mode='lines',
                             name='actual pressure'))
    fig.add_trace(go.Scatter(y=predicted[:, 5],
                             x=time_axis,
                             mode='lines',
                             name='predicted pressure'))

    # Edit the layout
    fig.update_layout(title='Variation by time step',
                      xaxis_title='Time step',
                      yaxis_title='Value')
    fig.show()

# Widget whose Breath_ID choices are all distinct breath ids of train_data.
w = widgets.interactive(interactive_line_chart_for_validation, Breath_ID = train_data['breath_id'].unique().tolist())
display(w)

In [None]:
import gc
# Predictions on test data
# y_pred collects one list of un-scaled pressures per batch, ids the matching
# breath ids of each batch.
y_pred = []
ids = []

test_data_retriever = DataRetriever_LSTM(
    test_data["breath_id"].unique().tolist(),
    train_flag = False)

test_loader = torch_data.DataLoader(
    test_data_retriever,
    batch_size=batch_size*5,
    shuffle=False,
    num_workers=8,
)

for e, batch in enumerate(test_loader):
    print(f"{e}/{len(test_loader)}", end="\r")
    with torch.no_grad():
        # Model output: one row of per-step predictions per breath.
        tmp_res = (model(batch["X"].to(device))).cpu().numpy()
        # Stack the breaths of the batch into one long table of time steps and
        # flatten the predictions the same way so the rows line up.
        temp = pd.DataFrame(np.vstack(batch["X"].cpu().numpy()))
        temp['predicted pressure'] = np.concatenate(tmp_res)
        # Feature columns follow the order used when X was built.
        temp.columns = ['time_step','R', 'C','u_in', 'u_out','predicted pressure']
        # The scaler expects R, C, time_step, u_in, u_out, pressure; column 5 of
        # the inverse transform is therefore the pressure in original units.
        y_pred.append(pd.DataFrame(scaler.inverse_transform(temp[['R', 'C', 'time_step', 'u_in', 'u_out', 'predicted pressure']])).iloc[:,5].tolist())
        ids.append(batch['id'])
        # Free Python garbage and cached GPU memory after every batch.
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Attach the flattened predictions to the submission ids in ascending id order.
# NOTE(review): this assumes the prediction order (breaths in order of first
# appearance in test_data, steps sorted by time_step) matches ascending id - confirm.
final_output = sample_submission[['id']].sort_values(by = ['id'], ascending = True)
final_output['pressure'] = np.concatenate(y_pred)

# Raw model output.
final_output.to_csv('orig_submission.csv', index = False)

# From https://www.kaggle.com/cdeotte/ensemble-folds-with-median-0-153
# Snap each prediction to the nearest multiple of PRESSURE_STEP above
# PRESSURE_MIN, then clip to the pressure range seen in training.
raw_pressure = final_output.pressure
snapped = np.round((raw_pressure - PRESSURE_MIN)/PRESSURE_STEP ) * PRESSURE_STEP + PRESSURE_MIN
final_output['pressure'] = np.clip(snapped, PRESSURE_MIN, PRESSURE_MAX)

final_output.to_csv('clip_submission.csv', index = False)

In [None]:
# Each entry of ids holds the breath ids of one batch; repeat every id 80 times
# (one per time step) so the column lines up with the per-step predictions.
final_output['breath_id'] = np.concatenate([np.repeat(batch_ids.numpy(), 80) for batch_ids in ids])

In [None]:
# Shape and first rows of the submission frame.
print(final_output.shape)
final_output.head()

In [None]:
# Compare summary statistics of the predicted test pressures with those of the
# training pressures.
print('Test data pressure values\n')
display(final_output['pressure'].describe())
print('\nTrain data pressure values\n')
display(train_data['pressure'].describe())

In [None]:
# Histogram (20 bins) of the mean predicted pressure of each test breath.
fig = px.histogram(final_output.groupby(['breath_id']).agg({'pressure':'mean'}).reset_index(), x="pressure", nbins=20)
fig.show()