<div class="alert alert-block alert-info">  
    <h1><strong>👨‍💻 Getting Started with Google Brain - Ventilator Pressure Prediction</strong></h1>
    <i></i>
</div>

# <img src="https://storage.googleapis.com/kaggle-competitions/kaggle/29594/logos/header.png?t=2021-07-29-12-44-09">

# Importing Python Libraries 📕 📗 📘 📙

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import RobustScaler, normalize
from IPython.display import display
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error as mae
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.gridspec as gridspec
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf
import optuna
from tensorflow import keras
from tensorflow.keras.optimizers.schedules import ExponentialDecay
import missingno as msno
import scipy.stats as stats 
from scipy.special import boxcox1p
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

<div class="alert alert-block alert-danger">  
    <h1><strong>Loading training data</strong></h1>
    <i></i>
</div>

In [None]:
train_data = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/train.csv")

# Exploratory data analysis of train data

# Five top records of data

In [None]:
train_data.head()

# Five last records of data

In [None]:
train_data.tail()

# Columns/features in data

In [None]:
train_data.columns

# Length of data

In [None]:
print('lenght of data is', len(train_data))

# Shape of data

In [None]:
train_data.shape

# Data information

In [None]:
train_data.info()

# Data types of all columns

In [None]:
train_data.dtypes

# Checking missing Values

In [None]:
train_data[train_data.isnull().any(axis=1)].head()

# Count of missing values

In [None]:
np.sum(train_data.isnull().any(axis=1))

# Are there any missing values?

In [None]:
train_data.isnull().values.any()

# Counts of missing values in each column

In [None]:
train_data.isnull().sum()

<div class="alert alert-block alert-danger">  
    <h1><strong>Loading testing data</strong></h1>
    <i></i>
</div>

In [None]:
test_data = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/test.csv")
ids_test_data = test_data['id'].values

# Exploratory data analysis of test data

# Five top records of data

In [None]:
test_data.head()

# Five last records of data

In [None]:
test_data.tail()

# Columns/features in data

In [None]:
test_data.columns

# Length of data

In [None]:
print('lenght of data is', len(test_data))

# Shape of data

In [None]:
test_data.shape

# Data information

In [None]:
test_data.info()

# Data types of all columns

In [None]:
test_data.dtypes

# Checking missing Values

In [None]:
test_data[test_data.isnull().any(axis=1)].head()

# Count of missing values

In [None]:
np.sum(test_data.isnull().any(axis=1))

# Are there any missing values?

In [None]:
test_data.isnull().values.any()

# Counts of missing values in each column

In [None]:
test_data.isnull().sum()

# Looking at the train data missing values.

In [None]:
# Names of the train columns that contain at least one missing value;
# each such column is printed together with its missing-value count.
NANColumns = []
for column, n_missing in zip(train_data.columns, train_data.isnull().sum()):
    if n_missing == 0:
        continue
    print(column, n_missing)
    NANColumns.append(column)

# Looking at the test data missing values.

In [None]:
# Names of the test columns that contain at least one missing value;
# each such column is printed together with its missing-value count.
# Note: this rebinds NANColumns, replacing any list built earlier.
NANColumns = []
for column, n_missing in zip(test_data.columns, test_data.isnull().sum()):
    if n_missing == 0:
        continue
    print(column, n_missing)
    NANColumns.append(column)

# Histogram of all columns, showing how the values of each column are distributed along with their counts

In [None]:
# DataFrame.hist draws one subplot per column, so the caption has to be a
# figure-level suptitle; plt.title would only label the last subplot drawn.
train_data.hist(figsize=(50,50), bins=20, color="#107009AA")
plt.suptitle("Features/Columns Distribution with values counts")
plt.show()

## Correlation of Pressure on training data

In [None]:
# Up to ten columns with the largest correlation to pressure
# (pressure itself is part of the selection).
corr_feat = train_data.corr()["pressure"].nlargest(10).index

# Pairwise correlation matrix restricted to the selected columns
corr_matrix = np.corrcoef(train_data[corr_feat].to_numpy().T)

# Hide the upper triangle (diagonal included) so each pair is shown once
upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(20,10))
sns.heatmap(
    corr_matrix,
    mask=upper_mask,
    annot=True,
    fmt=".3f",
    annot_kws={"size": 10},
    cmap=sns.cubehelix_palette(),
    xticklabels=corr_feat.values,
    yticklabels=corr_feat.values,
)

<div class="alert alert-block alert-info">  
<h2><center><strong>Data Processing</strong></center></h2>
        
</div>

## Extract the pressure out

In [None]:
y = train_data["pressure"]

## Combining the train and test dataset

In [None]:
all_data = pd.concat([train_data,test_data],axis=0).reset_index(drop=True)

## Drop the pressure & Id columns

In [None]:
all_data = all_data.drop(["pressure","id"],axis=1)

## A function for checking the missing values

In [None]:
def missing_value(df):
    """Summarise the missing values of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one null, sorted by
        null count in descending order, with two columns:
        ``Total`` (number of nulls) and ``Percentage`` (nulls as a percentage
        of the number of rows). Empty when ``df`` has no nulls.
    """
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0].sort_values(ascending=False)
    # Derive the percentage from the already-sorted counts so both columns
    # share one index order (sorting them independently can disagree on ties).
    percentage = null_counts * 100 / df.shape[0]
    return pd.concat([null_counts, percentage], keys=["Total", "Percentage"], axis=1)
missing_value(all_data)

In [None]:
## Bias feature reducer
# A feature counts as "biased" when its most frequent value covers more than
# 99.94% of all rows.
n_rows = len(all_data)
bias_feat = [
    feat
    for feat in all_data.columns
    if all_data[feat].value_counts().iloc[0] / n_rows * 100 > 99.94
]

bias_feat

In [None]:
## Remove the bias feature from the dataset
all_data = all_data.drop(bias_feat,axis=1)

# Now splitting the data for training and testing with same index ID's

In [None]:
# The first len(y) rows of all_data came from the training file, the rest
# from the test file (they were concatenated in that order).
n = len(y)
train_data, test_data = all_data.iloc[:n], all_data.iloc[n:]

# Splitting the train data into 67% for training and 33% for testing

In [None]:
# Hold out 33% of the rows for testing (fixed seed for a repeatable split).
# NOTE(review): the split is row-wise; if rows of one breath_id should stay
# together, a group-aware split would be needed — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    y,
    test_size=0.33,
    random_state=42,
)
print("Shapes of data: ", *(part.shape for part in (X_train, X_test, y_train, y_test)))

<div class="alert alert-block alert-info">  
<h2><center><strong> Building the models for training and testing</strong></center></h2>
        
</div>

In [None]:
## Candidate regressors, each wrapped in a scaling pipeline
seed = 42

# Every estimator that accepts a random_state is seeded (the original left
# GradientBoostingRegressor and DecisionTreeRegressor unseeded), so repeated
# runs produce the same model ranking.
models = [Ridge(tol=10, random_state=seed),
          Lasso(tol=1, random_state=seed),
          RandomForestRegressor(random_state=seed),
          ExtraTreesRegressor(random_state=seed),
          GradientBoostingRegressor(random_state=seed),
          DecisionTreeRegressor(random_state=seed),
          KNeighborsRegressor()]

model_names = ["Ridge","Lasso","RFR","ETR","GBoost_Reg","DT_Reg","KNN_Reg"]

## List of ("Scaled_<name>", Pipeline) pairs: StandardScaler followed by the model
pipeline_models = [
    ("Scaled_" + name,
     Pipeline([("Scaler", StandardScaler()),
               (name, model)]))
    for name, model in zip(model_names, models)
]

<div class="alert alert-block alert-info">  
<h2><center><strong> Training the models</strong></center></h2>
        
</div>

In [None]:
## Cross-validate every pipeline and collect the scores in one dataframe

# The splitter is identical for every model, so build it once.
kfold = KFold(n_splits=7, shuffle=True, random_state=42)

# Collect plain records and build the frame once: growing a DataFrame row by
# row (and re-sorting it on every iteration) is slow and leaves "cv" as an
# object column instead of a numeric one.
records = []
for name, model in pipeline_models:
    cv = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1, scoring="r2")
    records.append({
        "model": name,
        "cv": round(cv.mean(), 3),
        "std": "+/- {}".format(round(cv.std(), 4)),
    })

# Best mean R^2 first
evaluate = (
    pd.DataFrame(records, columns=["model", "cv", "std"])
    .sort_values("cv", ascending=False)
)

In [None]:
## Visualization
fig, ax = plt.subplots(1,1,sharey=False,figsize=(16,9))

## Mean cross-validation score per model.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. The score column is cast to float so it is treated as a
# numeric axis even when the frame stores it with object dtype.
bar = sns.barplot(x=evaluate["model"],
                  y=evaluate["cv"].astype(float),
                  ax=ax,
                  palette=sns.cubehelix_palette(evaluate.shape[0]))

# Annotate each bar with its score, slightly offset from the bar's end
for rec in bar.patches:
    height = rec.get_height()
    ax.text(rec.get_x() + rec.get_width()/2, height*1.02, f"{height:.3f}", ha="center")
ax.set_title("Cross Validate Score")
ax.set_xticklabels(evaluate["model"].to_list(),rotation =50)

<div class="alert alert-block alert-danger">  
<h2><center><strong> Best Model is Gradient Boosting Regressor</strong></center></h2>
        
</div>

In [None]:
# Fit the selected model on the training split. The estimator is seeded so the
# fitted model (and therefore the submission) is reproducible across runs.
final_model = GradientBoostingRegressor(random_state=42)
final_model = final_model.fit(X_train,y_train)

In [None]:
submission_results = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/sample_submission.csv")
# The model was fitted on the raw pressure values (no log1p transform is applied
# anywhere above), so its predictions are already on the target scale: applying
# expm1 would distort them and floor would discard the fractional part.
submission_results["pressure"] = final_model.predict(test_data)
# Write a proper .csv file (the original name had no extension)
submission_results.to_csv('submission_results.csv', index=False)

In [None]:
# Set to True to work on a truncated training set for quick experiments
DEBUG = False

# Reload the raw competition files for the sequence-model section
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
        '../input/ventilator-pressure-prediction/sample_submission.csv',
    )
)

if DEBUG:
    # Keep only the first 80*10000 rows
    train = train.iloc[:80*10000]

In [None]:
def add_features(df):
    """Return a copy of ``df`` with engineered features and one-hot encoded R/C.

    Added columns
    -------------
    area : cumulative sum, within each ``breath_id``, of ``time_step * u_in``.
    u_in_cumsum : cumulative sum of ``u_in`` within each ``breath_id``.
    u_in_lag : ``u_in`` from two rows earlier in the same ``breath_id``
        (0 for the first two rows of every breath).

    ``R`` and ``C`` are cast to strings so that ``pd.get_dummies`` expands them
    into indicator columns (``R_<value>``, ``C_<value>``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``R`` and ``C``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra features and the dummy-encoded columns.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()
    breath = df['breath_id']

    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath).cumsum()

    # Shift within each breath: a plain shift would leak the last values of the
    # previous breath into the first rows of the next one.
    df['u_in_lag'] = df['u_in'].groupby(breath).shift(2).fillna(0)

    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    return pd.get_dummies(df)

train = add_features(train)
test = add_features(test)

In [None]:
# Pressure laid out as rows of 80 values
# (assumes every breath spans exactly 80 consecutive rows — TODO confirm)
targets = train['pressure'].to_numpy().reshape(-1, 80)

# Identifier columns are not model inputs; the target is removed from train only
id_columns = ['id', 'breath_id']
train = train.drop(columns=['pressure'] + id_columns)
test = test.drop(columns=id_columns)

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both sets.
RS = RobustScaler().fit(train)
train = RS.transform(train)
test = RS.transform(test)

In [None]:
# Regroup the flat rows into sequences of 80 time steps:
# (rows, features) -> (rows / 80, 80, features)
n_features = train.shape[-1]
train = train.reshape(-1, 80, n_features)
test = test.reshape(-1, 80, n_features)

In [None]:
EPOCH = 200
BATCH_SIZE = 1024

# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
# (tf.distribute.experimental.TPUStrategy is a deprecated alias of this class)
tpu_strategy = tf.distribute.TPUStrategy(tpu)


def build_model(input_shape, decay_steps):
    """Build and compile the stacked bidirectional-LSTM regressor.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, i.e. (time steps, features).
    decay_steps : float
        Number of optimizer steps over which the learning rate is multiplied
        by the decay rate (1e-5), starting from 1e-3.

    Returns
    -------
    keras.Model
        Compiled model (Adam optimizer, MAE loss) producing one value per time step.
    """
    model = keras.models.Sequential([
        keras.layers.Input(shape=input_shape),
        keras.layers.Bidirectional(keras.layers.LSTM(300, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(250, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(150, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences=True)),
        keras.layers.Dense(50, activation='selu'),
        keras.layers.Dense(1),
    ])
    # decay_steps is expressed in optimizer steps, so the schedule is handed to
    # the optimizer (evaluated every step). Passing it to the
    # LearningRateScheduler callback would evaluate it with the epoch index
    # instead, leaving the learning rate almost constant.
    scheduler = ExponentialDecay(1e-3, decay_steps, 1e-5)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=scheduler), loss="mae")
    return model


# 400 epochs' worth of batches, with each training fold holding 80% of the sequences
DECAY_STEPS = 400*((len(train)*0.8)/BATCH_SIZE)

kf = KFold(n_splits=5, shuffle=True, random_state=2021)
test_preds = []
for fold, (train_idx, valid_idx) in enumerate(kf.split(train, targets)):
    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
    X_train, X_valid = train[train_idx], train[valid_idx]
    y_train, y_valid = targets[train_idx], targets[valid_idx]

    # Only variable creation (model + optimizer) has to happen inside the scope
    with tpu_strategy.scope():
        model = build_model(train.shape[-2:], DECAY_STEPS)

    model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=EPOCH, batch_size=BATCH_SIZE)

    # Flatten the per-sequence predictions into one value per test row
    test_preds.append(model.predict(test).reshape(-1))


<div class="alert alert-block alert-success">  
<h1><center><strong> Submitting the predicted pressure on test data</strong></center></h1>
        
</div>

In [None]:
# Average the per-fold test predictions. Dividing by the number of folds
# actually collected keeps the mean correct if the fold count is changed.
submission["pressure"] = sum(test_preds) / len(test_preds)
submission.to_csv('submission.csv', index=False)

# <img src="https://thumbs.dreamstime.com/t/bright-colorful-thank-you-banner-vector-overlapping-letters-118244535.jpg">