In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Basic Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor

from sklearn.metrics import r2_score, accuracy_score, roc_auc_score


import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting/ML libraries,
# so consider disabling it while debugging.
warnings.filterwarnings('ignore')

In [None]:
def mape(actual, pred):
    """Return the mean absolute percentage error of ``pred`` against ``actual``, in percent.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    pred : array-like
        Predicted values; must be broadcastable against ``actual``.

    Returns
    -------
    float
        ``mean(|actual - pred| / max(|actual|, eps)) * 100``.

    Notes
    -----
    The denominator is floored at machine epsilon. Without the floor, an
    ``actual`` value of exactly 0 produces ``nan`` (when the prediction is also 0)
    or ``inf``, and a single such row makes the whole mean unusable. For every
    non-zero ``actual`` the result is unchanged.
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # Guard against division by zero while leaving non-zero targets untouched.
    denominator = np.maximum(np.abs(actual), np.finfo(float).eps)
    return np.mean(np.abs(actual - pred) / denominator) * 100

**Importing Dataset**

In [None]:
# Load the competition's train and test CSV files into DataFrames.
data_train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
data_test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')

# Data Processing

In [None]:
# Preview the first rows of the training data.
data_train.head()

In [None]:
# Preview the first rows of the test data.
data_test.head()

**Feature Description**

|Features|Description|
|:-------:|:----------|
|id|Globally-unique time step identifier across an entire file|
|breath_id|Globally-unique time step for breaths|
|R|Lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.|
|C|Lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.|
|time_step|The actual time stamp.|
|u_in|The control input for the inspiratory solenoid valve. Ranges from 0 to 100.|
|u_out|The control input for the expiratory solenoid valve. Either 0 or 1.|
|pressure|The airway pressure measured in the respiratory circuit, measured in cmH2O.|

In [None]:
# One row per dataset, holding its (rows, columns) shape.
sha = pd.DataFrame(
    [data_train.shape, data_test.shape],
    columns=['Rows', 'Columns'],
    index=['Train Dataset', 'Test Dataset'],
)
sha

In [None]:
# Print the column dtypes / non-null counts of the training frame, a separator line,
# and then the same summary for the test frame.
data_train.info()
print('====================================')
data_test.info()

**Missing Values**

In [None]:
# Bar chart of per-column data completeness for the training data.
msno.bar(data_train, figsize = (20,3), fontsize = 12)
plt.grid();

In [None]:
# Bar chart of per-column data completeness for the test data.
msno.bar(data_test, figsize=(20,3),fontsize=12)
plt.grid();

No missing values are observed in either the train or the test dataset.

**Duplicate Records**

In [None]:
# Boolean mask flagging training rows that exactly repeat an earlier row.
dupstr = data_train.duplicated()
print('Total no of duplicate values in Training Dataset = %d' % dupstr.sum())

# Show the flagged rows themselves.
data_train.loc[dupstr]

In [None]:
# Boolean mask flagging test rows that exactly repeat an earlier row.
dupstst = data_test.duplicated()
print('Total no of duplicate values in Test Dataset = %d' % dupstst.sum())

# Show the flagged rows themselves.
data_test.loc[dupstst]

# Exploratory Data Analysis

In [None]:
# Working frame for EDA: the training data without the two identifier columns.
# `drop` returns a new frame, so `data_train` itself is left untouched.
DF_train = data_train.drop(columns=['id', 'breath_id'])

In [None]:
# Annotated heatmap of the pairwise correlations between the EDA columns.
plt.figure(figsize=(20, 10))
sns.heatmap(
    DF_train.corr(),
    annot=True,
    fmt='3.2f',
    annot_kws={'size': 12},
    cmap="Set1",
)
# Enlarge the tick labels on both axes.
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

* Only the control input for the expiratory solenoid valve, i.e. u_out, shows a high positive correlation with time and a moderately negative correlation with pressure.
* u_in has a good positive correlation with pressure.

In [None]:
# Box plot of every column in DF_train, one subplot per column.
# The grid height is derived from the number of columns instead of being
# hard-coded, so adding or removing a column does not break the layout.
n_grid_cols = 3
n_grid_rows = -(-len(DF_train.columns) // n_grid_cols)  # ceiling division
# First colour of the 'Set2' palette; passing `palette=` without `hue` is
# deprecated in recent seaborn releases.
box_color = sns.color_palette('Set2')[0]

plt.figure(figsize = (20,6))
for position, col in enumerate(DF_train.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    # Pass the values through `x=` explicitly: a positional argument is read as
    # `x` by older seaborn versions but as `data` by newer ones.
    sns.boxplot(x=DF_train[col], color=box_color)
    plt.xlabel('{}'.format(col), fontsize = 15)
    plt.xticks(fontsize = 12)

plt.tight_layout()

In [None]:
# Frequency of each distinct time_step value in the training data.
data_train.time_step.value_counts()

In [None]:
# Number of distinct breaths in the training data.
unique_breaths = data_train['breath_id'].unique().size
print("No of Breaths:", unique_breaths)

# Distinct per-breath row counts; collapsed to a scalar when every breath has the same length.
breath_length = data_train.groupby('breath_id')['C'].count().unique()
breath_length = breath_length[0] if len(breath_length) == 1 else breath_length
print("Breath Length:", breath_length)

In [None]:
# Variance of R and C within each breath (a variance of 0 means the value is constant for that breath).
r_c_variance = data_train.groupby('breath_id')[['R','C']].var()
r_c_variance

* For each breath R and C values are constant

In [None]:
# Per-breath mean of R, then how many breaths share each value.
R_value = data_train.groupby('breath_id')['R'].mean()
print('Unique change in pressure per change in air flow:')
print(R_value.value_counts())

# Sorted distinct R values as integers.
distinct_R = R_value.unique()
R = np.sort(distinct_R).astype(int)

In [None]:
# Per-breath mean of C, then how many breaths share each value.
C_value = data_train.groupby('breath_id')['C'].mean()
print('Change in volume per change in pressure:')
print(C_value.value_counts())

# Sorted distinct C values as integers.
distinct_C = C_value.unique()
C = np.sort(distinct_C).astype(int)

**R_C combinations**

In [None]:
# For every (R, C) pair: [R, C, number of breaths], where the breath count is the
# number of matching rows divided by the rows-per-breath value.
rc_values = np.array([
    [
        r_val,
        c_val,
        ((data_train['R'] == r_val) & (data_train['C'] == c_val)).sum() // breath_length,
    ]
    for r_val in R
    for c_val in C
])

# Bar chart of breath counts, one bar per R_C combination.
x = range(len(rc_values))
tick_labels = [str(r_val) + '_' + str(c_val) for r_val, c_val in rc_values[:, :2]]
plt.figure(figsize=(20, 5))
plt.bar(x, rc_values[:, 2])
plt.xticks(x, tick_labels)
plt.xlabel('R_C')
plt.ylabel('Number counts')
plt.show()

In [None]:
# First 2000 rows: pressure and u_in on the top panel, u_out on the bottom panel.
plt.figure(figsize=(20, 12))

plt.subplot(2, 1, 1)
plt.plot(data_train['pressure'].iloc[:2000], linewidth=2, color='r', label="pressure")
plt.plot(data_train['u_in'].iloc[:2000], linewidth=2, color='g', label="u_in")
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(data_train['u_out'].iloc[:2000], linewidth=3, label="u_out")
plt.legend()

plt.show()

* Pressure rises when u_out=0, then drops when u_out=1.
* u_in has an irregular pattern and is somewhat correlated with pressure.

In [None]:
# Zoom in on the first 240 rows, with an x tick every 10 rows.
plt.figure(figsize=(20, 6))
plt.plot(data_train['pressure'].iloc[:240], linewidth=2, color='r', label="pressure")
plt.plot(data_train['u_in'].iloc[:240], linewidth=2, color='g', label="u_in")
plt.xticks(np.arange(0, 241, 10))
plt.xlim([-1, 241])
plt.legend()
plt.show()

* Each breath cycle is 80 time steps long.
* In every cycle, u_in rises sharply at 0 and then starts decreasing. At about 30 it falls sharply to 0 and remains there from 30 to 45; it then rises exponentially and stays constant until the end of the breath cycle.
* Pressure rises when u_in is triggered. It falls sharply after u_in drops to 0.
* The pressure does not fall to 0; rather, it holds its level until the next trigger of u_in.

In [None]:
# First 2000 rows of time_step, pressure and u_out on a single set of axes.
plt.figure(figsize = (20,6))
plt.plot(data_train.time_step[:2000], linewidth = 2, color = 'r', label="time step")
plt.plot(data_train.pressure[:2000], linewidth = 2, color = 'g', label="pressure")
plt.plot(data_train.u_out[:2000], linewidth = 2, color = 'k', label="u_out")
# One legend call is enough; the duplicated call was removed.
plt.legend()

plt.show()

**Select Input and Target Feature**

|Features|Importance Level for the Target Prediction|
|:-------:|:----------|
|id|This feature is irrelevant in pressure prediction|
|breath_id|Helps to identify the events, but not relevant for pressure prediction|
|R|For each breath cycle the R remains constant, no variation has been traced, so we should not consider it as necessary feature.|
|C|A similar pattern is observed as for R, hence it is not considered a required feature for the prediction of the target.|
|time_step|This feature is also not significant for the prediction of the target.|
|u_in|The only **major feature** that correlate to the pressure.|
|u_out|Behaves like a **switch** in the generation of pressure|

In [None]:
# Modelling frame: keep only the columns that remain after removing the
# identifiers, the lung attributes and the time stamp. `drop` returns a new
# frame, so `data_train` is not modified.
dataTrn = data_train.drop(columns=['id', 'breath_id', 'R', 'C', 'time_step'])

In [None]:
# Features are everything except the target; only the first 80000 rows are used.
X = dataTrn.drop(columns='pressure').iloc[:80000]
# Matching target values for the same rows.
y = dataTrn['pressure'].iloc[:80000]

In [None]:
# Random 70/30 row-level split with a fixed seed.
(X_train, X_validation,
 y_train, y_validation) = train_test_split(X, y, test_size=0.30, random_state=1)

# Report the shape of every piece of the split.
print(
    'Dimension of X_train:', X_train.shape,
    '\nDimension of X_validation:', X_validation.shape,
    '\nDimension of y_train:', y_train.shape,
    '\nDimension of y_validation:', y_validation.shape,
)

# Building Model

### Linear Regression

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
LR_model = LinearRegression()
LR_model.fit(X_train, y_train)

In [None]:
# Linear-regression predictions for the training and validation splits.
LR_train_predict, LR_validation_predict = (
    LR_model.predict(X_train),
    LR_model.predict(X_validation),
)

<span style='color:Brown'>**Model Evaluation metrics**</span>

In [None]:
# R^2 on the training and validation splits.
LR_R2_train, LR_R2_validation = (
    r2_score(y_train, LR_train_predict),
    r2_score(y_validation, LR_validation_predict),
)

# MAPE (in percent) on the training and validation splits.
LR_MAPE_train, LR_MAPE_validation = (
    mape(y_train, LR_train_predict),
    mape(y_validation, LR_validation_predict),
)


In [None]:
# Single-row table with the validation scores of the linear regression.
resultsLR = pd.DataFrame(
    [[LR_R2_validation, LR_MAPE_validation]],
    columns=['R Squared Score', 'MAPE Score'],
    index=['Linear Regression'],
)

resultsLR

### Light Gradient Boosted Machine Regressor

In [None]:
# Pipeline: standardise the features, then fit a LightGBM regressor with 501 trees.
LGBMR_model = make_pipeline(StandardScaler(), LGBMRegressor(n_estimators=501))
LGBMR_model.fit(X_train, y_train)

In [None]:
# LightGBM pipeline predictions for the training and validation splits.
LGBMR_train_predict, LGBMR_validation_predict = (
    LGBMR_model.predict(X_train),
    LGBMR_model.predict(X_validation),
)

<span style='color:Brown'>**Model Evaluation metrics**</span>

In [None]:
# R^2 on the training and validation splits.
LGBMR_R2_train, LGBMR_R2_validation = (
    r2_score(y_train, LGBMR_train_predict),
    r2_score(y_validation, LGBMR_validation_predict),
)

# MAPE (in percent) on the training and validation splits.
LGBMR_MAPE_train, LGBMR_MAPE_validation = (
    mape(y_train, LGBMR_train_predict),
    mape(y_validation, LGBMR_validation_predict),
)

In [None]:
# Single-row table with the validation scores of the LightGBM pipeline.
resultsLGBMR = pd.DataFrame(
    [[LGBMR_R2_validation, LGBMR_MAPE_validation]],
    columns=['R Squared Score', 'MAPE Score'],
    index=['Light Gradient Boosted Machine'],
)

# Stack it on top of the linear-regression row for comparison.
results2 = pd.concat([resultsLGBMR, resultsLR])
results2

### Random Forest Regressor

In [None]:
# Fit a random forest with 501 trees and a fixed seed on the training split.
RF_model=RandomForestRegressor(n_estimators = 501, random_state=0)
RF_model.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the training and validation splits.
RF_train_predict, RF_validation_predict = (
    RF_model.predict(X_train),
    RF_model.predict(X_validation),
)

<span style='color:Brown'>**Model Evaluation metrics**</span>

In [None]:
# R^2 on the training and validation splits.
RF_R2_train, RF_R2_validation = (
    r2_score(y_train, RF_train_predict),
    r2_score(y_validation, RF_validation_predict),
)

# MAPE (in percent) on the training and validation splits.
RF_MAPE_train, RF_MAPE_validation = (
    mape(y_train, RF_train_predict),
    mape(y_validation, RF_validation_predict),
)

In [None]:
# Single-row table with the validation scores of the random forest.
resultsRF = pd.DataFrame(
    [[RF_R2_validation, RF_MAPE_validation]],
    columns=['R Squared Score', 'MAPE Score'],
    index=['Random Forest'],
)

# Stack it on top of the rows collected so far.
results3 = pd.concat([resultsRF, results2])
results3

### XGBoost Regressor

In [None]:
# Fit an XGBoost regressor with its default hyper-parameters on the training split.
XGB_model = XGBRegressor()
XGB_model.fit(X_train, y_train)

In [None]:
# XGBoost predictions for the training and validation splits.
XGB_train_predict, XGB_validation_predict = (
    XGB_model.predict(X_train),
    XGB_model.predict(X_validation),
)

<span style='color:Brown'>**Model Evaluation metrics**</span>

In [None]:
# R^2 on the training and validation splits.
XGB_R2_train, XGB_R2_validation = (
    r2_score(y_train, XGB_train_predict),
    r2_score(y_validation, XGB_validation_predict),
)

# MAPE (in percent) on the training and validation splits.
XGB_MAPE_train, XGB_MAPE_validation = (
    mape(y_train, XGB_train_predict),
    mape(y_validation, XGB_validation_predict),
)

In [None]:
# Single-row table with the validation scores of the XGBoost regressor.
resultsXGB = pd.DataFrame(
    [[XGB_R2_validation, XGB_MAPE_validation]],
    columns=['R Squared Score', 'MAPE Score'],
    index=['XGBoost Regressor'],
)

# Final comparison table: XGBoost on top of all previously collected rows.
results4 = pd.concat([resultsXGB, results3])
results4

# Verdict

The Light Gradient Boosted Machine Regressor has been selected because it shows the highest R2 value among all the tested models.

In [None]:
# Load the competition's sample submission file to use as the output template.
Submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

In [None]:
# Remove from the test frame the same columns that were removed from the training
# features. `errors='ignore'` makes the cell safe to re-run: without it, a second
# execution raises KeyError because the columns are already gone.
data_test.drop(columns=['id', 'breath_id', 'R', 'C', 'time_step'], inplace = True, errors = 'ignore')

In [None]:
# Predict the target for the test features with the fitted LightGBM pipeline.
Final_Prediction = LGBMR_model.predict(data_test)

**Saving the Prediction**

In [None]:
# Store the predictions in the submission's pressure column. Column-key assignment
# is used because attribute assignment silently creates a plain Python attribute
# (not a column) if the column name is ever missing.
Submission['pressure'] = Final_Prediction

# Write the submission to the working directory; previously the predictions were
# only held in memory and never saved.
Submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the filled-in submission.
Submission.head()