# Ventilator Pressure EDA & Simple Heuristics
_By Nick Brooks, September 2021_

**Resources**
- https://github.com/nicapotato/nicaviz
- https://github.com/nicapotato/nicaviz/blob/master/notebooks/Plot_Examples.ipynb

In [None]:
!pip install --user watermark
!pip install nicaviz

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nicaviz
import seaborn as sns
import time
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler, OneHotEncoder

from sklearn.metrics import mean_absolute_error
import itertools
import seaborn as sns
import math

pd.options.display.max_rows = 999
pd.options.display.width = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 5000

sns.set_style("whitegrid")

%load_ext watermark
%watermark 

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

notebookstart = time.time()
%watermark --iversions

In [None]:
def ae(y_true, y_score):
    """Return the element-wise absolute error between targets and predictions."""
    residual = y_true - y_score
    return np.abs(residual)

def breath_id_plotter(df, plotcols, rows):
    """Plot randomly chosen breaths on a ``rows`` x 3 grid of line charts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``breath_id``, ``time_step``, ``R`` and ``C`` columns
        in addition to every column named in ``plotcols``.
    plotcols : list of str
        Columns drawn against ``time_step`` in each panel.
    rows : int
        Number of grid rows; ``rows * 3`` panels are drawn.
    """
    n_panels = rows * 3
    breath_ids = df['breath_id'].unique()
    # Sample without replacement so the same breath is not drawn twice; fall
    # back to sampling with replacement only when there are fewer distinct
    # breaths than panels to fill.
    sampled_ids = np.random.choice(
        breath_ids, n_panels, replace=len(breath_ids) < n_panels)

    _, axes = plt.subplots(rows, 3, figsize=[16, rows * 3.3])
    for breath_id, panel in zip(sampled_ids, axes.ravel()):
        breath_df = df.loc[df['breath_id'] == breath_id, :]
        # R and C are read from the first row of the breath for the title.
        info = breath_df.loc[:, ["R", "C"]].iloc[0].to_dict()
        breath_df.set_index("time_step")\
            .loc[:, plotcols]\
            .plot(ax=panel, linewidth=2)
        panel.set_title(f'Breath Id {breath_id}\n{info}')
        panel.set_ylabel("Values")
        panel.set_xlabel("Time Step")
        panel.grid(True, lw=1, ls='--', c='.75')
        panel.spines['top'].set_visible(False)
        panel.spines['right'].set_visible(False)

    plt.tight_layout(pad=1)
    plt.show()

def feature_prep_cate_le(xtrain, xtest, categorical_columns):
    """Label-encode the given columns, fitting each encoder on ``xtrain``.

    The encoded values are written back into ``xtrain`` and ``xtest``, and
    those same objects are returned together with a dict mapping each column
    name to the number of classes its encoder learned.
    """
    categorical_dims = {}
    for name in categorical_columns:
        encoder = LabelEncoder().fit(xtrain[name].values)
        xtrain[name] = encoder.transform(xtrain[name].values)
        xtest[name] = encoder.transform(xtest[name].values)
        categorical_dims[name] = len(encoder.classes_)
    return xtrain, xtest, categorical_dims

def feature_prep_cate_ohe(xtrain, xtest, categorical_columns):
    """One-hot encode the given columns, fitting each encoder on ``xtrain``.

    Parameters
    ----------
    xtrain, xtest : pandas.DataFrame
        Frames containing every column in ``categorical_columns``. They are
        not modified.
    categorical_columns : list of str
        Columns to encode; each gets its own ``OneHotEncoder``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_ohe, test_ohe)`` with one ``"<column>_<category>"`` indicator
        column per category seen in ``xtrain``. Each output keeps the row
        index of its input frame so it lines up in a later column-wise concat.
    """
    train_parts = []
    test_parts = []
    for col in categorical_columns:
        ohe = OneHotEncoder()
        # toarray() yields a plain ndarray rather than the np.matrix that
        # todense() returns.
        train_ohe = ohe.fit_transform(
            xtrain[col].values.reshape(-1, 1)).toarray()
        test_ohe = ohe.transform(xtest[col].values.reshape(-1, 1)).toarray()
        colnames = ['{}_{}'.format(col, x) for x in ohe.categories_[0]]
        train_parts.append(
            pd.DataFrame(train_ohe, columns=colnames, index=xtrain.index))
        test_parts.append(
            pd.DataFrame(test_ohe, columns=colnames, index=xtest.index))

    if not train_parts:
        # pd.concat rejects an empty list; return column-less frames instead.
        return pd.DataFrame(index=xtrain.index), pd.DataFrame(index=xtest.index)

    # A single concat avoids re-copying the accumulated frame on every column.
    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

def feature_prep_numeric(xtrain, xtest,  numerical_columns):
    """Scale the given columns with a ``RobustScaler`` fitted on ``xtrain``.

    Each column gets its own scaler. The scaled values are written back into
    ``xtrain`` and ``xtest``, and those same objects are returned, so pass
    copies if the originals must stay untouched.
    """
    for col in numerical_columns:
        scaler = RobustScaler()
        # The scaler works on (n, 1) arrays; flatten the result before
        # storing it back as a column.
        xtrain[col] = scaler.fit_transform(
            xtrain[col].values.reshape(-1, 1)).ravel()
        xtest[col] = scaler.transform(
            xtest[col].values.reshape(-1, 1)).ravel()
    return xtrain, xtest

**Data Dictionary:** <br>
- **id** - globally-unique time step identifier across an entire file
- **breath_id** - globally-unique identifier for each breath
- **R** - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.
- **C** - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.
- **time_step** - the actual time stamp.
- **u_in** - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.
- **u_out** - the control input for the expiratory solenoid valve. Either 0 or 1.
- **pressure** - the airway pressure measured in the respiratory circuit, measured in cmH2O.

In [None]:
# Load the competition test and train tables from the Kaggle input directory.
test = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/test.csv")
df = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/train.csv")
# Row-wise gap between the measured pressure and the inspiratory valve input.
df['pressure_uin_diff'] = df['pressure'] - df['u_in']

# Preview the first rows, then show the summary produced by the nicaviz accessor.
display(df.head())
display(df.nica.categorical_describe())

In [None]:
# Column groups reused by the plotting cells below.
categoricals = ["R", "C", "u_out"]
continuous = ['u_in', 'pressure', "pressure_uin_diff"]
# Name of the time column.
timevar = 'time_step'

In [None]:
# Count plots for the categorical columns, three plots per row.
df.nica.mass_plot(
    plt_set = categoricals,
    columns = 3,
    plottype = "countplot")

In [None]:
# Distribution plots for the continuous columns, three plots per row.
df.nica.mass_plot(
    plt_set = continuous,
    columns = 3,
    plottype = "distplot")

# Off-diagonal entry of the 2x2 correlation matrix, i.e. corr(u_in, pressure).
print("['u_in', 'pressure'] Correlation: {}".format(df[['u_in', 'pressure']].corr().iloc[0,1]))

In [None]:
# For each continuous column, call the nicaviz pivot plot with the categorical
# columns and np.mean as the aggregation function.
for cont in continuous:
    df.nica.pivot_plots(categoricals, cont, np.mean)

In [None]:
# Two rows (six panels) of randomly chosen breaths: valve inputs vs. pressure.
breath_id_plotter(df, ['u_in','u_out','pressure'], 2)

## Simple Heuristics

In [None]:
categoricals = ['R', 'C', 'u_out']

# Every non-empty subset of the categoricals, ordered by subset size.
group_canditates = [
    list(combo)
    for size in range(1, len(categoricals) + 1)
    for combo in itertools.combinations(categoricals, size)
]

# For each grouping, predict pressure as the group's mean pressure and record
# (overall MAE, per-group MAE rounded to 2 decimals).
results = {}
for catcol in group_canditates:
    catcolname = "-".join(catcol)
    mean_col = f'{catcolname}_mean_pressure'
    ae_col = f'{catcolname}_mean_pressure_ae'

    df[mean_col] = df.groupby(catcol)['pressure'].transform('mean')
    df[ae_col] = ae(df['pressure'].values, df[mean_col].values)

    per_group_mae = df.groupby(catcol)[ae_col].mean().round(2)
    overall_mae = np.mean(df[ae_col])
    results[catcolname] = (overall_mae, per_group_mae)

In [None]:
# One heatmap of per-group MAE for every grouping experiment in `results`.
n_plots = len(results)

cols = 3
rows = math.ceil(n_plots / cols)

fig, axes = plt.subplots(rows, cols, figsize = [cols*4,rows*4])
panels = axes.ravel()
for key, panel in zip(results, panels):
    sns.heatmap(results[key][1].to_frame(), cmap='Greens_r', ax=panel)
    panel.set_title("MAE Pressure for\n{}".format(key))
    panel.set_xlabel("")

# Hide the grid slots left over when n_plots is not a multiple of `cols`,
# so the figure does not show empty axes.
for panel in panels[n_plots:]:
    panel.set_visible(False)

plt.tight_layout(pad=0)
plt.show()

In [None]:
# One row per grouping experiment. Each `results` value is an
# (overall MAE, per-group MAE Series) pair, so after the transpose the two
# tuple positions become the two columns named below.
results_pd = pd.DataFrame(results).T
results_pd.columns = ["score", "groupby_experiments"]
# Turn each per-group Series into a plain dict for display.
results_pd['groupby_experiments'] = results_pd['groupby_experiments'].apply(lambda x: x.to_dict())
# Lowest overall MAE first.
results_pd.sort_values(by='score', ascending=True, inplace=True)
display(results_pd)

In [None]:
# Three random breaths from the rows with C == 50 and u_out == 1, overlaying
# the C/u_out group-mean pressure heuristic.
breath_id_plotter(df.query("C == 50 & u_out == 1"), ['u_in','u_out','pressure', 'C-u_out_mean_pressure'], 1)

In [None]:
# Three random breaths overlaying the R/C/u_out group-mean pressure heuristic.
breath_id_plotter(df, ['u_in','u_out','pressure', 'R-C-u_out_mean_pressure'], 1)

In [None]:
# Feature lists for the linear model: one-hot encoded vs. scaled columns.
cat_features = ['u_out','R','C']
cont_features = ['time_step', 'u_in']

train_ohe, test_ohe = feature_prep_cate_ohe(df, test, cat_features)
# feature_prep_numeric overwrites the columns of the frames it receives, so it
# is given copies to leave df and test unscaled.
train_cont, test_cont = feature_prep_numeric(df[cont_features].copy(), test[cont_features].copy(), cont_features)

In [None]:
# Join the one-hot and scaled blocks column-wise into the model matrices.
# nicaviz.reduce_mem_usage returns a pair; only the first element is kept
# (presumably the memory-reduced frame; confirm against nicaviz).
train_X, _ = nicaviz.reduce_mem_usage(pd.concat([train_ohe, train_cont], axis = 1))
train_y = df['pressure'].values
test_X, _ = nicaviz.reduce_mem_usage(pd.concat([test_ohe, test_cont], axis = 1))

In [None]:
# Preview the assembled training features.
train_X.head()

In [None]:
with nicaviz.timer("Build Model"):
    # 5-fold cross-validated linear regression. The scorer is the negated
    # MAE, hence abs() when reporting the mean.
    model = linear_model.LinearRegression()
    score = cross_val_score(model, train_X, train_y, cv=5, scoring='neg_mean_absolute_error')
    print("CV Mean: {:.3f} MAE +/- {:.5f}".format(abs(score.mean()), score.std()))

with nicaviz.timer("OOF Pred"):
    # Cross-validated (out-of-fold) predictions for every training row.
    pred = cross_val_predict(model, train_X, train_y, cv=5)
    df['linear_model_oof'] = pred

# Refit on the full training set to predict the test set and to inspect the
# fitted coefficients.
model_full = linear_model.LinearRegression()
model_full.fit(train_X, train_y)
test_pred = model_full.predict(test_X)
test['linear_model_pred'] = test_pred
print("Linear Model Coefficients")
for name, coef in zip(train_X.columns, model_full.coef_):
    print("{}: {}".format(name, coef))

In [None]:
# Six random breaths comparing the linear model's out-of-fold prediction with
# the measured pressure.
breath_id_plotter(df, ['u_in','u_out','pressure', 'linear_model_oof'], 2)

## Submissions

In [None]:
# Baseline submission: use the raw u_in value as the predicted pressure.
submission = test[['id','u_in']].rename(columns = {"u_in": "pressure"})
submission.to_csv("u_in_submission.csv", index=False)
!head u_in_submission.csv

In [None]:
# Submission built from the full-data linear regression predictions.
submission = test[['id','linear_model_pred']].rename(columns = {"linear_model_pred": "pressure"})
submission.to_csv("linear_regression_submission.csv", index=False)
!head linear_regression_submission.csv

In [None]:
# Wall-clock time since `notebookstart` was recorded, in minutes.
elapsed_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % elapsed_minutes)