<div align='center'><font size="5" color='#353B47'>Ventilator Pressure Prediction</font></div>
<div align='center'><font size="4" color="#353B47">EDA</font></div>
<br>
<hr>

In [None]:
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import pandas as pd
import numpy as np
import itertools
import pprint
import random
import os

from plotly.offline import iplot, init_notebook_mode
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
from plotly import tools
import plotly.io as pio

!pip install tslearn
from tslearn.clustering import TimeSeriesKMeans
from sklearn.model_selection import KFold

pp = pprint.PrettyPrinter(indent=4)
pio.templates.default = "plotly_white"

init_notebook_mode()

# <div id="summary">Table of contents</div>
**<font size="2"><a href="#chap1">I. Data info</a></font>**
**<br><font size="2"><a href="#chap2">II. Exploratory Data Analysis</a></font>**
**<br><font size="2"><a href="#chap3">III. Clustering</a></font>**
**<br><font size="2"><a href="#chap4">IV. Preprocessing</a></font>**

In [None]:
# Load the train and test sets.
# Both files are read from the same absolute Kaggle input directory so the
# result does not depend on the notebook's current working directory.
train = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/train.csv")
test = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/test.csv")

--------

# <div id="chap1">I. Data info</div>

**<font color="blue" size="5">Description</font>**

In [None]:
def describe_df(dataframe, train=True):
    """Print the row, column and distinct ``breath_id`` counts of a dataset.

    Args:
        dataframe: frame exposing a ``breath_id`` column.
        train: when truthy the summary is labelled "Train", otherwise "Test".
    """
    n_rows, n_columns = dataframe.shape
    n_breaths = len(dataframe.breath_id.unique())

    if not train:
        print(f"Test dataset contains:\n   {n_rows} rows\n   {n_columns} columns\n   {n_breaths} breath_id\n\n")
        return

    print(f"Train dataset contains:\n   {n_rows} rows\n   {n_columns} columns\n   {n_breaths} breath_id\n\n")
    
# Print the size summary of both datasets (rows, columns, distinct breaths)
describe_df(dataframe = train)
describe_df(dataframe = test, train=False)

**<font color="blue" size="5">Stats train</font>**

In [None]:
# Descriptive statistics of the train columns
train.describe()

**<font color="blue" size="5">Insight train</font>**

In [None]:
# Preview the first rows of train
train.head()

**<font color="blue" size="5">Missing values train</font>**

In [None]:
# Count the missing values in each column of train
train.isnull().sum()

--------

**<font size="2"><a href="#summary">Back to summary</a></font>**

# <div id="chap2">II. Exploratory Data Analysis</div>

This analysis focuses on a sample of the training set comprising 62 breath_ids.

**<font color="blue" size="5">Reminder</font>**

<font color="blue"><b>id</b></font> - globally-unique time step identifier across an entire file<br><br>
<font color="blue"><b>breath_id</b></font> - globally-unique identifier for breaths (all rows of one breath share the same value)<br><br>
<font color="blue"><b>R</b></font> - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.<br><br>
<font color="blue"><b>C</b></font> - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.<br><br>
<font color="blue"><b>time_step</b></font> - the actual time stamp.<br><br>
<font color="blue"><b>u_in</b></font> - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.<br><br>
<font color="blue"><b>u_out</b></font> - the control input for the expiratory solenoid valve. Either 0 or 1.<br><br>
<font color="blue"><b>pressure</b></font> - the airway pressure measured in the respiratory circuit, measured in cmH2O.

**<font color="blue" size="5">Time series</font>**

In [None]:
# Show the distinct values taken by the two lung attributes R and C
print(f"Unique values of R : {train.R.unique()}")
print(f"Unique values of C : {train.C.unique()}")

# Kept for reference (disabled): would list the distinct "R-C" string combinations
#(train['R'].astype(str) + '-' + train['C'].astype(str)).unique()

In [None]:
def plot_sample(dataframe, seed = 42):
    """Plot one randomly drawn breath for every combination of R and C.

    For each pair of values found in the ``R`` and ``C`` columns, one breath is
    drawn at random. Its ``pressure`` and ``u_in`` series are plotted on the
    primary y-axis and ``u_out`` on a secondary y-axis, all against
    ``time_step``. A vertical line marks the last time step at which
    ``u_out < 1``.

    Args:
        dataframe: frame with the columns ``R``, ``C``, ``breath_id``,
            ``time_step``, ``u_in``, ``u_out`` and ``pressure``.
        seed: value passed to ``np.random.seed`` before drawing the breaths.
            This reseeds NumPy's global generator, as the function has always
            done, so code running afterwards sees the reseeded state.
    """
    np.random.seed(seed)

    for r, c in itertools.product(dataframe.R.unique(), dataframe.C.unique()):

        candidates = dataframe[(dataframe.R == r) & (dataframe.C == c)]
        if candidates.empty:
            # Nothing to draw for a combination that never occurs together
            continue

        sample_id = candidates.breath_id.sample(n=1).iloc[0]
        plot_data = candidates[candidates.breath_id == sample_id]

        subfig = make_subplots(specs=[[{"secondary_y": True}]])

        # pressure and u_in share the primary axis, u_out gets the secondary one
        subfig.add_trace(
            go.Scatter(x=plot_data.time_step, y=plot_data.pressure, name='pressure'),
            secondary_y=False,
        )
        subfig.add_trace(
            go.Scatter(x=plot_data.time_step, y=plot_data.u_in, name='u_in'),
            secondary_y=False,
        )
        subfig.add_trace(
            go.Scatter(x=plot_data.time_step, y=plot_data.u_out, name='u_out'),
            secondary_y=True,
        )

        # The marker must be added to the figure that is actually shown (and
        # after its traces exist). It used to be added to a temporary figure
        # whose traces alone were copied over, so it never appeared.
        open_valve_steps = plot_data.loc[plot_data.u_out < 1, 'time_step']
        if not open_valve_steps.empty:
            x_breath_changing_state = open_valve_steps.loc[open_valve_steps.index.max()]
            subfig.add_vline(x=x_breath_changing_state)

        subfig.update_layout(title_text=f'Sample {sample_id} - R={r}, C={c}')
        subfig.update_yaxes(title_text="u_in/pressure Y", secondary_y=False)
        subfig.update_yaxes(title_text="u_out Y", secondary_y=True)

        subfig.show()


# One figure per (R, C) combination found in the training data
plot_sample(train)

In [None]:
# Draw the EDA sample among *distinct* breaths. Sampling the breath_id column
# directly draws rows, so one breath could be picked twice and the sample
# could end up with fewer breaths than intended.
# 5000 // 80 = 62 breaths (presumably ~5000 rows at 80 rows per breath - TODO confirm).
# random_state pins the draw so the sample is the same on every run.
breath_ids = pd.Series(train.breath_id.unique()).sample(
    n = 5000//80, replace = False, random_state = 42
)
train_EDA = train.loc[train.breath_id.isin(breath_ids), :].reset_index(drop = True)

In [None]:
# Pressure histogram of the EDA sample: 50 bins, coloured by u_out,
# with a box plot drawn in the margin.
fig = px.histogram(
    data_frame=train_EDA,
    x="pressure",
    color="u_out",
    nbins=50,
    marginal="box",
    hover_data=train_EDA.columns,
)
fig.update_layout(title="Pressure distribution")
fig.show()

In [None]:
# u_in histogram of the EDA sample: 50 bins, coloured by u_out,
# with a box plot drawn in the margin.
fig = px.histogram(
    data_frame=train_EDA,
    x="u_in",
    color="u_out",
    nbins=50,
    marginal="box",
    hover_data=train_EDA.columns,
)
fig.update_layout(title="u_in distribution")
fig.show()

In [None]:
# Number of rows per u_out value in the EDA sample
u_out_counts = train_EDA.u_out.value_counts()

fig = go.Figure(
    data=[go.Bar(x=list(u_out_counts.index), y=list(u_out_counts.values))],
    layout_title_text="u_out distribution",
)

# One tick per integer so only the observed u_out values are labelled
fig.update_layout(xaxis=dict(tickmode='linear', tick0=0, dtick=1))

fig.show()

# Drop the temporary counts
del u_out_counts

In [None]:
# Correlation heatmap of the EDA sample, keeping only the cells strictly below
# the diagonal (the diagonal and the mirrored upper half are blanked out).
corr_matrix = train_EDA.corr()
below_diagonal = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)

fig = px.imshow(corr_matrix.where(below_diagonal))
fig.show()

# Drop the temporaries
del corr_matrix, below_diagonal

--------

**<font size="2"><a href="#summary">Back to summary</a></font>**

# <div id="chap3">III. Clustering</div>

**<font color="blue" size="5">Display sample of pressure evolution over time_step</font>**

In [None]:
def display_ts_examples(dataframe, graph_indexes = np.arange(9)):
    """Plot the pressure curve of randomly chosen breaths on a 3x3 grid.

    Args:
        dataframe: frame with the columns ``breath_id``, ``time_step`` and
            ``pressure``.
        graph_indexes: zero-based grid positions to fill, one random breath
            per position. The grid has 9 cells.
    """
    # Computed once: the candidate breaths do not change between panels
    candidate_ids = dataframe.breath_id.unique()

    plt.figure(figsize=(12,12))

    for graph_index in graph_indexes:

        # Drawn with replacement, so a breath may appear in several panels
        breath_id = random.choice(candidate_ids)

        # Select the grid cell for this panel
        plt.subplot(3, 3, int(graph_index) + 1)
        plt.title('Breath id: %s \n'%breath_id,
                 fontsize=18)

        # Pressure of the chosen breath against its time steps
        ts_to_plot = dataframe.loc[dataframe.breath_id == breath_id, ['time_step', 'pressure']]
        pd.Series(ts_to_plot.pressure.values, index=ts_to_plot.time_step.values).plot()

    # Stretch the grid vertically so the two-line titles do not overlap the plots
    plt.subplots_adjust(bottom = 0.001, top = 1.25)
    plt.show()


# Nine random breaths from the EDA sample
display_ts_examples(train_EDA)

**<font color="blue" size="5">Create clusters with pressure data</font>**

In [None]:
def generate_matrix_cluster(dataframe, n = 300, seed = 42):
    """Stack the pressure series of every breath into one 3-D array.

    The breaths are taken from ``dataframe`` itself, in order of first
    appearance, and every breath must have the same number of rows.

    Args:
        dataframe: frame with the columns ``breath_id`` and ``pressure``; the
            rows of a breath are used in the order they appear.
        n: currently unused; kept so existing calls remain valid.
        seed: value passed to ``np.random.seed``. Nothing random happens in
            this function; the call is kept because it reseeds NumPy's global
            generator, which code running afterwards may rely on.

    Returns:
        ``numpy.ndarray`` of shape ``(n_breaths, n_steps, 1)``.
    """
    np.random.seed(seed)

    # One pass over the frame instead of one boolean scan per breath
    series = [
        pressure.to_numpy()
        for _, pressure in dataframe.groupby('breath_id', sort=False)['pressure']
    ]

    # Plain ndarray: np.matrix is always 2-D and silently drops the extra axis
    return np.asarray(series)[:, :, np.newaxis]

def run_clustering(matrix, n_clusters=3, metric="dtw", max_iter=10, random_state=None):
    """Fit a ``TimeSeriesKMeans`` model on a set of time series.

    Args:
        matrix: the time series to cluster, passed unchanged to ``fit``.
        n_clusters: number of clusters to form.
        metric: distance used by the model ("dtw" by default).
        max_iter: maximum number of k-means iterations.
        random_state: forwarded to the model; ``None`` keeps the previous
            behaviour of not fixing the model's own seed.

    Returns:
        The fitted ``TimeSeriesKMeans`` model.
    """
    model = TimeSeriesKMeans(
        n_clusters=n_clusters,
        metric=metric,
        max_iter=max_iter,
        random_state=random_state,
    )
    model.fit(matrix)

    return model

# Build the pressure series of the EDA sample and cluster them
matrix = generate_matrix_cluster(train_EDA)
cluster_p_model = run_clustering(matrix)

> More information about DTW in these insightful articles: <br><p><a href=https://towardsdatascience.com/dynamic-time-warping-3933f25fcdd>DTW</a><br><a href=https://towardsdatascience.com/how-to-apply-k-means-clustering-to-time-series-data-28d04a8f7da3>KMeans Clustering on time series data</a>

**<font color="blue" size="5">Display</font>**

In [None]:
def display_ts_clusters(model, n_clusters=3):
    """Plot the centre of each cluster of a fitted model on a 3x3 grid.

    Args:
        model: fitted clustering model exposing ``cluster_centers_``.
        n_clusters: number of centres to draw, starting from cluster 0.
    """
    plt.figure(figsize=(12,12))

    for cluster_no in range(n_clusters):

        # Select the grid cell for this cluster
        plt.subplot(3, 3, cluster_no + 1)
        plt.title('Cluster No: %s \n'%cluster_no,
                 fontsize=18)

        # Flatten the centre to one dimension and draw it as a curve
        center = model.cluster_centers_[cluster_no]
        pd.Series(center.ravel()).plot()

    # Stretch the grid vertically so the two-line titles do not overlap the plots
    plt.subplots_adjust(bottom = 0.001, top = 1.25)
    plt.show()

# Show the centres of the clusters fitted on the pressure series
display_ts_clusters(cluster_p_model)

--------

**<font size="2"><a href="#summary">Back to summary</a></font>**

# <div id="chap4">IV. Preprocessing</div>

**<font color="blue" size="5">Create folds properly</font>**

In [None]:
NB_FOLDS = 5


def create_folds(dataframe, n_splits=NB_FOLDS):
    """Add a ``kfold`` column that keeps every breath inside a single fold.

    ``KFold`` ignores the ``groups`` argument, so the previous version cut
    breaths across folds. ``GroupKFold`` guarantees that all rows sharing a
    ``breath_id`` land in the same validation fold.

    Args:
        dataframe: frame with the columns ``breath_id`` and ``pressure``. It
            is modified in place.
        n_splits: number of folds (defaults to ``NB_FOLDS``).

    Returns:
        The same ``dataframe`` object, with ``kfold`` set to a fold number
        from 1 to ``n_splits`` on every row.
    """
    # Local import: the notebook's import cell only brings in KFold
    from sklearn.model_selection import GroupKFold

    # -1 marks rows that have not been assigned to a fold yet
    dataframe["kfold"] = -1
    kfold_position = dataframe.columns.get_loc("kfold")

    splitter = GroupKFold(n_splits = n_splits)
    splits = splitter.split(
        X = dataframe,
        y = dataframe.pressure.values,
        groups = dataframe.breath_id.values,
    )

    # split() yields positional indices, hence iloc: loc would only be correct
    # when the index happens to be 0..n-1
    for fold_, (_, val_) in enumerate(splits, start=1):
        dataframe.iloc[val_, kfold_position] = fold_

    return dataframe

In [None]:
# create_folds adds the kfold column to train_EDA itself and returns that same
# object, so train_EDA_with_folds and train_EDA refer to one frame
train_EDA_with_folds = create_folds(train_EDA)

**<font color="blue" size="5">Create features</font>**

In [None]:
def create_features(dataframe, list_of_features = ('u_in', 'time_step')):
    """Return a copy of ``dataframe`` enriched with per-breath features.

    Columns added, in this order:

    * ``u_in_cumsum``: running sum of ``u_in`` within the breath.
    * ``u_in_lag_fwrd{1,2}`` / ``u_in_lag_back{1,2}``: ``u_in`` shifted by
      ``+lag`` / ``-lag`` rows within the breath, missing values set to 0.
    * ``time_diff``: difference between consecutive ``time_step`` values
      within the breath, 0 on the first row.
    * ``area``: running sum of ``time_step * u_in`` within the breath.
    * for every name in ``list_of_features``: ``<name>_max``, ``<name>_min``,
      ``<name>_mean``, ``<name>_median`` (per-breath statistics broadcast to
      every row) and ``<name>_range`` (max minus min).
    * ``RC``, ``R/C``, ``C/R``: product and ratios of the ``R`` and ``C`` columns.

    Args:
        dataframe: frame with the columns ``breath_id``, ``u_in``,
            ``time_step``, ``R`` and ``C``. It is left untouched.
        list_of_features: columns to summarise per breath.

    Returns:
        A new DataFrame holding the original columns plus the features above.
    """
    # Work on a copy so the caller's frame is never left half-updated
    dataframe = dataframe.copy()

    # Plain array as grouping key: no index alignment involved
    breath_key = dataframe['breath_id'].to_numpy()
    u_in_by_breath = dataframe['u_in'].groupby(breath_key)

    # u_in cumsum
    dataframe['u_in_cumsum'] = u_in_by_breath.cumsum()

    # u_in shifted both ways inside the breath
    for lag in range(1, 3):
        dataframe[f'u_in_lag_fwrd{lag}'] = u_in_by_breath.shift(lag).fillna(0)
        dataframe[f'u_in_lag_back{lag}'] = u_in_by_breath.shift(-lag).fillna(0)

    # time diff
    dataframe['time_diff'] = dataframe['time_step'].groupby(breath_key).diff(1).fillna(0)

    # NOTE(review): this accumulates time_step * u_in, not time_diff * u_in;
    # kept as is so the feature values do not change - confirm the intent.
    dataframe['area'] = (dataframe['time_step'] * dataframe['u_in']).groupby(breath_key).cumsum()

    for feature in list_of_features:

        # transform broadcasts each per-breath statistic back onto the rows,
        # which avoids a merge followed by a rename
        per_breath = dataframe[feature].groupby(breath_key)
        for stat in ('max', 'min', 'mean', 'median'):
            dataframe[f'{feature}_{stat}'] = per_breath.transform(stat)

        dataframe[f'{feature}_range'] = (
            dataframe[f'{feature}_max'] - dataframe[f'{feature}_min']
        ).clip(lower=0)

    # R, C
    dataframe['RC'] = dataframe['C'] * dataframe['R']
    dataframe['R/C'] = dataframe['R'] / dataframe['C']
    dataframe['C/R'] = dataframe['C'] / dataframe['R']

    return dataframe

# Build the feature table for the EDA sample and preview its first rows
train_EDA_ft = create_features(train_EDA)
train_EDA_ft.head()

<hr>
<div align='justify'><font color="#353B47" size="4">Thank you for taking the time to read this notebook. I hope that I was able to answer your questions or satisfy your curiosity, and that it was easy to follow. <u>Any constructive comments are welcome</u>. They help me progress and motivate me to share better-quality content. I am above all a passionate person who tries to advance my knowledge but also that of others. If you liked it, feel free to <u>upvote and share my work.</u> </font></div>
<br>
<div align='center'><font color="#353B47" size="3">Thank you and may passion guide you.</font></div>