In [None]:
import os
import warnings
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, mean_squared_error, roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report



# Notebook-wide display configuration: let pandas render up to 100 columns
# and 500 rows before truncating its output.
for option_name, option_value in (("display.max_columns", 100), ("display.max_rows", 500)):
    pd.set_option(option_name, option_value)

# Seaborn theme: dark grid background, text rendered with the IPAexGothic font.
sns.set(style="darkgrid", font="IPAexGothic")

# Suppress every warning for the remainder of the session.
warnings.simplefilter("ignore")

In [None]:
# Read the three competition CSV files from the shared input directory.
input_dir = "../input/ventilator-pressure-prediction"
train = pd.read_csv(f"{input_dir}/train.csv")
test = pd.read_csv(f"{input_dir}/test.csv")
sample_submission = pd.read_csv(f"{input_dir}/sample_submission.csv")

## Columns
* id - globally-unique time step identifier across an entire file
* breath_id - globally-unique identifier for each breath (all time steps of one breath share the same breath_id)
* R - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.
* C - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.
* time_step - the actual time stamp.
* u_in - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.
* u_out - the control input for the expiratory solenoid valve. Either 0 or 1.
* pressure - the airway pressure measured in the respiratory circuit, measured in cmH2O.


In [None]:
# Dataset overview: row counts, shape and first rows of each split.
n_train, n_test = len(train), len(test)

for split_name, split_df in (("train set", train), ("test set", test)):
    print(split_name)
    print("shape : ", split_df.shape)
    display(split_df.head())

# Per-column cardinality of each split, aligned on the column name.
nunique_train = train.nunique().to_frame(name="train_set")
nunique_test = test.nunique().to_frame(name="test_set")

print("The number of unique values")
display(pd.merge(nunique_train, nunique_test, left_index=True, right_index=True))

### check missing data

In [None]:
# Count missing values per column for both splits.
print("=====The Number of missing data")
for split_name, split_df in (("train set", train), ("test set", test)):
    print(split_name)
    display(split_df.isna().sum().to_frame())

### breath and time

In [None]:
# A (breath_id, time_step) pair should identify exactly one row; report when
# dropping duplicate pairs changes the row count.
key_cols = ["breath_id", "time_step"]
if len(train[key_cols].drop_duplicates()) != n_train:
    print("breath_id and time_step of train set are not unique.")
if len(test[key_cols].drop_duplicates()) != n_test:
    print("breath_id and time_step of test_set are not unique.")

# Histogram of the number of time_step rows per breath_id, one panel per split.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
for panel, panel_title, split_df in ((ax1, "Train set", train), (ax2, "Test set", test)):
    steps_per_breath = split_df.groupby("breath_id")["time_step"].count()
    steps_per_breath.hist(ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel("Number of time_step")
    panel.set_ylabel("Counts")

plt.show()
plt.close()

Each breath id has 80 time_steps

### Distribution of time step

In [None]:
# Min / mean / max time_step of every breath, then summary statistics of
# those three per-breath values across all breaths.
(
    train
    .groupby("breath_id")["time_step"]
    .agg(["min", "mean", "max"])
    .describe()
)

The longest breath is about 3 seconds.


Time range of each state of the expiratory solenoid valve (`u_out`)

In [None]:
# Earliest, average and latest time_step observed for each u_out value.
(
    train
    .groupby("u_out")["time_step"]
    .agg(["min", "mean", "max"])
)

### R and C

The number of combinations of R and C of one breath_id

In [None]:
# Number of breaths that carry two or more distinct values of R (resp. C).
train.groupby("breath_id")[["R", "C"]].nunique().ge(2).sum()

Each breath has one combination of R and C

In [None]:
from itertools import product
def show_heatmap(data) -> None:
    """
    Show an annotated heatmap of a 2-D table.

    Rows of ``data`` are drawn along the y-axis and columns along the x-axis
    (the layout used by ``imshow``); the value of every cell is printed on
    top of it.

    Parameters
    -----
    data : pd.DataFrame
        plot data. Index labels become the y tick labels and column labels
        become the x tick labels. The axis titles are taken from
        ``data.index.name`` and ``data.columns.name``; when those are unset
        they fall back to "R" (rows) and "C" (columns).
    """

    #Parameters
    cmap = "Greens"
    text_color = ("black", "white")
    fontsize = 12

    n_rows, n_cols = data.shape
    row_label = data.index.name if data.index.name is not None else "R"
    col_label = data.columns.name if data.columns.name is not None else "C"

    fig, ax = plt.subplots()
    im = ax.imshow(data, cmap=cmap)

    # Cells above half of the normalised maximum are dark, so they get white
    # text; lighter cells get black text.
    th = im.norm(data.values.max()) / 2
    for i, j in product(range(n_rows), range(n_cols)):
        color = text_color[int(im.norm(data.iloc[i, j]) > th)]
        # x is the column position and y is the row position.
        ax.text(
            j, i, str(data.iloc[i, j])
            , horizontalalignment="center"
            , verticalalignment="center"
            , color=color
            , fontsize=fontsize
        )

    cbar = fig.colorbar(im)
    cbar.ax.set_ylabel("Counts", rotation=-90, va="bottom", fontsize=fontsize)

    # Put the x tick labels and the x-axis title above the image.
    ax.tick_params(top=True, bottom=False,
                   labeltop=True, labelbottom=False)
    ax.xaxis.set_label_position("top")

    # Columns run along x and rows along y, so the ticks must follow the
    # same orientation (the labels were previously swapped).
    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(data.columns, fontsize=fontsize)
    ax.set_xlabel(col_label)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(data.index, fontsize=fontsize)
    ax.set_ylabel(row_label)

    ax.grid(False)

    fig.tight_layout()
    plt.show()
    plt.close()

In [None]:
# Count the rows of every (R, C) combination and draw one heatmap per split.
for set_header, set_df in (("===== Train set", train), ("===== Test set", test)):
    print(set_header)
    # value_counts yields an (R, C)-indexed Series; unstack pivots C into columns.
    tmp = set_df[["R", "C"]].value_counts().sort_index().unstack()
    show_heatmap(tmp)

### Select some breath and Visualize the change with time_step

In [None]:
n_breath = 3

# Draw from the distinct breath ids. Every id is repeated once per time step
# in `train`, so sampling the raw column could return the same breath several
# times even with replace=False. A seeded generator keeps the selection
# stable across kernel restarts.
breath_rng = np.random.default_rng(42)
breath_list = breath_rng.choice(train["breath_id"].unique(), size=n_breath, replace=False)

print(f"Selected breath ids are {', '.join(breath_list.astype(str))}")

# All rows (every time step) of the selected breaths.
check_breath_df = train[train["breath_id"].isin(breath_list)]
display(check_breath_df.head())

print("The number of unique values of each breath")
display(check_breath_df.groupby("breath_id").nunique())

Plot `u_in`, `u_out` and `pressure` of each selected breath from beginning to end (one line per breath).

In [None]:
# One figure per signal, with one line per selected breath.
for y in ["u_in", "u_out", "pressure"]:
    fig, ax = plt.subplots(1, 1, figsize=(18, 6))
    sns.lineplot(
        x="time_step"
        , y=y
        , data=check_breath_df
        , hue="breath_id"
        , palette="Set1"
        , ax=ax  # draw on this figure's axes explicitly, not the implicit current axes
    )

    ax.set_title(y, fontsize=16)

    # Tighten the layout of every figure, not only the last one created.
    fig.tight_layout()

plt.show()
# Release all figures created in the loop.
plt.close("all")

### Pressure

In [None]:
# Count and share of rows whose measured pressure is below zero.
is_negative_pressure = train["pressure"] < 0
negative_pressure_records = int(is_negative_pressure.sum())
negative_pressure_pct = 100 * negative_pressure_records / len(train)
print("Number of records of pressure < 0 : ", negative_pressure_records)
print(f"Ratio of records of pressure < 0 : {negative_pressure_pct:.4f} %")

I don't know if those records are correct.

Distribution of pressure

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(18, 12))

# Top panel: histogram of pressure over all training rows.
sns.histplot(
    x="pressure"
    , data=train
    , ax=ax1
)
ax1.set_title("Distribution of pressure", fontsize=15)

# Bottom panel: the same histogram split by the u_out value.
sns.histplot(
    x="pressure"
    , data=train
    , ax=ax2
    , hue="u_out"
)
ax2.set_title("Distribution of pressure of each u_out", fontsize=15)

plt.show()
plt.close()

* There are different trends for each u_out.
* There are some spikes.


In [None]:
#Rank the records by time_step
train["time_step_idx"] = train.groupby("breath_id")["time_step"].rank().astype(int)

"""
Check distribution of time_step for each time_step_idx
"""

desc_df = train.groupby("time_step_idx").describe()["time_step"]

desc_df.head()

print("min_std : ", desc_df["std"].min())
print("max_std : ", desc_df["std"].max())

fig, ax = plt.subplots(figsize=(10, 10))
desc_df[["mean", "std", "min", "25%", "50%", "75%", "max"]].plot(ax=ax)

ax.set_title("Statistic value of time_step and time_step_idx")
ax.set_xlabel("time_step_idx")
ax.set_ylabel("time_step")

plt.show()
plt.close()

### Time_step_idx vs pressure

In [None]:
# Pressure at every within-breath step index, one line per u_out value,
# with the spread drawn as error bars.
fig, ax = plt.subplots(figsize=(18, 6))
lineplot_kwargs = dict(
    data=train,
    x="time_step_idx",
    y="pressure",
    hue="u_out",
    err_style="bars",
    ax=ax,
)
sns.lineplot(**lineplot_kwargs)

plt.show()
plt.close()

## About time_step = 0

In [None]:
# Rows recorded at time_step == 0. Take an explicit copy so that adding a
# column below writes to an independent frame instead of a slice of train
# (the warning pandas would raise is hidden by the global warnings filter).
time_step0_df = train.query("time_step == 0").copy()

# Label combining both lung attributes, e.g. "R20:C50", used as the plot hue.
time_step0_df["hue"] = "R" + time_step0_df["R"].astype(str) + ":C" + time_step0_df["C"].astype(str)

In [None]:
# Compare the pressure histogram at time_step == 0 with the one over all rows.
# The two histograms use separate y-axes that share the same x-axis.
fig, ax = plt.subplots(figsize=(12, 8))
hist_style = dict(bins=20, alpha=0.4)

# Left y-axis: rows at time_step == 0.
ax.hist(time_step0_df["pressure"], label="time_step=0", **hist_style)
ax.set_xlabel("Pressure")

# Right y-axis: every training row.
ax_all = ax.twinx()
ax_all.hist(train["pressure"], color="darkorange", label="All", **hist_style)

fig.legend()

plt.show()
plt.close()

In [None]:
# Scatter of u_in against pressure at time_step == 0, coloured by the
# combined R/C label stored in the "hue" column.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

scatter_kwargs = dict(
    data=time_step0_df,
    x="u_in",
    y="pressure",
    hue="hue",
    ax=ax,
)
sns.scatterplot(**scatter_kwargs)

ax.set_title("u_in vs pressure")

plt.show()
plt.close()

Records with R = 50 and C = 10 have low pressure.