In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_path = '../input/ventilator-pressure-prediction/train.csv'
test_path = '../input/ventilator-pressure-prediction/test.csv'
df_train, df_test = (pd.read_csv(path) for path in (train_path, test_path))

In [None]:
# Add `seq`: each row's 0-based position within its breath once rows are ordered by
# (breath_id, time_step). The counter is computed on a sorted copy and, on assignment,
# pandas aligns it back to the unsorted frame through the row index.
for frame in (df_train, df_test):
    frame['seq'] = (
        frame.sort_values(['breath_id', 'time_step'])
        .groupby('breath_id')['id']
        .cumcount()
    )

## Summary Statistics

In [None]:
def _print_counts(heading, frame):
    """Print a heading, then the row count and the number of distinct id / breath_id values."""
    print(heading)
    print(f'Row Count:{frame.shape[0]}')
    print(f'Number of Unique IDs: {frame["id"].nunique()}')
    print(f'Number of Unique Breath IDs: {frame["breath_id"].nunique()}')


_print_counts('Training Set', df_train)
print('-' * 20)
_print_counts('Test Set', df_test)

In [None]:
# Preview the first rows of the training set (includes the `seq` column added above).
df_train.head()

In [None]:
# Descriptive statistics (DataFrame.describe) for the training set.
df_train.describe()

In [None]:
# Preview the first rows of the test set (includes the `seq` column added above).
df_test.head()

In [None]:
# Descriptive statistics (DataFrame.describe) for the test set.
df_test.describe()

**How many timesteps does each breath_id contain?**  
Every breath_id has exactly 80 timesteps.

In [None]:
# Count the rows recorded for each breath in the training set, then plot how those
# per-breath counts are distributed (one unit-width bar per distinct count).
train_steps_per_breath = (
    df_train.groupby('breath_id', as_index=False)['id']
    .count()
    .rename(columns={'id': 'n_timesteps'})  # the column holds a count, not an id
)

fig, ax = plt.subplots()
sns.histplot(data=train_steps_per_breath, x='n_timesteps', discrete=True, ax=ax)
ax.set_xlabel('Number of timesteps')
ax.set_ylabel('Number of Breath IDs')
ax.set_title('Train Set');  # trailing ';' keeps the Text(...) repr out of the cell output

In [None]:
# Count the rows recorded for each breath in the test set, then plot how those
# per-breath counts are distributed (one unit-width bar per distinct count).
test_steps_per_breath = (
    df_test.groupby('breath_id', as_index=False)['id']
    .count()
    .rename(columns={'id': 'n_timesteps'})  # the column holds a count, not an id
)

fig, ax = plt.subplots()
sns.histplot(data=test_steps_per_breath, x='n_timesteps', discrete=True, ax=ax)
ax.set_xlabel('Number of timesteps')
ax.set_ylabel('Number of Breath IDs')
ax.set_title('Test Set');  # trailing ';' keeps the Text(...) repr out of the cell output

## Train vs Test Feature Distribution

### R Distribution
> **R** - lung attribute indicating **how restricted the airway** is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.

In [None]:
# R histograms (percent of rows) for train and test, side by side.
# sharey=True puts both panels on one percent scale so bar heights can be compared
# directly; each panel is titled so the data set is identifiable without the colour code.
fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
panels = [('Train', df_train, 'green'), ('Test', df_test, 'blue')]
for panel, (title, frame, color) in zip(ax, panels):
    sns.histplot(data=frame, x='R', ax=panel, color=color, stat='percent')
    panel.set_title(title)
fig.suptitle('R Distribution')
fig.tight_layout()

### C Distribution

> **C** - lung attribute indicating **how compliant the lung** is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and being easier to blow.

In [None]:
# C histograms (percent of rows) for train and test, side by side.
# sharey=True puts both panels on one percent scale so bar heights can be compared
# directly; each panel is titled so the data set is identifiable without the colour code.
fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
panels = [('Train', df_train, 'green'), ('Test', df_test, 'blue')]
for panel, (title, frame, color) in zip(ax, panels):
    sns.histplot(data=frame, x='C', ax=panel, color=color, stat='percent')
    panel.set_title(title)
fig.suptitle('C Distribution')
fig.tight_layout()

### u_in Distribution
> **u_in** - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.

In [None]:
# u_in histograms (percent of rows, bins of width 10) for train and test, side by side.
# Each panel is titled so the data set is identifiable without relying on the colour code.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
panels = [('Train', df_train, 'green'), ('Test', df_test, 'blue')]
for panel, (title, frame, color) in zip(ax, panels):
    sns.histplot(data=frame, x='u_in', ax=panel, color=color, stat='percent', binwidth=10)
    panel.set_title(title)
    panel.set_ylim(0, 100)  # same fixed 0-100 % scale on both panels
fig.suptitle('u_in Distribution')
fig.tight_layout()

### u_out Distribution
> **u_out** - the control input for the expiratory solenoid valve. Either 0 or 1.

In [None]:
# u_out histograms (percent of rows) for train and test, side by side.
# u_out is a 0/1 flag, so discrete=True is used to get one unit-width bar centred on
# each value instead of bins chosen as if the variable were continuous.
# sharey=True puts both panels on one percent scale; titles identify the data set.
fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
panels = [('Train', df_train, 'green'), ('Test', df_test, 'blue')]
for panel, (title, frame, color) in zip(ax, panels):
    sns.histplot(data=frame, x='u_out', ax=panel, color=color, stat='percent', discrete=True)
    panel.set_xticks([0, 1])
    panel.set_title(title)
fig.suptitle('u_out Distribution')
fig.tight_layout()

### time_step Distribution
> **time_step** - the actual time stamp.

In [None]:
# time_step histograms (percent of rows, bins of width 0.1) for train and test, side by side.
# sharey=True puts both panels on one percent scale so bar heights can be compared
# directly; each panel is titled so the data set is identifiable without the colour code.
fig, ax = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
panels = [('Train', df_train, 'green'), ('Test', df_test, 'blue')]
for panel, (title, frame, color) in zip(ax, panels):
    sns.histplot(data=frame, x='time_step', ax=panel, color=color, stat='percent', binwidth=0.1)
    panel.set_title(title)
fig.suptitle('time_step Distribution')
fig.tight_layout()

## Pressure Distribution
**This is the target variable**
> **pressure** - the airway pressure measured in the respiratory circuit, measured in cmH2O.

In [None]:
# Histogram of the target (pressure) over the training rows: percent of rows per bin,
# with bins of width 5.
hist_options = dict(x='pressure', color='green', stat='percent', binwidth=5)
fig, ax = plt.subplots(figsize=(8, 5))
ax = sns.histplot(data=df_train, ax=ax, **hist_options)
fig.suptitle('Pressure Distribution')
fig.tight_layout()

#### R & C vs Pressure

In [None]:
# Mean training pressure for every (R, C) combination: R on the rows, C on the columns,
# rendered as a heatmap with square cells.
_data = df_train.pivot_table(values='pressure', index='R', columns='C', aggfunc='mean')
sns.heatmap(_data, square=True)

#### u_in vs Pressure

In [None]:
# Scatter of pressure against u_in over the training rows.
# Every row of df_train is drawn, so markers are kept small, translucent and without
# an outline; otherwise dense regions collapse into a solid blob.
fig, ax = plt.subplots()
ax = sns.scatterplot(
    data=df_train,
    x='u_in',
    y='pressure',
    ax=ax,
    s=2,
    alpha=0.1,
    linewidth=0,
)
ax.set_title('u_in vs Pressure (Train Set)');

#### u_out vs Pressure

In [None]:
# Box plot of training pressure, one box per u_out value.
box_options = dict(x='u_out', y='pressure')
fig, ax = plt.subplots()
ax = sns.boxplot(data=df_train, ax=ax, **box_options)

In [None]:
# Mean training pressure at each within-breath position `seq`.
# The mean is computed with pandas first, so lineplot receives one row per seq value.
# Given the raw rows it would aggregate them itself and bootstrap a confidence band
# for every x value, which is costly on the full training frame. The plotted line is
# the same per-seq mean; only the confidence band is no longer drawn.
mean_pressure_by_seq = df_train.groupby('seq', as_index=False)['pressure'].mean()

fig, ax = plt.subplots()
ax = sns.lineplot(data=mean_pressure_by_seq, x='seq', y='pressure', ax=ax)
ax.set_ylabel('Mean pressure')
ax.set_title('Mean Pressure by Position Within Breath (Train Set)');

# Dummy Baseline

Predict, for every test row, the mean (or median) training pressure observed at the same within-breath position (`seq`).

In [None]:
# Per-position baselines: the mean and the median training pressure for each `seq`
# value, each returned as a two-column frame (seq, pressure).
pressure_by_seq = df_train.groupby('seq', as_index=False)['pressure']
df_mean = pressure_by_seq.mean()
df_median = pressure_by_seq.median()

In [None]:
# Attach the per-seq baseline pressure to every test row (left join on `seq`) and keep
# only the two submission columns.
submission_columns = ['id', 'pressure']
df_sub_mean = df_test.merge(df_mean, how='left', on='seq')[submission_columns]
df_sub_median = df_test.merge(df_median, how='left', on='seq')[submission_columns]

# Write one submission file per baseline, without the index column.
for submission, path in (
    (df_sub_mean, 'submission_mean.csv'),
    (df_sub_median, 'submission_median.csv'),
):
    submission.to_csv(path, index=False)