In [None]:
import os
from glob import glob
import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.metrics import mean_absolute_error

In [None]:
# Read the train and test tables from the ventilator-pressure-prediction input directory.
train = pd.read_csv(r"../input/ventilator-pressure-prediction/train.csv")
test = pd.read_csv(r"../input/ventilator-pressure-prediction/test.csv")

In [None]:
# How often each R value and each C value occurs, separated by a blank line.
display(train.R.value_counts())
print("")
display(train.C.value_counts())

In [None]:
# Compute the correlation matrix once (it scans the whole training frame)
# and show its column and row labels.
corr_preview = train.corr()
corr_preview.columns, corr_preview.index

In [None]:
# Heatmap of the pairwise correlations between the columns of `train`.
corr = train.corr()
cols = corr.columns

# Remove the diagonal (self-correlation of 1 would dominate the colour scale).
# Work on an explicit writable copy: `corr.values` may be a read-only array
# (e.g. with pandas Copy-on-Write), in which case filling it in place fails.
corr_values = corr.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
corr = pd.DataFrame(corr_values, index=corr.index, columns=cols)

hm = plt.imshow(corr)
plt.colorbar(hm)
plt.xticks(ticks=np.arange(len(cols)), labels=cols, rotation=90)
plt.yticks(ticks=np.arange(len(cols)), labels=cols)
plt.grid(True)

Exclude the ID columns. Hmmm, analyzing this graph, I see weak negative (?) correlation between R and C. Everything seems to be centered around the bottom right corner. The time_step feature is somehow highly correlated with the u_out feature. This also seems to be the case with time_step and u_in and pressure in fact (negative correlation: as time progresses pressure decreases; that is a given). C and u_in have a weak positive correlation. u_in and u_out unsurprisingly are negatively correlated. u_in and u_out also correlate with pressure but they are on opposite sides of the spectrum. While u_in correlates to pressure (with medium strength), u_out correlates with pressure with stronger strength (but negatively instead of positively as with u_in). 

In [None]:
# Box plot of the pressure column, with one box per value of R.
train.boxplot(column="pressure", by="R")

In [None]:
def _subset_by_rc(r_value, c_value):
    """Return the rows of `train` whose R and C equal the given values."""
    return train[(train.R == r_value) & (train.C == c_value)]


# C == 10
train_r50_c10 = _subset_by_rc(50, 10)
train_r5_c10 = _subset_by_rc(5, 10)
train_r20_c10 = _subset_by_rc(20, 10)
print(train_r50_c10.shape, train_r5_c10.shape, train_r20_c10.shape)

# C == 50
train_r50_c50 = _subset_by_rc(50, 50)
train_r5_c50 = _subset_by_rc(5, 50)
train_r20_c50 = _subset_by_rc(20, 50)
print(train_r50_c50.shape, train_r5_c50.shape, train_r20_c50.shape)

# C == 20
train_r50_c20 = _subset_by_rc(50, 20)
train_r5_c20 = _subset_by_rc(5, 20)
train_r20_c20 = _subset_by_rc(20, 20)
print(train_r50_c20.shape, train_r5_c20.shape, train_r20_c20.shape)

Hmm, looks like they are mostly even. The R=50/C=10 combination is by far the most common. R=20/C=10 and R=20/C=20 seem to be the least common.

In [None]:
# Mean pressure over all rows of each subset: one output line per C value
# (10, 50, 20), with the R = 50, 5, 20 subsets side by side on each line.
for subsets in (
    (train_r50_c10, train_r5_c10, train_r20_c10),
    (train_r50_c50, train_r5_c50, train_r20_c50),
    (train_r50_c20, train_r5_c20, train_r20_c20),
):
    print(*(subset.pressure.mean() for subset in subsets))

In [None]:
# Mean pressure restricted to rows with u_out == 0, one value per line.
# Groups are ordered C = 10, 50, 20 (each R = 50, 5, 20) and separated by a blank line.
rc_groups = (
    (train_r50_c10, train_r5_c10, train_r20_c10),
    (train_r50_c50, train_r5_c50, train_r20_c50),
    (train_r50_c20, train_r5_c20, train_r20_c20),
)

for group_idx, subsets in enumerate(rc_groups):
    if group_idx > 0:
        print("")
    for subset in subsets:
        print(subset[subset.u_out == 0].pressure.mean())

In [None]:
# Overlay pressure vs. time_step for the first 11 breaths (cnt runs 0..10)
# of the R=50, C=10 subset.
for cnt, (_, df) in enumerate(train_r50_c10.groupby("breath_id")):
    plt.plot(df.time_step, df.pressure)
    if cnt == 10:
        break

In [None]:
# Same overlay for the R=50, C=50 subset, drawn on its own axes
# (first 11 breaths, cnt runs 0..10).
fig, ax = plt.subplots()
for cnt, (_, df) in enumerate(train_r50_c50.groupby("breath_id")):
    ax.plot(df.time_step, df.pressure)
    if cnt == 10:
        break

In [None]:
# Overlay for the R=50, C=20 subset with each curve labelled by its breath_id
# (first 11 breaths, cnt runs 0..10).
fig, ax = plt.subplots()
for cnt, (breath_id, df) in enumerate(train_r50_c20.groupby("breath_id")):
    ax.plot(df.time_step, df.pressure, label=breath_id)
    if cnt == 10:
        break

# Build the legend once, after all curves exist, instead of on every iteration.
ax.legend()

Okay, before I just plot like 7 more of these, let's take a look at some of these graphs. Most of them start from the bottom and jump up and then begin to oscillate. There are some graphs that are smooth however. These need further investigation.

EDIT: I added labels to the above graph.

Breath_id 87.

In [None]:
# Select breath 87 once and plot its pressure curve
# (the original filtered the full training frame twice).
bid_87 = train[train.breath_id == 87]
plt.plot(bid_87.time_step, bid_87.pressure)

I wonder why this breath id curves? Could it be due to a previous breath id? What about its features?

In [None]:
# First 90 entries of the distinct breath_id values.
train.breath_id.unique()[:90]  # There is no breath_id 86.
# I wonder why the breath_ids are not consecutive.

In [None]:
# All training rows belonging to breath_id 87.
bid_87 = train[train.breath_id == 87]

In [None]:
bid_87

In [None]:
# Heatmap of the pairwise correlations between the columns of breath 87 only.
bid_87_corr = bid_87.corr()
cols = bid_87_corr.columns

# Remove the diagonal on an explicit writable copy: `.values` may be a
# read-only array (e.g. with pandas Copy-on-Write), so filling it in place can fail.
bid_87_corr_values = bid_87_corr.to_numpy(copy=True)
np.fill_diagonal(bid_87_corr_values, 0)
bid_87_corr = pd.DataFrame(bid_87_corr_values, index=bid_87_corr.index, columns=cols)

hm = plt.imshow(bid_87_corr)
plt.colorbar(hm)
plt.xticks(ticks=np.arange(len(cols)), labels=cols, rotation=90)
plt.yticks(ticks=np.arange(len(cols)), labels=cols)
plt.grid(True)

Looks much like the first heatmap except the correlations are much stronger. 

# Let's Take a Look At Where Our Model is Falling Short

In [None]:
NUM_FOLDS = 10  # Number of folds iterated over in the per-fold error analysis below.
seed = 2021  # Random seed; referenced by the (commented-out) KFold split below.

In [None]:
# # models = []
# # for p in glob(r"../input/gvent-10-fold-ft-bilstm-models/*"):
# #     models.append(keras.models.load_model(p))

# def add_features(df):
#     df['area'] = df['time_step'] * df['u_in']
#     df['area'] = df.groupby('breath_id')['area'].cumsum()
    
#     df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()
    
#     df['u_in_lag1'] = df.groupby('breath_id')['u_in'].shift(1)
#     df['u_out_lag1'] = df.groupby('breath_id')['u_out'].shift(1)
#     df['u_in_lag_back1'] = df.groupby('breath_id')['u_in'].shift(-1)
#     df['u_out_lag_back1'] = df.groupby('breath_id')['u_out'].shift(-1)
#     df['u_in_lag2'] = df.groupby('breath_id')['u_in'].shift(2)
#     df['u_out_lag2'] = df.groupby('breath_id')['u_out'].shift(2)
#     df['u_in_lag_back2'] = df.groupby('breath_id')['u_in'].shift(-2)
#     df['u_out_lag_back2'] = df.groupby('breath_id')['u_out'].shift(-2)
#     df['u_in_lag3'] = df.groupby('breath_id')['u_in'].shift(3)
#     df['u_out_lag3'] = df.groupby('breath_id')['u_out'].shift(3)
#     df['u_in_lag_back3'] = df.groupby('breath_id')['u_in'].shift(-3)
#     df['u_out_lag_back3'] = df.groupby('breath_id')['u_out'].shift(-3)
#     df['u_in_lag4'] = df.groupby('breath_id')['u_in'].shift(4)
#     df['u_out_lag4'] = df.groupby('breath_id')['u_out'].shift(4)
#     df['u_in_lag_back4'] = df.groupby('breath_id')['u_in'].shift(-4)
#     df['u_out_lag_back4'] = df.groupby('breath_id')['u_out'].shift(-4)
#     df = df.fillna(0)
    
#     df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
#     df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')
    
#     df['u_in_diff1'] = df['u_in'] - df['u_in_lag1']
#     df['u_out_diff1'] = df['u_out'] - df['u_out_lag1']
#     df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
#     df['u_out_diff2'] = df['u_out'] - df['u_out_lag2']
    
#     df['breath_id__u_in__diffmax'] = df.groupby(['breath_id'])['u_in'].transform('max') - df['u_in']
#     df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']
    
#     df['breath_id__u_in__diffmax'] = df.groupby(['breath_id'])['u_in'].transform('max') - df['u_in']
#     df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']
    
#     df['u_in_diff3'] = df['u_in'] - df['u_in_lag3']
#     df['u_out_diff3'] = df['u_out'] - df['u_out_lag3']
#     df['u_in_diff4'] = df['u_in'] - df['u_in_lag4']
#     df['u_out_diff4'] = df['u_out'] - df['u_out_lag4']
#     df['cross']= df['u_in']*df['u_out']
#     df['cross2']= df['time_step']*df['u_out']
    
#     df['R'] = df['R'].astype(str)
#     df['C'] = df['C'].astype(str)
#     df['R__C'] = df["R"].astype(str) + '__' + df["C"].astype(str)
#     df = pd.get_dummies(df)
#     return df

# train_tfm = add_features(train)
# test_tfm = add_features(test)
# targets = train_tfm[['pressure']].to_numpy().reshape(-1, 80)
# train_ids = train_tfm["breath_id"]
# train_time_step = train_tfm["time_step"]
# train_u_out = train_tfm["u_out"]
# train_tfm.drop(['pressure', 'id', 'breath_id'], axis=1, inplace=True)
# test_ids = test_tfm["breath_id"]
# test_time_step = test_tfm["time_step"]
# test_u_out = test_tfm["u_out"]
# test_tfm = test_tfm.drop(['id', 'breath_id'], axis=1)

# RS = RobustScaler()
# train_tfm = RS.fit_transform(train_tfm)
# test_tfm = RS.transform(test_tfm)

# train_tfm = train_tfm.reshape(-1, 80, train_tfm.shape[-1])
# test_tfm = test_tfm.reshape(-1, 80, train_tfm.shape[-1])

# train_ids_reshaped = train_ids.values.reshape(-1, 80)
# test_ids_reshaped = test_ids.values.reshape(-1, 80)

# train_time_step_reshaped = train_time_step.values.reshape(-1, 80)
# test_time_step_reshaped = test_time_step.values.reshape(-1, 80)

# train_u_out_reshaped = train_u_out.values.reshape(-1, 80)
# test_u_out_reshaped = test_u_out.values.reshape(-1, 80)

In [None]:
# kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed)
# test_preds = []

# # model = models[0]  # Testing out just fold 0. 

# for fold, (train_idx, test_idx) in enumerate(kf.split(train_tfm, targets)):
#     if fold == 0: continue
#     if fold == 1: continue
#     if fold == 2: continue
#     if fold == 3: continue
#     if fold == 4: continue
#     if fold == 5: continue
#     if fold == 6: continue
#     if fold == 7: continue
#     if fold == 8: continue
    
#     X_train_time_steps, X_valid_time_steps = train_time_step_reshaped[train_idx], train_time_step_reshaped[test_idx]
#     X_train_u_out, X_valid_u_out = train_u_out_reshaped[train_idx], train_u_out_reshaped[test_idx]
    
#     # Predicting on entire fold 0 valid.  
#     model = keras.models.load_model(f"../input/gvent-10-fold-ft-bilstm-models/folds{fold}.hdf5")
    
#     X_train_ids, X_valid_ids = train_ids_reshaped[train_idx], train_ids_reshaped[test_idx]
    
#     X_train, X_valid = train_tfm[train_idx], train_tfm[test_idx]
#     y_train, y_valid = targets[train_idx], targets[test_idx]

#     test_preds.append(model.predict(X_valid).squeeze().reshape(-1, 1).squeeze())
    
# #     np.save(f"FT_bilstm_fold{fold}_preds.npy", test_preds[0])
# #     np.save(f"FT_bilstm_fold{fold}_true.npy", y_valid)
# #     np.save(f"FT_bilstm_fold{fold}_ids.npy", X_valid_ids)
# #     np.save(f"FT_bilstm_fold{fold}_time_steps.npy", X_valid_time_steps)
# #     np.save(f"FT_bilstm_fold{fold}_u_out.npy", X_valid_u_out)
    
#     err_df = pd.DataFrame({
#         "ids": X_valid_ids.flatten(),
#         "time_steps": X_valid_time_steps.flatten(),
#         "u_out": X_valid_u_out.flatten(),
#         "preds": test_preds[0],
#         "true": y_valid.flatten(),
#     })
#     err_df.to_csv(f"fold{fold}_err.csv", index=False)
    
#     del model
#     break

In [None]:
# Load the saved validation-prediction table of every fold, keyed by fold number.
# The cells below read the columns ids, time_steps, u_out, preds and true.
# Uses NUM_FOLDS (instead of a hard-coded 10) so this stays in sync with the
# per-fold loops further down.
fold_err = {
    fold: pd.read_csv(f"../input/ft-bilstm-model0-fold0-error-analysis/fold{fold}_err.csv")
    for fold in range(NUM_FOLDS)
}

In [None]:
# Predicted vs. true pressure for breath 9 in the fold 0 validation table.
fold0_err_bid_9 = fold_err[0][fold_err[0].ids == 9]

plt.plot(fold0_err_bid_9.time_steps, fold0_err_bid_9.preds, label="preds")
plt.plot(fold0_err_bid_9.time_steps, fold0_err_bid_9.true, label="true")

# np.argmax returns a *position* within the breath, while the filtered frame
# keeps its original row labels, so the time step must be looked up
# positionally (.iloc); plain [] would do a label lookup and hit the wrong row
# or raise KeyError.
first_u_out_pos = np.argmax(fold0_err_bid_9.u_out.to_numpy())
plt.axvline(fold0_err_bid_9.time_steps.iloc[first_u_out_pos], linestyle="--")
plt.legend()
plt.show()

In [None]:
def ventilation_mae_loss(y_true, y_pred, u_out):
    """Mean absolute error over the steps where ``u_out == 0``.

    Each step is weighted by ``1 - u_out``, so steps with ``u_out == 1`` do not
    contribute. The reduction runs over the last axis, so the inputs may be a
    single breath (1-D) or a batch of breaths (2-D, one breath per row).

    Parameters
    ----------
    y_true, y_pred : array-like
        Target and predicted values, same shape.
    u_out : array-like
        0/1 indicator with the same shape; steps with 0 are scored.

    Returns
    -------
    numpy scalar or ndarray
        The weighted MAE per breath. A breath with no scored step (all
        ``u_out == 1``) yields NaN.
    """
    # Accept plain sequences as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    w = 1 - np.asarray(u_out)

    weighted_abs_err = w * np.absolute(y_true - y_pred)

    # 0/0 for a breath without scored steps is reported as NaN; silence the
    # floating-point warning that the division would otherwise emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        mae = weighted_abs_err.sum(-1) / w.sum(-1)

    return mae

In [None]:
# seed = 2021  # We already initialized this.
n = 25  # Number of graphs. Changing this number requires changing the amount of subplots.

def plot_fold_errors(fold, seed=None, shuffle=True):
    """Plot true vs. predicted pressure curves for up to ``n`` breaths of one fold.

    Parameters
    ----------
    fold : pandas.DataFrame
        Error table with the columns ``true``, ``ids``, ``preds``,
        ``time_steps`` and ``u_out``; rows are reshaped into breaths of 80 steps.
    seed : int or None
        If not None, seeds NumPy's global RNG before shuffling so the same
        breaths are drawn every time. ``seed=0`` is honoured.
    shuffle : bool
        If True, plot a random selection of breaths; otherwise the first ones.

    Notes
    -----
    The number of plots is taken from the module-level ``n`` and capped at the
    number of breaths available. Each title shows the breath id and the MAE from
    ``ventilation_mae_loss``; the dashed vertical line marks the time step of the
    first maximum of ``u_out``.
    """
    true = fold.true.values.reshape(-1, 80)
    ids = fold.ids.values.reshape(-1, 80)[:, 0]
    preds = fold.preds.values.reshape(-1, 80)
    time_steps = fold.time_steps.values.reshape(-1, 80)
    u_out = fold.u_out.values.reshape(-1, 80)

    if shuffle:
        if seed is not None:
            np.random.seed(seed)  # Seed doesn't matter here unless you want the same n graphs.

        # Permute over the actual number of reshaped rows so the index can
        # never be out of range or silently skip breaths.
        permutation = np.random.permutation(true.shape[0])

        true = true[permutation]
        ids = ids[permutation]
        preds = preds[permutation]
        time_steps = time_steps[permutation]
        u_out = u_out[permutation]

    # Never ask for more plots than there are breaths.
    n_plots = min(n, true.shape[0])
    # Keep the 5 x 5 layout for up to 25 plots; add rows beyond that.
    n_cols = 5
    n_rows = max(5, -(-n_plots // n_cols))

    plt.figure(figsize=(20, 20))
    for row in range(n_plots):
        plt.subplot(n_rows, n_cols, row + 1)
        plt.plot(time_steps[row], true[row], label="y_valid")
        plt.plot(time_steps[row], preds[row], label="y_pred")

        # First position where u_out reaches its maximum.
        vline = time_steps[row][np.argmax(u_out[row])]
        plt.axvline(vline, linestyle="--")

        mae = ventilation_mae_loss(true[row], preds[row], u_out[row])

        plt.title(f"b_id: {ids[row]} | mae: {mae:.4f}")
        plt.legend()

    plt.show()
    
plot_fold_errors(fold_err[0])

For model 0/fold 0, the model is performing quite well on the validation fold (I'd presume it does about as well on the other 9 folds/models). Remember this is simply a snapshot of 25 breath_ids from the fold 0 validation data with model 0's predictions. The graphs match so well that I could barely tell there was even a blue line to begin with! For most out-of-fold predictions (validation predictions for fold 0), it predicts pretty well, with < 0.1 MAE on u_out==0. Well, to be specific, I simply set a range of 80 instead of the actual time step (which might distort how the graph is shaped but won't affect how the MAE is calculated). However, there might be a slight problem with how I chose to calculate the MAE: I chose the first 30 time steps instead of all the time steps right before u_out==1. To fix this, we can save the time steps and the u_out to get a more accurate representation of the errors.

EDIT: Because the time_steps and u_out features are relevant to the accurate calculation of the MAE, I'll look towards including them. But first, I definitely want to get at least a *rough* estimate of where the MAE is scoring high. 

Hmmm, let's take a look with the seed so we can at least get a reference set of 25 pressure curves to get a feel of what a high MAE and a low MAE looks like. Breath_id 11546 got a high MAE (0.134) for some reason though it looks accurate visually. The other graphs perform extremely well with MAEs around 0.06. 

Anyways, let's compile a list of the best performing breath_ids and the worst performing breath_ids. 

LATEST EDIT: I just set everything up so now you can interactively plot a random assortment of 25 pressure curves for any of the 10 folds. I'll now go through all 10 folds and look at graphs with the highest MAE!

In [None]:
# NUM_FOLDS = 10  # We already initialized it.

def get_highest_maes(fold, threshold=0.16):
    """Return the breaths of one fold whose MAE exceeds ``threshold``.

    Parameters
    ----------
    fold : pandas.DataFrame
        Error table with the columns ``true``, ``ids``, ``preds`` and ``u_out``;
        rows are reshaped into breaths of 80 steps.
    threshold : float
        Only breaths with an MAE strictly greater than this are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``ids`` (breath id, taken from the first step of each breath)
        and ``mae`` (value from ``ventilation_mae_loss`` for that breath).
    """
    true = fold.true.values.reshape(-1, 80)
    ids = fold.ids.values.reshape(-1, 80)[:, 0]
    preds = fold.preds.values.reshape(-1, 80)
    u_out = fold.u_out.values.reshape(-1, 80)

    # One MAE per breath (row).
    fold_maes = np.array([
        ventilation_mae_loss(true[row], preds[row], u_out[row])
        for row in range(true.shape[0])
    ])

    above_threshold = fold_maes > threshold

    return pd.DataFrame({
        "ids": ids[above_threshold],
        "mae": fold_maes[above_threshold],
    })

# Maps each fold number to the DataFrame (ids, mae) returned by
# get_highest_maes for that fold's error table.
all_fold_maes = {
    fold: get_highest_maes(fold_err[fold]) for fold in range(NUM_FOLDS)
}

In [None]:
# One histogram of the above-threshold MAEs per fold, laid out on a 2 x 5 grid.
plt.figure(figsize=(30, 20))

for fold_idx in range(NUM_FOLDS):
    plt.subplot(2, 5, fold_idx + 1)
    plt.hist(all_fold_maes[fold_idx].mae.values)
    plt.title(f"fold{fold_idx}")
    plt.xlabel("MAE")
    plt.ylabel("Frequency")

plt.show()

Looks like for all 10 folds and for breath_ids that score an MAE higher than 0.16, it seems like most of them are simply slightly over 0.16 (or at least between 0 and 1). These are all heavily skewed right distributions (THANKFULLY!) but there are some noticeable outliers here and there. For instance, fold3 has an outlier sitting at 10 MAE!

In [None]:
def get_outlier_maes(fold):
    """Keep the rows whose ``mae`` lies above the upper IQR fence.

    The fence is ``Q3 + 1.5 * (Q3 - Q1)`` of the ``mae`` column. Only the upper
    end is checked because the distribution is skewed right. The result is
    re-indexed from 0.
    """
    q1, q3 = fold["mae"].quantile([0.25, 0.75])
    upper_fence = q3 + (1.5 * (q3 - q1))

    is_outlier = fold["mae"].values > upper_fence
    return fold[is_outlier].reset_index(drop=True)

In [None]:
# Maps each fold number to the rows of all_fold_maes[fold] that
# get_outlier_maes keeps.
outlier_maes = {
    fold: get_outlier_maes(all_fold_maes[fold]) for fold in range(NUM_FOLDS)
}

In [None]:
# Most of these folds have roughly 200 breath_ids where the performance
# is considered as an outlier performance!
for fold_idx in range(NUM_FOLDS):
    print(outlier_maes[fold_idx].shape)

Now that's enough, let's plot these outliers!

Note: So we can't plot all the outliers but we can plot some of them. These outliers are at the tip top of the pyramid and are considered greater than 1.5 IQRs from the upper quartile. Let's do some further preprocessing to select the *utmost* outlandish breath_ids. I made all these intermediate steps in the case anyone (including me) wants to toy with the other values. 

In [None]:
outlier_maes[0]

In [None]:
top_k = 25  # Selecting the top 25 most outlandish/largest breath_id MAEs out of all the outliers for that fold. 

def plot_outlier_maes(outlier_maes, fold_err, fold, top_k):
    """Plot true vs. predicted pressure for the ``top_k`` worst outlier breaths of a fold.

    Parameters
    ----------
    outlier_maes : mapping
        Fold number -> DataFrame with the columns ``ids`` and ``mae``.
    fold_err : mapping
        Fold number -> error table with the columns ``ids``, ``preds``,
        ``true``, ``time_steps`` and ``u_out``.
    fold : hashable
        Key of the fold to plot.
    top_k : int
        Number of breaths with the largest MAE to plot.

    Notes
    -----
    Breaths are drawn in the order produced by ``groupby("ids")``, not by
    descending MAE. The dashed vertical line marks the time step of the first
    maximum of ``u_out``. The grid is 5 x 5 for up to 25 breaths and gains rows
    beyond that, so ``top_k > 25`` no longer fails.
    """
    top_k_outliers = outlier_maes[fold].sort_values(ascending=False, by="mae").iloc[:top_k]
    fold_df = fold_err[fold]
    selected = fold_df[fold_df.ids.isin(top_k_outliers.ids.values)]

    # Keep the 5 x 5 layout for up to 25 plots; add rows beyond that.
    n_cols = 5
    n_rows = max(5, -(-selected.ids.nunique() // n_cols))

    plt.figure(figsize=(20, 20))
    for idx, (breath_id, breath_df) in enumerate(selected.groupby("ids")):
        preds = breath_df.preds.values
        true = breath_df.true.values
        time_steps = breath_df.time_steps.values
        u_out = breath_df.u_out.values

        plt.subplot(n_rows, n_cols, idx + 1)
        plt.plot(time_steps, true, label="y_valid")
        plt.plot(time_steps, preds, label="y_pred")

        # u_out and time_steps are plain arrays here, so positional indexing is correct.
        vline = time_steps[np.argmax(u_out)]
        plt.axvline(vline, linestyle="--")

        mae = top_k_outliers[top_k_outliers.ids == breath_id]["mae"].values[0]
        plt.title(f"b_id: {breath_id} | mae: {mae:.4f}")
        plt.legend()

    plt.show()

In [None]:
plot_outlier_maes(outlier_maes, fold_err, fold=2, top_k=top_k)

Okay, this is very interesting! If my code doesn't have a bug that is somehow offsetting this, then it seems very apparent that most of these outlier predictions go wrong at the very beginning of the breath, during the initial pressure climb.

To make sure we aren't messing up the code, I'll do a quick sanity check. 

In [None]:
# Plot the raw training pressure of breath 112036 (filter the frame once).
breath_112036 = train[train.breath_id == 112036]
plt.plot(breath_112036.time_step, breath_112036.pressure)

# Okay, the y_valid graph makes sense.

In [None]:
# Predicted and true pressure of breath 112036 from the fold 1 error table
# (filter once instead of four times).
# NOTE(review): the outlier grid above was drawn with fold=2 while this reads
# fold 1; confirm that breath 112036 is in fold 1's table.
fold1_breath_112036 = fold_err[1][fold_err[1].ids == 112036]
plt.plot(fold1_breath_112036.time_steps, fold1_breath_112036.preds)
plt.plot(fold1_breath_112036.time_steps, fold1_breath_112036.true)

Well, assuming the preds are accurate (which I assume them to be), this does seem like an actual issue! 😬

Potential Fixes:

1. Maybe we can try a model for the beginning part of the breath_id?
2. Hmm, not sure what we can do to estimate the beginning climb. There isn't much to go on when the model has only seen one point.