## 0. Introduction
- Updated 10/13/21
- This notebook is a baseline using a Keras LSTM model.
- Finished through to submission

### 0-1. Library

In [None]:
import os
import gc
import glob
import time
import random
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import ExponentialDecay

### 0-2. Debug

In [None]:
# For debug
class Config:
    """Notebook-wide settings.

    Attributes:
        config: debug switch; any truthy value makes the readers keep only a
            random subset of breaths.
        debug_size: number of breaths kept when the debug switch is on.
        data_dir: directory holding the competition CSV files.
        post_processing: constants used to snap and clip predictions when the
            submission is built (minimum, maximum and step of the pressure).
    """

    def __init__(self):
        self.config = 0
        self.debug_size = 754
        self.data_dir = '../input/ventilator-pressure-prediction/'
        self.post_processing = {
            'max_pressure': 64.82099173863948,
            'min_pressure': -1.8957442945646408,
            'diff_pressure': 0.07030215,
        }

    @property
    def config_size(self):
        """Alias of ``debug_size``.

        The data readers in this notebook look up ``config.config_size``;
        exposing it here keeps them from failing with AttributeError.
        """
        return self.debug_size


config = Config()

## 1 EDA & Preprocessing
### 1-1. Train & Test data

In [None]:
# Compact column dtypes applied after loading, to keep the frames small.
dtypes = dict(
    id='int32',
    breath_id='int32',
    R='int8',
    C='int8',
    time_step='float64',
    u_in='float64',
    u_out='int8',
    pressure='float64',
)

# Read train CSV data
def read_train():
    """Load ``train.csv`` from ``config.data_dir`` and cast it to ``dtypes``.

    When ``config.config`` is truthy (debug mode) only ``config.debug_size``
    randomly chosen breaths are kept. The draw is seeded with 2021. Rows are
    returned breath by breath in the sampled order.

    Returns:
        pandas.DataFrame with the columns cast according to ``dtypes``.
    """
    train = pd.read_csv(config.data_dir + 'train.csv')
    # Select random breath_id for debug
    if config.config:
        random.seed(2021)
        # random.sample() requires a sequence; passing a set was deprecated in
        # Python 3.9 and raises TypeError from 3.11 on.
        breath_ids = train['breath_id'].unique().tolist()
        lst_train = random.sample(breath_ids, config.debug_size)
        # Group once and pick the sampled breaths, instead of rescanning the
        # whole frame and re-concatenating for every single breath.
        by_breath = train.groupby('breath_id', sort=False)
        train = pd.concat([by_breath.get_group(i) for i in lst_train], axis=0)
    train = train.astype(dtypes)
    return train

# Read test CSV data
def read_test():
    """Load ``test.csv`` from ``config.data_dir`` and cast it to ``dtypes``.

    When ``config.config`` is truthy (debug mode) only ``config.debug_size``
    randomly chosen breaths are kept. The draw is seeded with 2021. Rows are
    returned breath by breath in the sampled order.

    Only the ``dtypes`` entries whose column is present in the file are
    applied, so a ``pressure`` entry in ``dtypes`` does not break the cast
    when the file has no such column.

    Returns:
        pandas.DataFrame with the columns cast according to ``dtypes``.
    """
    test = pd.read_csv(config.data_dir + 'test.csv')
    # Select random breath_id for debug
    if config.config:
        random.seed(2021)
        # random.sample() requires a sequence; passing a set was deprecated in
        # Python 3.9 and raises TypeError from 3.11 on.
        breath_ids = test['breath_id'].unique().tolist()
        lst_test = random.sample(breath_ids, config.debug_size)
        # Group once and pick the sampled breaths, instead of rescanning the
        # whole frame and re-concatenating for every single breath.
        by_breath = test.groupby('breath_id', sort=False)
        test = pd.concat([by_breath.get_group(i) for i in lst_test], axis=0)
    present_dtypes = {col: dtype for col, dtype in dtypes.items() if col in test.columns}
    test = test.astype(present_dtypes)
    return test

# Load the training frame (a sampled subset when config.config is truthy)
# and preview its first two rows.
train = read_train()   
train.head(2)

### 1-2. Exploratory Data Analysis
### Feature
- id - globally-unique time step identifier across an entire file
- breath_id - globally-unique identifier for breaths
- R - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.
- C - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.
- time_step - the actual time stamp.
- u_in - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.
- u_out - the control input for the expiratory solenoid valve. Either 0 or 1.
- pressure - the airway pressure measured in the respiratory circuit, measured in cmH2O.

In [None]:
## Summary statistics for every column except the first one (id)
train.iloc[:, 1:].describe(include='all').round(3)

### 1-3. Time series data(pressure/ u_in / u_out)
- from [https://www.kaggle.com/kaitohonda/beginner-lgbm](https://www.kaggle.com/kaitohonda/beginner-lgbm)

In [None]:
# Debug-only sanity plot: pressure / u_in (left axis) and u_out (right axis)
# over time for up to three randomly chosen breaths.
if config.config:
    fig, ax = plt.subplots(1, 3, figsize=(30, 6))
    sns.set(font_scale=1.2)
    # `lst_train` only exists inside read_train(), so take the candidate
    # breaths from the loaded frame instead.
    breath_ids = train['breath_id'].unique().tolist()
    for i, num in enumerate(random.sample(breath_ids, min(3, len(breath_ids)))):
        df = train[train['breath_id']==num]
        ax2 = ax[i].twinx()

        sns.lineplot(data=df, x='time_step', y='pressure', label='pressure', ax=ax[i])
        sns.lineplot(data=df, x='time_step', y='u_in', label='u_in', ax=ax[i])
        sns.lineplot(data=df, x='time_step', y='u_out', label='u_out', ax=ax2, color='r')

        ax[i].set(xlabel='Timestep', ylabel='pressure, u_in', title=f'breath_id: {num}', xlim=(-0.2, 3.2), ylim=(-5, 105))
        ax[i].legend(loc=(0.75, 0.7))
        ax2.legend(loc=(0.75, 0.6))
    plt.show()

### 1-4. Preprocessing

In [None]:
def log_exp_return(series):
    """Return the step-to-step ratio ``(1 + x[t]) / (1 + x[t-1])`` of *series*.

    It is computed as ``exp(diff(log1p(x)))``. The first element has no
    predecessor, so its difference is filled with 0 and the ratio becomes 1.
    """
    log_values = np.log1p(series)
    log_change = log_values.diff(1).fillna(0)
    return np.exp(log_change)

def preprocessing(df):
    """Build the per-time-step feature table used by the model.

    Features added, all computed within each ``breath_id``:
      * ``time_diff``: difference between consecutive ``time_step`` values.
      * ``u_in_ratio`` / ``area_ratio``: ``log_exp_return`` of ``u_in`` and of
        ``area_unit`` (= ``u_in * time_diff``).
      * per-breath std / product statistics over consecutive 0.1-wide
        ``time_step`` windows, broadcast to every row of the breath.
      * ``u_in`` lags and leads of 1..4 steps and the matching area terms.
      * first / last / cumulative sum of ``u_in`` and of ``area_unit``.
    Remaining NaNs are replaced by 0, and ``u_out``, ``R`` and ``C`` are
    one-hot encoded with the first level of each dropped.

    Args:
        df: frame containing at least ``breath_id``, ``time_step``, ``u_in``,
            ``u_out``, ``R`` and ``C``. New columns are written onto it before
            the first merge, so the caller's frame is modified.

    Returns:
        A new DataFrame with the engineered columns; ``breath_id`` is kept.
    """
    # time diff
    df['time_diff'] = df.groupby('breath_id')['time_step'].diff(1).fillna(0)

    # basic parameter
    # transform() always returns a result aligned with the input index.
    # groupby.apply() prepends the group key to the index on pandas >= 2.0,
    # which breaks the column assignment.
    df['u_in_ratio'] = df.groupby('breath_id')['u_in'].transform(log_exp_return)
    df['area_unit'] = df['u_in'] * df['time_diff']
    df['area_ratio'] = df.groupby('breath_id')['area_unit'].transform(log_exp_return)

    # Create Time Windows
    def create_time_window(df, time_min, time_max, diff_time):
        """Attach per-breath statistics of each ``diff_time``-wide window.

        For every ``time_stamp`` in ``arange(time_min, time_max, diff_time)``
        the rows with ``time_stamp - diff_time <= time_step < time_stamp`` are
        aggregated per breath and merged back onto every row of that breath.
        """
        # String names select pandas' own groupby reductions. Passing the
        # NumPy callables resolved to the same reductions but is deprecated.
        feature_dict = {
            'u_in': ['std'],
            'area_unit': ['std'],
            'u_in_ratio': ['prod', 'std'],
            'area_ratio': ['prod', 'std'],
        }
        window_columns = ['breath_id'] + list(feature_dict)
        for time_stamp in np.arange(time_min, time_max, diff_time):
            in_window = (df['time_step'] >= time_stamp - diff_time) & (df['time_step'] < time_stamp)
            df_tmp = df.loc[in_window, window_columns].groupby('breath_id').agg(feature_dict)
            # Flatten the (column, statistic) header into "column_statistic".
            df_tmp.columns = ['_'.join(col) for col in df_tmp.columns]
            # The left merge keeps every row; breaths with no sample in the
            # window get NaN here, which is filled with 0 further down.
            df = pd.merge(df, df_tmp.add_suffix(f'_{time_stamp}_term').reset_index(), on='breath_id', how='left')
            del df_tmp
            gc.collect()

        return df

    df = create_time_window(df, 0.1, 0.6, 0.1)

    # Lag (previous) and lead (next) values of u_in inside each breath;
    # positions shifted past the breath boundary are filled with 0.
    for i in range(1, 5):
        df[f'u_in_lag_fwrd{i}'] = df.groupby('breath_id')['u_in'].shift(i).fillna(0)
        df[f'u_in_lag_back{i}'] = df.groupby('breath_id')['u_in'].shift(-i).fillna(0)
        df[f'area_lag_fwrd{i}'] = df['time_diff'] * df[f'u_in_lag_fwrd{i}']
        df[f'area_lag_back{i}'] = df['time_diff'] * df[f'u_in_lag_back{i}']

    # u_in parameter
    df['last_value_u_in'] = df.groupby('breath_id')['u_in'].transform('last')
    df['first_value_u_in'] = df.groupby('breath_id')['u_in'].transform('first')
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()

    # u_in area
    df['last_value_area'] = df.groupby('breath_id')['area_unit'].transform('last')
    df['first_value_area'] = df.groupby('breath_id')['area_unit'].transform('first')
    df['area_cumsum'] = df.groupby('breath_id')['area_unit'].cumsum()

    df = df.fillna(0)

    # u_out, R and C are cast to strings so that get_dummies one-hot encodes them.
    df['u_out'] = df['u_out'].astype('str')
    df['R'] = df['R'].astype('str')
    df['C'] = df['C'].astype('str')
    df = pd.get_dummies(df, drop_first=True)

    return df


# Set the label aside, then turn the remaining columns into model features.
target = train["pressure"].to_numpy()
train = preprocessing(train.drop(columns=["id", "pressure"]))
# breath_id was only needed for the per-breath grouping inside preprocessing().
train = train.drop(columns=['breath_id'])
feature_column = train.columns.values
time.sleep(1)

### 1-5. RobustScaler

In [None]:
# Fit the scaler on the training features and replace `train` with the scaled
# result; the fitted `rs` is applied to the test features later on.
rs = RobustScaler()
train = rs.fit_transform(train)
print(f'train shape: {train.shape}')

## 2. Keras
### 2-1. Reshape Data
- Data Reshape for Keras

In [None]:
# Reshape (BreathID, Time_step)
# NOTE(review): assumes every breath has exactly 80 consecutive rows — confirm against the data.
target = target.reshape(-1, 80)

# Reshape (BreathID, Time_step, feature)
train = train.reshape(-1, 80, train.shape[-1])
print(train.shape)

### 2-2. Keras Class

In [None]:
class Keras:
    """K-fold trainer for a stacked bidirectional-LSTM regression model.

    Attributes:
        models: fitted model of every fold, in fold order.
        results: the object returned by ``model.fit`` for every fold.
        timeout: stored but not read anywhere in this class.
        batch_size: batch size passed to ``model.fit``.
        n_splits: number of cross-validation folds.
        epoch: maximum number of epochs per fold.
        es: early-stopping callback monitoring ``val_loss``.
        lr: callback that halves the learning rate when ``val_loss`` plateaus.
        kf: shuffled ``KFold`` splitter with a fixed seed.
    """

    def __init__(self):
        self.models = []
        self.results = []
        self.timeout = 28800
        self.batch_size = 512
        self.n_splits = 3
        self.epoch = 200
        self.es = EarlyStopping(monitor="val_loss", patience=10, verbose=0, mode="min", restore_best_weights=True)
        self.lr = ReduceLROnPlateau(monitor="val_loss", patience=2, verbose=0, factor=0.5, min_lr=1e-8)
        self.kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=2021)

    def create_model(self, n_layer, activation, mid_units, dropout_rate, train):
        """Build the (uncompiled) network.

        The network is one bidirectional LSTM with ``mid_units`` units,
        followed by ``n_layer`` further bidirectional LSTMs whose width is
        halved at every level, a dense layer half as wide again, and a
        single-unit output applied at every time step.

        Args:
            n_layer: number of additional halving LSTM layers (0 is allowed).
            activation: activation of the hidden dense layer.
            mid_units: width of the first LSTM layer.
            dropout_rate: accepted for interface compatibility; no dropout
                layer is currently added, so the value has no effect.
            train: array whose last two dimensions give the input shape.

        Returns:
            An uncompiled ``keras.models.Model``.
        """
        def halved_units(depth):
            # Width after `depth` halvings, never below one unit so that
            # small `mid_units` / deep stacks still give valid layers.
            return max(1, int(mid_units / (2 ** depth)))

        inputs = keras.layers.Input(shape=train.shape[-2:])
        x = keras.layers.Bidirectional(keras.layers.LSTM(int(mid_units), return_sequences=True))(inputs)
        for depth in range(1, n_layer + 1):
            x = keras.layers.Bidirectional(keras.layers.LSTM(halved_units(depth), return_sequences=True))(x)
        # The dense width is derived from n_layer directly rather than from
        # the loop variable, which is undefined when n_layer == 0.
        x = keras.layers.Dense(halved_units(n_layer + 1), activation=activation)(x)
        output = keras.layers.Dense(1)(x)
        model = keras.models.Model(inputs, output)
        return model

    def keras_trial(self, params, train, target):
        """Train one model per fold and store it with its training result.

        Args:
            params: dict with the keys ``n_layer``, ``activation``,
                ``mid_units``, ``dropout_rate`` and ``optimizer``.
            train: feature array indexed by sample along the first axis.
            target: target array indexed the same way as ``train``.

        Each fold's model is compiled with MAE loss, fitted with the
        learning-rate and early-stopping callbacks, and appended to
        ``self.models``; its fit result is appended to ``self.results``.
        """
        for fold, (trn_idx, val_idx) in enumerate(self.kf.split(train, target)):
            print(f'Fold {fold+1} started at {time.ctime()}')
            model = self.create_model(params["n_layer"],
                                      params["activation"],
                                      params["mid_units"],
                                      params["dropout_rate"],
                                      train)
            model.compile(optimizer=params["optimizer"], loss="mae")
            result = model.fit(x=train[trn_idx],
                               y=target[trn_idx],
                               batch_size=self.batch_size,
                               epochs=self.epoch,
                               verbose=1,
                               callbacks=[self.lr, self.es],
                               validation_data=(train[val_idx], target[val_idx])
                              )

            self.results.append(result)
            self.models.append(model)

            # Drop the local references; the objects stay alive through the
            # lists above.
            del result, model
            gc.collect()
            time.sleep(1)

### 2-3. Keras Trial

In [None]:
# Train one model per fold with a fixed set of hyper-parameters.
keras_inst = Keras()
params = {'n_layer': 3, 'mid_units': 64, 'dropout_rate': 0.01, 'activation': 'selu', 'optimizer': 'adam'}
keras_inst.keras_trial(params, train, target)
# The training arrays are not used after this point; free their memory.
del train, target
gc.collect()

### 2-4. Loss & learning rate

In [None]:
# One panel per fold: log of train / validation loss on the left axis and the
# learning rate (x1000) on the right axis.
# squeeze=False always returns a 2-D axes array, so a single fold works too.
fig, ax = plt.subplots(1, keras_inst.n_splits, figsize=(30, 10), squeeze=False)
for i in range(keras_inst.n_splits):
    history = keras_inst.results[i].history
    ax1 = ax[0, i]
    ax2 = ax1.twinx()
    ax1.plot(range(1, len(history['loss'])+1), np.log(history['loss']), label="train")
    ax1.plot(range(1, len(history['val_loss'])+1), np.log(history['val_loss']), label="valid")
    # The learning rate is logged under 'lr' by older Keras releases and
    # under 'learning_rate' by newer ones; accept either.
    lr_history = history.get('lr', history.get('learning_rate', []))
    ax2.plot(range(1, len(lr_history)+1), [x * 1000 for x in lr_history], label="lr", color="r", ls="--")
    ax1.set(xlabel='Epochs', ylabel='log(Loss)')
    ax2.set(ylabel="lr [x1000]")
    ax1.legend()
plt.tight_layout()
plt.show()

## 3. Submission

In [None]:
# Test does not have "pressure" column
# (the default makes re-running this cell safe once the key is gone)
dtypes.pop('pressure', None)

# Same feature pipeline as for train, using the scaler fitted on train.
test = read_test()
test = preprocessing(test)
test = test.drop(["id", 'breath_id'], axis=1)
test = rs.transform(test)
# NOTE(review): assumes every breath has exactly 80 consecutive rows — confirm against the data.
test = test.reshape(-1, 80, test.shape[-1])
test_shape = test.shape
print(test.shape)

# One flat prediction vector per fold model, in row order.
test_preds = []
for model in keras_inst.models:
    test_preds.append(model.predict(test).reshape(-1))

del test
gc.collect()

if not test_preds:
    raise RuntimeError("No trained models available: run keras_trial() before predicting.")

submission = pd.read_csv(config.data_dir + "sample_submission.csv")[:test_shape[0] * test_shape[1]]
# Average over the predictions actually collected, not the configured number of folds.
submission["pressure"] = sum(test_preds) / len(test_preds)
# Snap the predictions onto the grid min_pressure + k * diff_pressure,
# then clip them to [min_pressure, max_pressure].
submission["pressure"] = np.round((submission["pressure"] - config.post_processing["min_pressure"]) / config.post_processing["diff_pressure"]) * config.post_processing["diff_pressure"] + config.post_processing["min_pressure"]
submission["pressure"] = np.clip(submission["pressure"], config.post_processing["min_pressure"], config.post_processing["max_pressure"])
submission.to_csv('submission_keras.csv', index=False)
print(submission.tail(2))