Hi Kagglers!!

In this notebook I'd like to learn the Keras functional API to help me build a model, and I hope you will learn it with me. I will also use the encoder from an autoencoder as data preparation, then use a simple ANN as a baseline. After that I will try to make my model more robust and wrap it all in cross-validation.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Load the competition files. Train and test use the "id" column as the index;
# the sample submission is read with its default integer index.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/train.csv", index_col="id")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/test.csv", index_col='id')
sub_sample_df = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv")

## Simple EDA

In the last competitions I spent more time on EDA than on modelling, so in this notebook my EDA will stay simple for now. I hope you forgive me this shortcut, but my goal is to learn the Keras Functional API and to see whether an autoencoder can help improve a model on an artificially created, noisy dataset.

In [None]:
# Last expression of the cell: the notebook renders the training frame as output
train_df

In [None]:
# Per-column summary statistics (one row per column), with the 'mean' and 'std'
# columns colour-coded using two different colormaps.
summary_stats = train_df.describe().T
(
    summary_stats.style
    .background_gradient(subset=['mean'], cmap='coolwarm')
    .background_gradient(subset=['std'], cmap='inferno')
)

In [None]:
# Bar chart of how many rows carry each distinct 'loss' value
target_values_sr = train_df['loss'].value_counts()

plt.figure(figsize=(14, 5))
sns.barplot(
    x=target_values_sr.index,
    y=target_values_sr.values,
    palette='coolwarm',
)
# Trailing semicolon keeps the notebook from echoing the Text object
plt.title("Target unique values", fontdict={'fontsize': 20});

In [None]:
# Pairwise correlation of every column in the training frame
corr_mat = train_df.corr()

# Take the 'loss' column and drop its final entry positionally
# (presumably the target's correlation with itself, 'loss' being the last column here)
loss_corr = corr_mat['loss'].iloc[:-1]

plt.figure(figsize=(25, 6))
loss_corr.plot(kind='bar', grid=True)
plt.title("Features correlation to target label", fontdict={'fontsize': 20});

The correlation between the features and the target is very weak; it almost doesn't exist. The correlation between the features themselves is very weak too.

## Create an autoencoder with Keras Functional API - autoencoder as data preparation

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import ReLU
from tensorflow.keras.layers import BatchNormalization, LayerNormalization
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Conv1D, Flatten, MaxPooling1D, Embedding

In [None]:
# Select features/target by column name instead of by position.
# The cross-validation cells further down append a 'kfold' column to train_df,
# so positional slicing (everything-but-last / last column) would silently use
# 'kfold' as the target if this cell were re-run afterwards.
X = train_df.drop(columns=['loss', 'kfold'], errors='ignore')
y = train_df['loss']

In [None]:
# Hold out 20% of the rows for validation (fixed random_state for repeatability)
Xtrain, Xvalid, ytrain, yvalid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

# Standardise the inputs: statistics are learned on the training part only
# and then applied unchanged to the validation part.
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)
Xvalid_scaled = scaler.transform(Xvalid)

In [None]:
# Number of feature columns; used as the input width of every model below
_, N_INPUTS = X.shape

In [None]:
def create_autoencoder():
    """Build and compile a dense autoencoder over ``N_INPUTS`` features.

    The network is Dense -> BatchNorm -> ReLU on each side of a linear
    bottleneck whose width equals the input width, followed by a linear
    reconstruction layer. It is compiled with Adam and MSE loss.

    Returns
    -------
    tuple
        ``(autoencoder_model, input_tensor, bottleneck_tensor)``. The two
        tensors let the caller build an encoder-only ``Model`` afterwards.
    """
    hidden_width = N_INPUTS * 2

    # Encoder
    inputs = Input(shape=(N_INPUTS,))
    enc = Dense(hidden_width)(inputs)
    enc = BatchNormalization()(enc)
    enc = ReLU()(enc)

    # Bottleneck (same width as the input, no activation)
    code = Dense(N_INPUTS)(enc)

    # Decoder mirrors the encoder
    dec = Dense(hidden_width)(code)
    dec = BatchNormalization()(dec)
    dec = ReLU()(dec)

    # Linear reconstruction of the original features
    reconstruction = Dense(N_INPUTS, activation='linear')(dec)

    autoencoder = Model(inputs=inputs, outputs=reconstruction)
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder, inputs, code

In [None]:
# Build the autoencoder and keep handles to its input and bottleneck tensors
autoenc_m, visible, bottleneck = create_autoencoder()

# Draw the architecture (shapes included)
plot_model(autoenc_m, show_shapes=True)

# Train the network to reproduce its own (scaled) inputs; the validation split
# is reconstructed as well so both losses are tracked per epoch.
history = autoenc_m.fit(
    x=Xtrain_scaled,
    y=Xtrain_scaled,
    epochs=50,
    verbose=2,
    validation_data=(Xvalid_scaled, Xvalid_scaled),
)

In [None]:
# One column per logged metric, one row per epoch
history_df = pd.DataFrame(data=history.history)

history_df.plot(figsize=(10, 5))

In [None]:
# The encoder is the autoencoder cut at the bottleneck: same input tensor,
# bottleneck tensor as output (the decoder half is left out).
encoder = Model(visible, bottleneck)

# Persist it so it can be reloaded later in the notebook
encoder.save(filepath='encoder.h5')

In [None]:
# Render the encoder architecture to 'encoder.png', including tensor shapes
plot_model(encoder, to_file='encoder.png', show_shapes=True)

## Base model

Now we can use the saved encoder from the autoencoder model to transform the input data and train a different predictive model. First, let's establish a baseline in performance on this problem. As a good practice, I will scale both the input variables and the target variable prior to fitting and evaluating the model. I've already scaled the training and validation inputs; now I only have to scale the target variable.

In [None]:
# StandardScaler expects 2-D input, so turn each target series into a column vector
ytrain = np.array(ytrain).reshape(-1, 1)
yvalid = np.array(yvalid).reshape(-1, 1)

# Fit the target scaler on the training targets only, then reuse it for validation
scaler_out = StandardScaler()
ytrain_scaled = scaler_out.fit_transform(ytrain)
yvalid_scaled = scaler_out.transform(yvalid)

In [None]:
# Baseline regressor
def create_ANN():
    """Return an uncompiled dense regressor named ``'baseline_model'``.

    Architecture: ``N_INPUTS`` inputs -> Dense(100, relu) -> Dense(100, relu)
    -> Dense(50, relu) -> Dense(1) with no output activation.
    """
    input_lyr = Input(shape=(N_INPUTS,))

    hidden = input_lyr
    for units in (100, 100, 50):
        hidden = Dense(units, activation='relu')(hidden)

    output_lyr = Dense(1)(hidden)

    return Model(inputs=input_lyr, outputs=output_lyr, name='baseline_model')


ann_model = create_ANN()
plot_model(ann_model, show_shapes=True)

In [None]:
# Baseline: Adam + MSE on the scaled (but not encoded) inputs
ann_model.compile(optimizer='adam', loss='mse')

history_base = ann_model.fit(
    x=Xtrain_scaled,
    y=ytrain_scaled,
    epochs=20,
    validation_data=(Xvalid_scaled, yvalid_scaled),
)

In [None]:
# Training/validation loss per epoch for the baseline model
history_base_df = pd.DataFrame(data=history_base.history)

history_base_df.plot(figsize=(10, 5))

In [None]:
# Predict in scaled space, then map predictions and targets back to original units
y_pred_base = scaler_out.inverse_transform(ann_model.predict(Xvalid_scaled))
yval = scaler_out.inverse_transform(yvalid_scaled)

# Root-mean-squared error on the validation split
mse_base = mean_squared_error(yval, y_pred_base)
rmse = np.sqrt(mse_base)
print(f"Base Model RMSE= {rmse}")

As we can see, our model is not a good model, so we need to address that. Let's see if data prepared via the autoencoder can help us.

In [None]:
# Reload the encoder that was saved earlier
encoder = load_model("encoder.h5")

# Pass both splits through the encoder to obtain the transformed features
Xtrain_enc = encoder.predict(Xtrain_scaled)
Xvalid_enc = encoder.predict(Xvalid_scaled)

# Same architecture and training setup as the baseline, but fed with encoded inputs
ann_enc_model = create_ANN()
ann_enc_model.compile(optimizer="adam", loss='mse')
hist_ann_enc = ann_enc_model.fit(
    x=Xtrain_enc,
    y=ytrain_scaled,
    epochs=20,
    validation_data=(Xvalid_enc, yvalid_scaled),
)

In [None]:
# Training/validation loss per epoch for the ANN trained on encoded inputs
hist_enc_df = pd.DataFrame(data=hist_ann_enc.history)

hist_enc_df.plot(figsize=(10, 5))

In [None]:
# Predict on the encoded validation inputs and map back to the original target units
y_pred_enc = ann_enc_model.predict(Xvalid_enc)
y_pred_enc = scaler_out.inverse_transform(y_pred_enc)
yval = scaler_out.inverse_transform(yvalid_scaled)

# Calculate RMSE.
# The label names the model actually evaluated here (ANN on encoder output);
# it previously reused the baseline's "Base Model" label, which made the two
# results indistinguishable in the output.
rmse = np.sqrt(mean_squared_error(yval, y_pred_enc))
print(f"Encoder + ANN Model RMSE= {rmse}")

Our model's behaviour is much better with data transformed by the encoder from the autoencoder. Now would be a good time to improve our model by adding e.g. Dropout, a weight initializer or early stopping.

In [None]:
# Stop when val_loss has not improved by at least 1e-4 for 6 epochs and
# roll the model back to its best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=6,
    min_delta=0.0001,
    restore_best_weights=True,
)

# Lower the learning rate after 3 epochs without val_loss improvement
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    patience=3,
    verbose=1,
)

# Initializers read by best_ann() for its first Dense layer
weight_initializer = tf.keras.initializers.glorot_uniform()
bias_init = tf.keras.initializers.Zeros()

In [None]:
def best_ann():
    """Return an uncompiled dense regressor with layer normalisation.

    Architecture: ``N_INPUTS`` inputs -> Dense(100, relu) -> LayerNorm
    -> Dense(100, relu) -> LayerNorm -> Dense(50, relu) -> Dense(1).

    Only the first Dense layer is given the module-level
    ``weight_initializer`` / ``bias_init``; the remaining layers use the
    Keras defaults. Dropout after the first two Dense layers was tried in an
    earlier revision and is intentionally not part of the network.
    """
    inputs = Input(shape=(N_INPUTS,))

    hidden = Dense(
        100,
        activation='relu',
        kernel_initializer=weight_initializer,
        bias_initializer=bias_init,
    )(inputs)
    hidden = LayerNormalization()(hidden)

    hidden = Dense(100, activation='relu')(hidden)
    hidden = LayerNormalization()(hidden)

    hidden = Dense(50, activation='relu')(hidden)
    prediction = Dense(1)(hidden)

    return Model(inputs=inputs, outputs=prediction)

In [None]:
# Instantiate best model (built by best_ann(); not compiled yet)
best_model = best_ann()
# Draw the architecture with tensor shapes; as the last expression of the cell
# the returned image is shown as the cell output
plot_model(best_model,show_shapes=True)

In [None]:
tf.random.set_seed(45)

# Compile the model.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# TF 2.x and rejected by newer Keras releases.
best_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                   loss=tf.keras.losses.mean_squared_error)

# Fit the model on the encoded inputs; early stopping and LR reduction both
# watch the validation loss.
hist_best_model = best_model.fit(Xtrain_enc,
                                 ytrain_scaled,
                                 batch_size=128,
                                 epochs=50,
                                 validation_data=(Xvalid_enc, yvalid_scaled),
                                 callbacks=[es,plateau])

In [None]:
hist_best_df = pd.DataFrame(hist_best_model.history)

# Leave the learning-rate series out of the loss plot. Depending on the Keras
# version the ReduceLROnPlateau callback logs it as 'lr' or 'learning_rate';
# errors='ignore' keeps the cell working whichever key (if any) is present.
hist_best_df.drop(columns=['lr', 'learning_rate'], errors='ignore').plot(figsize=(10,5))

In [None]:
# Predict on the encoded validation inputs and map back to the original target units
y_pred_enc_b = best_model.predict(Xvalid_enc)
y_pred_enc_b = scaler_out.inverse_transform(y_pred_enc_b)
yval = scaler_out.inverse_transform(yvalid_scaled)

# Calculate RMSE.
# Label corrected: this is the improved model (best_ann on encoder output),
# not the baseline, so it no longer prints as "Base Model".
rmse = np.sqrt(mean_squared_error(yval, y_pred_enc_b))
print(f"Best Model RMSE= {rmse}")

Now that we have a nice model, I will wrap it up with one of the cross-validation methods and average the per-fold predictions, as this usually helps to improve predictions (different splits can give better results).

## K-fold Cross Validation

In [None]:
def plot_history(d):
    """Plot the per-epoch training curves stored in a Keras history dict.

    Parameters
    ----------
    d : dict
        Mapping of metric name to a list of per-epoch values
        (e.g. ``History.history``).

    The learning-rate series is left out of the plot. It is removed under
    both names it can be logged as ('lr' or 'learning_rate', depending on the
    Keras version), and a history without either key is plotted as-is instead
    of raising ``KeyError``.
    """
    df = pd.DataFrame(d)
    df.drop(columns=['lr', 'learning_rate'], errors='ignore').plot(figsize=(10,5))
    plt.show()

In [None]:
# Number of cross-validation splits (also the divisor when averaging test predictions)
N_FOLDS = 10
# random_state handed to StratifiedKFold so the fold assignment is repeatable
SEED = 45

In [None]:
# Per-fold validation RMSE and the running sum of test predictions
rmse_folds = []
test_sub = np.zeros((len(test_df),1))

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# skf.split() yields *positional* row indices, whereas train_df is indexed by
# its "id" column. Record the fold of every row positionally so the assignment
# stays correct even when the ids are not exactly 0..n-1 (label-based .loc
# would then tag the wrong rows or fail).
fold_ids = np.full(len(train_df), -1)
for fold, (_, val_idx) in enumerate(skf.split(X=train_df, y=train_df['loss'])):
    fold_ids[val_idx] = fold
train_df['kfold'] = fold_ids


for fold in range(N_FOLDS):

    # Fresh callbacks for every fold
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,mode='min',restore_best_weights=True, min_delta=0.0001)
    plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, mode='min',verbose=1)
    # Module-level names read by best_ann() for its first Dense layer
    weight_initializer = tf.keras.initializers.glorot_uniform()
    bias_init = tf.keras.initializers.Zeros()


    print('==================')
    print(f"TRAINING FOLD={fold+1}")
    print('==================')
    # Rows tagged with the current fold form the validation set, the rest train
    train = train_df[train_df['kfold'] != fold].reset_index(drop=True)
    valid = train_df[train_df['kfold'] == fold].reset_index(drop=True)
    xtrain = train.drop(['kfold', 'loss'], axis=1)
    xvalid = valid.drop(['kfold','loss'], axis=1)

    # Column vectors, as required by StandardScaler
    ytrain = np.array(train['loss']).reshape((len(xtrain),1))
    yvalid = np.array(valid['loss']).reshape((len(xvalid), 1))

    # Instantiate our scaler for input data and target
    scaler_in = StandardScaler()
    scaler_out = StandardScaler()
    # Scale input dataset (statistics come from this fold's training rows only)
    xtrain_scaled = scaler_in.fit_transform(xtrain)
    xvalid_scaled = scaler_in.transform(xvalid)
    test_scaled = scaler_in.transform(test_df)
    # Scale output target
    ytrain_scaled = scaler_out.fit_transform(ytrain)
    yvalid_scaled = scaler_out.transform(yvalid)

    # Encode our datasets with the encoder already present in the notebook
    # (it is not retrained inside this loop)
    xtrain_enc = encoder.predict(xtrain_scaled)
    xvalid_enc = encoder.predict(xvalid_scaled)
    test_enc = encoder.predict(test_scaled)

    # Instantiate and fit ANN model.
    # `learning_rate` replaces the deprecated `lr` keyword.
    best_model_cv = best_ann()
    best_model_cv.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                          loss=tf.keras.losses.mean_squared_error)

    history = best_model_cv.fit(xtrain_enc,
                                ytrain_scaled,
                                epochs=30,
                                batch_size=128,
                                validation_data=(xvalid_enc, yvalid_scaled),
                                callbacks=[es, plateau])


    # Evaluate the model
    y_pred = best_model_cv.predict(xvalid_enc)
    y_pred_sub = best_model_cv.predict(test_enc)
    # Calculate rmse in the original target units
    y_pred = scaler_out.inverse_transform(y_pred)
    y_true = scaler_out.inverse_transform(yvalid_scaled)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    rmse_folds.append(rmse)
    print('==============================')
    print(f"FOLD={fold+1}, RMSE={rmse}")
    print('==============================')

    # Plot history
    plot_history(history.history)

    # Add predictions on test dataset for submission.
    # They stay in scaled units here: the submission cell divides the sum by
    # N_FOLDS and applies scaler_out.inverse_transform afterwards.
    test_sub += y_pred_sub

print('===============================================')
print(f'AVERAGE RMSE AFTER {N_FOLDS} FOLDS = {np.average(rmse_folds)}')
print('===============================================')

## Time for first submission 

In [None]:
# Average the accumulated (scaled) test predictions over the folds and map them
# back to the original target units.
# NOTE(review): scaler_out is whatever the last CV fold left behind, so the
# inverse transform uses that fold's target statistics for all folds.
test_sub = scaler_out.inverse_transform(test_sub / N_FOLDS)

# Fill the sample submission's 'loss' column and write the file
sub_df = sub_sample_df.copy()
sub_df['loss'] = test_sub
sub_df.to_csv('first_encoder_sub_10_folds.csv', index=False)

In [None]:
# Reset Keras' global state before building the next batch of models
tf.keras.backend.clear_session()

## Encoder and CNN ensemble for tabular data

In [None]:
def conv1D_model():
    """Return an uncompiled 1-D convolutional regressor.

    The input has shape ``(N_INPUTS, 1)``, i.e. each feature is treated as one
    step of a single-channel sequence. Two Conv1D layers (64 then 128 filters,
    kernel size 2, relu) are followed by stride-1 'same' max pooling, a
    flatten, two Dense layers (50 and 25 units, relu) and a single linear
    output unit.
    """
    input_L = Input(shape=((N_INPUTS, 1)))

    # Convolutional feature extractor
    features = Conv1D(filters=64, kernel_size=2, activation='relu')(input_L)
    features = Conv1D(filters=128, kernel_size=2, activation='relu')(features)
    features = MaxPooling1D(pool_size=3, strides=1, padding='same')(features)

    # Dense regression head
    head = Flatten()(features)
    head = Dense(50, activation='relu')(head)
    head = Dense(25, activation='relu')(head)
    prediction = Dense(1)(head)

    return Model(inputs=input_L, outputs=prediction)

In [None]:
# Per-fold validation RMSE and running sums of test predictions for both models
cnn_rmse_folds = []
ann_rmse_folds = []
test_sub_ann = np.zeros((len(test_df),1))
test_sub_cnn = np.zeros((len(test_df),1))

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# skf.split() yields *positional* row indices, whereas train_df is indexed by
# its "id" column. Record the fold of every row positionally so the assignment
# stays correct even when the ids are not exactly 0..n-1 (label-based .loc
# would then tag the wrong rows or fail).
fold_ids = np.full(len(train_df), -1)
for fold, (_, val_idx) in enumerate(skf.split(X=train_df, y=train_df['loss'])):
    fold_ids[val_idx] = fold
train_df['kfold'] = fold_ids


for fold in range(N_FOLDS):

    # Split the data: rows tagged with the current fold validate, the rest train
    train = train_df[train_df['kfold'] != fold].reset_index(drop=True)
    valid = train_df[train_df['kfold'] == fold].reset_index(drop=True)
    xtrain = train.drop(['kfold', 'loss'], axis=1)
    xvalid = valid.drop(['kfold','loss'], axis=1)

    # Column vectors, as required by StandardScaler
    ytrain = np.array(train['loss']).reshape((len(xtrain),1))
    yvalid = np.array(valid['loss']).reshape((len(xvalid), 1))

    # Instantiate our scaler for input data and target
    scaler_in = StandardScaler()
    scaler_out = StandardScaler()
    # Scale input dataset (statistics come from this fold's training rows only)
    xtrain_scaled = scaler_in.fit_transform(xtrain)
    xvalid_scaled = scaler_in.transform(xvalid)
    test_scaled = scaler_in.transform(test_df)
    # Scale output target
    ytrain_scaled = scaler_out.fit_transform(ytrain)
    yvalid_scaled = scaler_out.transform(yvalid)

    print('======================================')
    print(f'TRAINING AUTOENCODER IN FOLD={fold+1}')
    print('======================================')
    # The autoencoder gets its own, freshly created LR scheduler. Previously it
    # used the name `plateau` before this loop had defined it, i.e. a callback
    # left over from an earlier cell (first fold) or from the previous fold's
    # CNN (later folds).
    ae_plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, mode='min',verbose=1)
    # instantiate autoencoder
    autoenc_m, visible, bottleneck = create_autoencoder()
    # Fit the autoencoder model to reconstruct inputs
    autoenc_m.fit(xtrain_scaled,
                  xtrain_scaled,
                  epochs=15,
                  verbose=2,
                  validation_data=(xvalid_scaled, xvalid_scaled),
                  callbacks=[ae_plateau])

    # Define an encoder model (without the decoder)
    encoder = Model(inputs=visible, outputs=bottleneck)

    # Encode our datasets
    xtrain_enc = encoder.predict(xtrain_scaled)
    xvalid_enc = encoder.predict(xvalid_scaled)
    test_enc = encoder.predict(test_scaled)

    print('===========================')
    print(f"TRAINING ANN MODEL FOLD={fold+1}")
    print('===========================')

    #===================== ANN Model ============================#
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6,mode='min',restore_best_weights=True, min_delta=0.0001)
    plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, mode='min',verbose=1)
    # Module-level names read by best_ann() for its first Dense layer
    weight_initializer = tf.keras.initializers.glorot_uniform()
    bias_init = tf.keras.initializers.Zeros()

    # Instantiate and fit ANN model on the encoded inputs.
    # `learning_rate` replaces the deprecated `lr` keyword.
    best_model_cv = best_ann()
    best_model_cv.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                          loss=tf.keras.losses.mean_squared_error)

    history_ann = best_model_cv.fit(xtrain_enc,
                                    ytrain_scaled,
                                    epochs=30,
                                    batch_size=128,
                                    validation_data=(xvalid_enc, yvalid_scaled),
                                    callbacks=[es, plateau])

    # Evaluate the ANN model
    y_pred_ann = best_model_cv.predict(xvalid_enc)
    y_pred_sub_ann = best_model_cv.predict(test_enc)
    # Calculate rmse in the original target units
    y_pred_ann = scaler_out.inverse_transform(y_pred_ann)
    y_true = scaler_out.inverse_transform(yvalid_scaled)
    ann_rmse = np.sqrt(mean_squared_error(y_true, y_pred_ann))
    ann_rmse_folds.append(ann_rmse)

    print('===========================')
    print(f"FOLD={fold+1}, RMSE={ann_rmse}")
    print('===========================')

    # Plot history
    plot_history(history_ann.history)

    # Test predictions are accumulated in scaled units; the submission cell
    # averages them and applies scaler_out.inverse_transform.
    test_sub_ann += y_pred_sub_ann

    #====================== CNN Model ===========================#
    # The CNN uses its own, more impatient callbacks
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4,mode='min',restore_best_weights=True, min_delta=0.0001)
    plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2, mode='min',verbose=1)

    print('=============================')
    print(f"TRAINING CNN MODEL FOLD={fold+1}")
    print('=============================')
    # Prepare shape of input data for CNN model: add a trailing channel axis.
    # The CNN is fed the scaled (not encoded) features.
    xtr_conv = xtrain_scaled.reshape((xtrain_scaled.shape[0],xtrain_scaled.shape[1], 1))
    xval_conv = xvalid_scaled.reshape((xvalid_scaled.shape[0],xvalid_scaled.shape[1], 1))
    test_conv = test_scaled.reshape((test_scaled.shape[0],test_scaled.shape[1], 1))

    # Instantiate and fit Conv1D model
    conv_model = conv1D_model()
    conv_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                       loss=tf.keras.losses.mean_squared_error)


    history_cnn = conv_model.fit(xtr_conv,
                                 ytrain_scaled,
                                 epochs=30,
                                 batch_size=128,
                                 validation_data=(xval_conv, yvalid_scaled),
                                 callbacks=[es, plateau])

    # Evaluate the CNN model
    y_pred_cnn = conv_model.predict(xval_conv)
    y_pred_sub_cnn = conv_model.predict(test_conv)
    # Calculate rmse in the original target units
    y_pred_cnn = scaler_out.inverse_transform(y_pred_cnn)
    y_true = scaler_out.inverse_transform(yvalid_scaled)
    cnn_rmse = np.sqrt(mean_squared_error(y_true, y_pred_cnn))
    cnn_rmse_folds.append(cnn_rmse)


    print('==============================')
    print(f"FOLD={fold+1}, RMSE={cnn_rmse}")
    print('==============================')

    # Plot history
    plot_history(history_cnn.history)

    # Add predictions on test dataset for submission (scaled units, see above)
    test_sub_cnn += y_pred_sub_cnn

print('===============================================')
print(f'AVERAGE RMSE FOR ANN MODEL AFTER {N_FOLDS} FOLDS = {np.average(ann_rmse_folds)}')
print('===============================================')
print('===============================================')
print(f'AVERAGE RMSE FOR CNN MODEL AFTER {N_FOLDS} FOLDS = {np.average(cnn_rmse_folds)}')
print('===============================================')

In [None]:
# Fold-averaged test predictions of each model (still in scaled target units)
ann_mean = test_sub_ann / N_FOLDS
cnn_mean = test_sub_cnn / N_FOLDS

# Equal-weight blend of the two models. The sum must be parenthesised before
# halving: without the parentheses only the CNN term was divided by 2, which
# produced ann_mean + cnn_mean / 2 instead of the mean of the two.
test_sub = (ann_mean + cnn_mean) / 2

# Back to the original target units.
# NOTE(review): scaler_out is the one left behind by the last CV fold.
test_sub = scaler_out.inverse_transform(test_sub)

# Fill the sample submission's 'loss' column and write the file
sub_df = sub_sample_df.copy()
sub_df['loss'] = test_sub
sub_df.to_csv('encoder_ann_cnn_sub_10_folds_sc_best.csv', index=False)

In [None]:
# Reset Keras' global state now that the models above are no longer needed
tf.keras.backend.clear_session()

# **Don't forget to upvote 👍 if you like it. Big Thanks 💗**