# A Simple MLP

In [1]:
# mlp for multi-output regression
import time
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import *
from keras import Input
from keras.callbacks import CSVLogger, EarlyStopping

In [2]:
# df = pd.read_csv('./data/train.csv')
# print("DataFrame Shape: {} rows, {} columns".format(*df.shape))
# display(df.head())

xls_train = pd.ExcelFile('./data/train.xlsx')
xls_test = pd.ExcelFile('./data/test.xlsx')

features_considered = ['t','1', '2', '3', '4']
outputs_considered = ['x2', 'y2']

In [3]:
def norm_data(data):
    mean = data.mean(axis=0)
    std = data.std(axis=0)
    data = (data - mean)/std
    
    return data, mean, std

def load_data(xls):
    df = pd.read_excel(xls, sheet_name=None)
    print("Read {} sheets from excel file".format(len(df)))
    for sheet in df.values():
        sheet.columns = ['t','1','2','3','4','time','x','y']
        sheet['x2'] = sheet['x'] - sheet['x'][0]
        sheet['y2'] = sheet['y'] - sheet['y'][0]
        
    df = pd.concat(df.values(), ignore_index=False)
    print("DataFrame Shape: {} rows, {} columns".format(*df.shape))
    df.columns = ['t','1','2','3','4','time','x','y','x2','y2']
    display(df.head())
    
    features = df[features_considered]
    features.index = df['t']
    outputs = df[outputs_considered]
    outputs.index = df['t']
    
    X, mean, std = norm_data(features.values)
    y, mean, std = norm_data(outputs.values)
    X = features.values
    y = outputs.values
    y = y - y[0]
    
    print(X.shape, y.shape, mean.shape, std.shape)
    return X, y, mean, std

In [4]:
# We use "lr_schedule" to see which "learning rate" is optimum 
# Run the model with less epoch to visualize "learning rate" vs "loss"
# lr_schedule = tf.keras.callbacks.LearningRateScheduler(
#                     lambda epoch: 1e-8 * 10**(epoch/20))
# Optimizer and loos parameters
# loss = tf.keras.losses.Huber()
# optimizer = tf.keras.optimizers.SGD(lr=1e-8, momentum=0.9)
# optimizer = 'adam'

# get the model
def get_model(n_inputs, n_outputs):
    model = Sequential()
    model.add(Input(shape=(n_inputs,)))
    model.add(BatchNormalization(name = 'batch_norm_0'))
    model.add(Dense(32, name = 'dense_1', kernel_initializer='he_uniform', activation='relu'))
    model.add(Dropout(0.30, name = 'dropout_1'))
    model.add(BatchNormalization(name = 'batch_norm_1'))
    model.add(Dense(128, name = 'dense_2', kernel_initializer='he_uniform', activation='relu'))
    model.add(Dropout(0.20, name = 'dropout_2'))
    model.add(BatchNormalization(name = 'batch_norm_2'))
    model.add(Dense(256, name = 'dense_3', kernel_initializer='he_uniform', activation='relu'))
    model.add(Dropout(0.20, name = 'dropout_3'))
    model.add(BatchNormalization(name = 'batch_norm_3'))
    model.add(Dense(64, name = 'dense_4', kernel_initializer='he_uniform', activation='relu'))
    model.add(Dropout(0.20, name = 'dropout_4'))
    model.add(BatchNormalization(name = 'batch_norm_4'))
    model.add(Dense(16, name = 'dense_5', kernel_initializer='he_uniform', activation='relu'))
    model.add(BatchNormalization(name = 'batch_norm_5'))
    model.add(Dense(n_outputs, name = 'dense_6'))
    
    # model.compile(loss='mae', optimizer='adam')
    model.compile(loss = "mae", optimizer='adam', metrics = ['mae', 'mse'])
    
    return model

In [5]:
EPOCHS = 4
BATCH_SIZE = 1024
BUFFER_SIZE = 2048

# load train dataset
X_train, y_train, mean_train, std_train = load_data(xls_train)
# train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# train_data = train_data.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# load test dataset
X_test, y_test, mean_test, std_test = load_data(xls_test)
# test_data = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# test_data = test_data.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

n_inputs, n_outputs = X_train.shape[1], y_train.shape[1]

# define model
model = get_model(n_inputs, n_outputs)
print(model.summary())

# evaluate model
results = list()
# define evaluation procedure
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# enumerate folds
for train_ix, test_ix in rkf.split(X_train):
    print("TRAIN:", train_ix.shape, "TEST:", test_ix.shape)
    print("TRAIN:", type(train_ix), "TEST:", type(test_ix))
    ix_range = X_train.shape[0]
    train_ix = train_ix[train_ix < ix_range]
    test_ix = test_ix[test_ix < ix_range]
    print("Filtered: TRAIN:", train_ix.shape, "TEST:", test_ix.shape)
    # prepare data
    X_train, X_test = X_train[train_ix], X_train[test_ix]
    y_train, y_test = y_train[train_ix], y_train[test_ix]
    
    # fit model
    model.fit(X_train, y_train, verbose=1, epochs=EPOCHS)
    # history = model.fit(X_train, y_train, verbose=0, epochs=10, callbacks=[lr_schedule])
    # evaluate model on test set
    mae = model.evaluate(X_test, y_test, verbose=1)
    # store result
    # print('iteration %d: MAE = %.3f' %(len(results), mae))

    results.append(mae)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(results), std(results)))   

# Train
# modelstart = time.time()
# print("\nTRAIN MODEL...")
# history = model.fit(train_data, epochs=EPOCHS, validation_data=test_data, verbose=1)
# model.save('mlp_full_train.h5')
# print("\nModel Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))

Read 38 sheets from excel file
DataFrame Shape: 741213 rows, 10 columns


Unnamed: 0,t,1,2,3,4,time,x,y,x2,y2
0,0.0,119.947865,102.751992,100.623908,101.0,0.0,15.86038,-23.29752,0.0,0.0
1,0.01,119.927054,102.738962,100.628316,101.0,0.01,15.860189,-23.298586,-0.000191,-0.001066
2,0.02,119.906244,102.725932,100.632724,101.0,0.02,15.859999,-23.299651,-0.000381,-0.002131
3,0.03,119.885434,102.712902,100.637131,101.0,0.03,15.859808,-23.300717,-0.000572,-0.003197
4,0.04,119.864623,102.699872,100.641539,101.0,0.04,15.859618,-23.301783,-0.000762,-0.004263


(741213, 5) (741213, 2) (2,) (2,)
Read 6 sheets from excel file
DataFrame Shape: 129748 rows, 10 columns


Unnamed: 0,t,1,2,3,4,time,x,y,x2,y2
0,0.0,152.802487,107.029396,95.510399,101.0,0.0,15.88179,-23.33263,0.0,0.0
1,0.01,152.849741,107.434616,95.491606,101.0,0.01,15.881185,-23.334435,-0.000605,-0.001805
2,0.02,152.896996,107.839836,95.472812,101.0,0.02,15.880579,-23.336241,-0.001211,-0.003611
3,0.03,152.94425,108.245055,95.454018,101.0,0.03,15.879974,-23.338046,-0.001816,-0.005416
4,0.04,152.991505,108.650275,95.435225,101.0,0.04,15.879368,-23.339852,-0.002422,-0.007222


(129748, 5) (129748, 2) (2,) (2,)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_norm_0 (BatchNormaliza (None, 5)                 20        
_________________________________________________________________
dense_1 (Dense)              (None, 32)                192       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
batch_norm_1 (BatchNormaliza (None, 32)                128       
_________________________________________________________________
dense_2 (Dense)              (None, 128)               4224      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
batch_norm_2 (BatchNor

In [6]:
def plot_test_results(model, X_test, y_test, mean, std):
    y_pred = model.predict(X_test)
    # y_pred = y_pred*std + mean
    print(type(y_pred), y_pred.shape)
    # print('Predicted: %s' % y_pred[:2])
    # print('Real Value: %s' % y_test[:2])

    plt.figure(figsize=(18, 6))
    plt.plot(outputs_test.index, np.array(y_pred[:,0]), 'r', label='predicted x')
    plt.plot(outputs_test.index, np.array(y_pred[:,1]), 'b', label='predicted y')
    plt.plot(outputs_test.index, np.array(y_test[:,0]), 'g', label='real x')
    plt.plot(outputs_test.index, np.array(y_test[:,1]), 'c', label='real y')
    plt.legend(loc='upper left')
    plt.show()
    
    return

In [7]:
# Load test data
df_test = pd.read_excel(xls_train, sheet_name=None)
print("Read {} sheets from excel file".format(len(df_test)))

for sheet in df_test.values():
    print("DataFrame Shape: {} rows, {} columns".format(*sheet.shape))
    sheet.columns = ['t','1', '2', '3', '4','time','x', 'y']

    sheet['x2'] = sheet['x'] - sheet['x'][0]
    sheet['y2'] = sheet['y'] - sheet['y'][0]
    
    # df = pd.concat(df.values(), ignore_index=False)
    
    features_test = sheet[features_considered]
    features_test.index = sheet['t']
    outputs_test = sheet[outputs_considered]
    outputs_test.index = sheet['t']

    X_test = features_test.values
    y_test = outputs_test.values
    
    plot_test_results(model, X_test, y_test, mean_train, std_train)

Read 38 sheets from excel file


NameError: name 'sheet_test' is not defined