# Machine Learning regression model - Version 1.0

!!!! Requries x64 !!!!

Required libraries:
- Requires the latest pip
-> pip install --upgrade pip
- Current stable release for CPU and GPU
-> pip install tensorflow
- Use seaborn for pairplot
-> pip install -q seaborn
- Pandas library: pip install pandas
- Use some functions from tensorflow_docs
-> pip install -q git+https://github.com/tensorflow/docs
- Numpy (if not already installed) 
-> pip install numpy
- Matplotlib (if not already installed)
-> pip install matplotlib

In [1]:
# Imports
# -----------

# required libraries
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# seaborn
import seaborn as sns
# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# tensorflow_docs
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [33]:
# Load dataset
# -----------------
DATASET_PATH = 'D:\Igor\Research_USF\Python\ml\concatenated_data.pickle'
# read pickle data and save as a pd dataset
raw_dataset = pd.read_pickle(DATASET_PATH)
# do not modify raw_data, instead, copy its instance
dataset = raw_dataset.copy()
# show some content
dataset.tail()

Unnamed: 0,a,b,time,x_1,y_1,z_1,x_2,y_2,z_2,x_3,...,uz_7,ux_8,uy_8,uz_8,sx,sy,sz,sxy,sxz,syz
2016680,1.910112,19.899471,0.181091,-0.5,-0.5,0.0,-0.5,-0.5,1.04309,-0.5,...,0.0,-0.04438,0.003211,0.043092,0.000466,0.000181,0.000444,2.35209e-11,-6.47365e-12,6.93366e-12
2016681,1.910112,19.899471,0.186464,-0.5,-0.5,0.0,-0.5,-0.5,1.0441,-0.5,...,0.0,-0.045413,0.003327,0.044099,-0.000352,-0.000142,-0.00034,-2.30449e-11,6.35792e-12,-6.80238e-12
2016682,1.910112,19.899471,0.191838,-0.5,-0.5,0.0,-0.5,-0.5,1.0451,-0.5,...,0.0,-0.04638,0.00338,0.045103,0.000442,0.000173,0.000424,2.24871e-11,-6.22877e-12,6.65226e-12
2016683,1.910112,19.899471,0.197461,-0.5,-0.5,0.0,-0.5,-0.5,1.04613,-0.5,...,0.0,-0.047426,0.003498,0.046127,-0.000337,-0.000136,-0.000326,-2.21435e-11,6.1494e-12,-6.56029e-12
2016684,1.910112,19.899471,0.2,-0.5,-0.5,0.0,-0.5,-0.5,1.0466,-0.5,...,0.0,-0.047836,0.003475,0.046601,0.001258,0.000501,0.001214,4.55039e-11,-1.27021e-11,1.3529e-11


In [34]:
# Check if there are unknown values

dataset.isna().sum()
dataset = dataset.dropna()

In [35]:
# Split data into train and test datasets
# Here, we are spliting 80% of the data for training and the rest for testing
train_dataset = dataset.sample(frac=0.8,random_state=0)
test_dataset = dataset.drop(train_dataset.index)

In [None]:
# Inpect data
sns.pairplot(train_dataset[['a','b','time']], diag_kind="kde")

In [36]:
# Set input params and data fetures
INPUT_PARAMS = ['a','b','time']
features = [v for v in list(dataset.columns.values) if v not in INPUT_PARAMS]

In [37]:
# Obtain data statistics
train_stats = train_dataset.describe()
train_stats = train_stats.drop(features, axis=1)
train_stats = train_stats.transpose()
train_stats


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
a,1613348.0,8.814532,5.979803,1.0,2.820225,8.137566,14.068783,20.0
b,1613348.0,6.436937,3.951849,1.0,3.325843,5.853933,8.483146,20.0
time,1613348.0,0.075484,0.058789,0.001,0.023116,0.060393,0.122796,0.2


In [38]:
# split inputs and features
train_labels = train_dataset.drop(INPUT_PARAMS, axis=1)     #outputs
train_dataset = train_dataset.drop(features, axis=1)        #inputs

test_labels = test_dataset.drop(INPUT_PARAMS, axis=1)
test_labels = test_labels.drop(features, axis=1)

In [39]:
print("train labels tail:")
train_labels.tail()

train labels tail:


Unnamed: 0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,...,uz_7,ux_8,uy_8,uz_8,sx,sy,sz,sxy,sxz,syz
1148901,-0.5,-0.5,0.0,-0.5,-0.5,1.16438,-0.5,0.391533,0.0,-0.5,...,0.0,-0.036726,-0.10857,0.164566,-0.289869,-0.427035,0.063522,-1.32816e-05,-1.15712e-05,3.82192e-07
1759682,-0.5,-0.5,0.0,-0.5,-0.5,1.00362,-0.5,0.500255,0.0,-0.5,...,0.0,-0.003861,0.000255,0.003619,-0.001253,-0.001206,-0.001218,-3.1102900000000004e-17,2.84915e-17,4.0888900000000004e-17
1751402,-0.5,-0.5,0.0,-0.5,-0.5,1.05742,-0.5,0.504484,0.0,-0.5,...,0.0,-0.058524,0.004484,0.057419,0.000358,0.00034,0.000407,2.40858e-10,-1.65712e-11,4.06608e-11
402755,-0.5,-0.5,0.0,-0.5,-0.5,1.06086,-0.5,0.504831,0.0,-0.5,...,0.0,-0.061904,0.004831,0.060865,0.001881,0.002198,0.001959,-2.83316e-11,1.37586e-12,-4.53742e-12
850476,-0.5,-0.5,0.0,-0.5,-0.5,1.03446,-0.5,0.502517,0.0,-0.5,...,0.0,-0.035734,0.002517,0.034455,-9.6e-05,-0.000273,-9.9e-05,3.42436e-15,-9.42739e-16,1.0034e-15


In [40]:
print("train dataset tail:")
train_dataset.tail()

train dataset tail:


Unnamed: 0,a,b,time
1148901,11.455026,6.764045,0.022331
1759682,19.597884,4.235955,0.008664
1751402,19.497354,5.449438,0.150115
402755,1.301587,4.134831,0.181303
850476,8.640212,9.898876,0.07371


In [41]:
# normalize data
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']

normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

# normed_train_data = train_dataset
# normed_test_data = test_dataset

normed_train_data.tail()

Unnamed: 0,a,b,time
1148901,0.441569,0.082773,-0.904128
1759682,1.803295,-0.55695,-1.136608
1751402,1.786484,-0.249883,1.269455
402755,-1.256387,-0.582539,1.799968
850476,-0.029152,0.87603,-0.030178


In [42]:
# build model

def build_model():
    model = keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=[len(INPUT_PARAMS)]),
    layers.Dense(32, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(features), activation='linear')
    ])

    optimizer = tf.keras.optimizers.RMSprop(0.001)
    lossFunction = tf.keras.losses.MeanSquaredError()
    metrics=['mean_squared_error']

    # loss functions already trie:
        # tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        # tf.keras.losses.MeanAbsoluteError(reduction=tf.keras.losses.Reduction.SUM)

    # model.compile(
    #             loss='mse',
    #             optimizer=optimizer,
    #             metrics=['mae', 'mse'])
    model.compile(
            loss=lossFunction,
            optimizer=optimizer,
            metrics=metrics)
    return model

model = build_model()

In [43]:
# inspect model
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 16)                64        
_________________________________________________________________
dense_29 (Dense)             (None, 32)                544       
_________________________________________________________________
dense_30 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_31 (Dense)             (None, 54)                3510      
Total params: 6,230
Trainable params: 6,230
Non-trainable params: 0
_________________________________________________________________


In [44]:
# try out model
normed_train_data.tail()
example_batch = normed_train_data[-2:]

example_result = model.predict(example_batch)
example_result

array([[ 2.34042183e-01, -6.74772784e-02, -2.34096080e-01,
        -6.83188289e-02, -3.84668887e-01, -2.82160908e-01,
        -1.14328414e-01, -3.33077908e-01,  1.72905087e-01,
        -1.73628964e-02,  1.15232609e-01, -3.35521437e-02,
        -2.51486838e-01,  2.13168621e-01,  2.39614710e-01,
        -5.27083635e-01, -3.12764615e-01, -1.17980622e-01,
         1.29862517e-01, -2.03018039e-02, -4.33978252e-03,
         1.14788145e-01, -1.95956603e-02, -2.61153460e-01,
        -4.38284427e-02, -3.47724259e-01, -3.75586927e-01,
        -6.99349344e-02,  3.36729884e-01, -1.02715015e-01,
        -1.89245284e-01,  1.00331008e-03, -3.39205295e-01,
        -2.85658628e-01,  8.52979645e-02, -5.98379932e-02,
         8.26591477e-02, -1.27170160e-01, -1.15422815e-01,
        -8.98819193e-02, -6.88349456e-02, -4.89543974e-02,
        -1.63714588e-01, -1.30937412e-01,  7.35426098e-02,
        -6.71119392e-02,  2.04966396e-01, -2.08686054e-01,
        -1.58935487e-01, -5.36282957e-02,  3.33192766e-0

In [45]:
# train model based on epochs
EPOCHS = 1000

# # The patience parameter is the amount of epochs to check for improvement
# early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# early_history = model.fit(normed_train_data, train_labels, 
#                     epochs=EPOCHS, validation_split = 0.2, verbose=0, 
#                     callbacks=[early_stop, tfdocs.modeling.EpochDots()])

history = model.fit(
  normed_train_data, 
  train_labels,
  epochs=EPOCHS, 
  validation_split = 0.2, 
  verbose=0,
  callbacks=[tfdocs.modeling.EpochDots()]
  )


Epoch: 0, loss:nan,  mean_squared_error:nan,  val_loss:nan,  val_mean_squared_error:nan,  
.

KeyboardInterrupt: 

In [23]:
# visualize history

hist = pd.DataFrame(early_history.history)
hist['epoch'] = early_history.epoch
hist.tail()

plotter = tfdocs.plots.HistoryPlotter(smoothing_std=2)
plotter.plot({'Early Stopping': early_history}, metric = "mae")
plt.ylim([0, 10])
plt.ylabel('MAE [MPG]')

ValueError: x and y must have same first dimension, but have shapes (10,) and (17,)

In [None]:
# evaluate model based on testing data

loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} MPG".format(mae))