# Tensorflow MLP
Here we will attempt to design a regression model for the data. First we will get the data from the new file then select which to use. The feature vector will be {b1, a2, frac, RatioTotalArea} and the target will be {eta c}. 

There are currently 4 options:
- entire unscaled dataset: 57841
- entire scaled dataset (from unscaled): 57841
- limited dataset: 42016
- limited and scaled dataset: 42016

I will use the limited and scaled dataset to start because I can randomise easily. The data is scaled using the MaxAbsScaler to start.

### I do think that we may be losing data on the 2nd ellipse, so another feature may be necessary. (The info for the other dimension is there, but only within 'RatioTotalArea'.)

## Model Thoughts
- Regression
- Output activation = linear; hidden layers = ReLU, because the inputs and outputs are positive
- Loss: 'mean_squared_error'; 'mean_squared_logarithmic_error', which does not penalise large values as much; or 'mean_absolute_error', which is more robust to outliers
- Regularizers: Penalization in cost function to prevent overfitting. There is l1 and l2 + more. 
- Optimizers:
- Dropout: Prevent overfitting. Maybe use rate of 0.2?

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

# Seed handed to split_data so that the shuffle is reproducible between runs.
seed_val = 2000

In [None]:
# def split_data(dataset, seed, train_ratio=0.6, test_ratio=0.2, shuffle=True):
#     if shuffle:
#         dataset = dataset.sample(frac=1, random_state=seed).reset_index(drop=True)
#     test_train_ratio = test_ratio/(1-train_ratio)

#     train_dataset = dataset.sample(frac=train_ratio, random_state=0)
#     valid_and_test_dataset = dataset.drop(train_dataset.index)

#     test_dataset = valid_and_test_dataset.sample(frac=test_train_ratio, random_state=0)
#     validation_dataset = valid_and_test_dataset.drop(test_dataset.index)
    
#     return train_dataset, test_dataset, validation_dataset

def split_data(dataset, seed, train_ratio=0.6, shuffle=True):
    """Split a DataFrame into a training part and a testing part.

    Args:
        dataset: pandas DataFrame to split.
        seed: random state used for the optional shuffle.
        train_ratio: fraction of the rows sampled into the training part.
        shuffle: when True, reorder all rows (keeping their index labels)
            before the training rows are sampled.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple; the test part holds every
        row whose index label was not sampled into the training part.
    """
    source = dataset.sample(frac=1, random_state=seed) if shuffle else dataset

    # The training sample itself always uses a fixed random_state of 0.
    train_dataset = source.sample(frac=train_ratio, random_state=0)
    test_dataset = source.drop(index=train_dataset.index)
    return train_dataset, test_dataset

def add_bias(data):
    """Return a copy of ``data`` with one extra trailing column of -1.

    Args:
        data: 2-D array-like of shape (rows, columns).

    Returns:
        Float NumPy array of shape (rows, columns + 1): the original values
        followed by a constant -1 column.
    """
    shape = np.shape(data)
    # Start from an all -1 array so the last column is already in place.
    augmented = np.full((shape[0], shape[1] + 1), -1.0)
    augmented[:, :shape[1]] = data
    return augmented

In [None]:
# Input file for this run; "LIM_scaled.csv" can be substituted here instead.
name = "data.csv"

# Load the CSV and discard the "Unnamed: 0" column it contains.
dataset = pd.read_csv(name).drop(columns="Unnamed: 0")

# Summary statistics, one row per column.
dataset.describe().T

Scale the input features

In [None]:
# Work on a copy so the unscaled `dataset` stays available.
scaled_dataset = dataset.copy()

# Scale each input feature on its own with a fresh MaxAbsScaler.
# NOTE(review): the scalers are fitted on the full dataset, i.e. on every
# row rather than on a training subset only.
for feature in ('b1', 'a2', 'RatioTotalArea', 'frac'):
    column = dataset[feature].values.reshape(-1, 1)
    scaled_dataset[feature] = MaxAbsScaler().fit_transform(column)

In [None]:
# Summary statistics of the scaled frame, one row per column.
scaled_dataset.describe().T

In [None]:
# 70 % of the scaled rows go to training, the remainder to testing.
train_dataset, test_dataset = split_data(
    scaled_dataset.copy(), seed_val, train_ratio=0.7
)

# Index-ordered copies of both parts.
sorted_train, sorted_test = (
    part.sort_index() for part in (train_dataset, test_dataset)
)

Check if the training and test data represent the data.

In [None]:
# One figure with two side-by-side axes. plt.subplots creates the figure
# itself, so no separate plt.figure() call is made beforehand (that would
# leave an additional, empty figure behind).
fig, ax = plt.subplots(ncols=2, figsize=(15,7))

# Target value of every row, in index order, for each part of the split.
ax[0].scatter(x=np.arange(len(sorted_train)), y=sorted_train['eta c'], marker='.', alpha=0.4)
ax[1].scatter(x=np.arange(len(sorted_test)), y=sorted_test['eta c'], marker='.', alpha=0.4)
ax[0].set_ylabel("eta c")
ax[0].set_title("Training Data")
ax[1].set_title("Testing Data")

In [None]:
# Columns 0-3 are taken as the features and column 4 as the label.
train_array = train_dataset.to_numpy()
train_features, train_labels = train_array[:, :4], train_array[:, 4]

# The test rows are put back into index order before conversion.
sorted_test = test_dataset.sort_index()
test_array = sorted_test.to_numpy()
test_features, test_labels = test_array[:, :4], test_array[:, 4]

## Code the Model
We have now scaled, shuffled and split the data, and have also checked that both the training and test sets represent the output space. We can now move on to coding a model.

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Network dimensions: four input features, one regression output.
in_features = 4
out_nodes = 1

# Columns 0-3 of the training frame are the inputs, column 4 the target.
train_array = train_dataset.to_numpy()
train_features = train_array[:, :in_features]
train_labels = train_array[:, in_features]

# Independent copies used as the training inputs (X) and targets (Y).
X, Y = train_features.copy(), train_labels.copy()

In [None]:
# Early stopping: halt once the training loss has not improved for 3
# consecutive epochs and restore the weights of the best epoch seen.
earlystop_callback = EarlyStopping(monitor="loss", min_delta=0, patience=3, mode="min", restore_best_weights=True)

# Define the MLP model: in_features inputs -> 20 -> 15 -> out_nodes outputs.
model = Sequential()

# Hidden layers with ReLU activations. Each one is followed by a Dropout
# layer that is passed to model.add(); merely constructing the layer would
# leave it out of the network, so no dropout would be applied.
model.add(Dense(20, input_shape=(in_features,), activation="relu", use_bias=True))
model.add(keras.layers.Dropout(rate=0.2))
model.add(Dense(15, activation="relu", use_bias=True))
model.add(keras.layers.Dropout(rate=0.2))
# Linear output layer for the regression target.
model.add(Dense(out_nodes, activation="linear", use_bias=True))

# optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)
# NOTE(review): "accuracy" is a classification metric and carries no useful
# meaning for this regression target; it is kept only so the recorded
# history columns stay the same. Consider removing it.
model.compile(loss="mean_squared_error", optimizer='SGD', metrics=["MSE","MAE", "accuracy"])

# Train on X/Y, holding out 20 % of the samples for validation.
history = model.fit(X, Y, epochs=300, batch_size=500, validation_split=0.2, callbacks=[earlystop_callback])

In [None]:
# Print an overview of the model's layers.
model.summary()

In [None]:
# Per-epoch training history as a table, with the epoch number appended
# as the last column; show the final few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
# Training versus validation MSE for every epoch.
fig1 = plt.figure()
plt.plot(hist['epoch'], hist['MSE'], label='Train Error')
plt.plot(hist['epoch'], hist['val_MSE'], label='Val Error')
plt.title("Regression Error")
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()

In [None]:
test_input = test_features
test_output = test_labels

# For the single-output model the predictions come back as a column,
# i.e. with shape (n, 1), while test_output is a flat (n,) vector.
prediction = model.predict(test_input)
# Flatten the predictions before subtracting: (n, 1) - (n,) would otherwise
# broadcast into an (n, n) matrix instead of one residual per test sample.
accuracy = prediction.reshape(-1) - test_output

In [None]:
# Overlay the model's predictions on the true test targets.
fig2 = plt.figure()
sample_axis = np.arange(len(test_output))
plt.plot(sample_axis, test_output, label="TestData")
plt.plot(np.arange(len(prediction)), prediction, label="Prediction", alpha=0.4)
# The values stored in `accuracy` are not drawn here.
plt.title("Model Prediction")
plt.legend()