In [None]:
import numpy as np

import dense_layer as dl
import dl_utils as dutils
import importlib

import matplotlib.pyplot as plt

# Reload the project-local modules so that edits made to their source files
# after the first import are picked up without restarting the kernel.
importlib.reload(dl)
importlib.reload(dutils)

# Simple Multi-linear Regression using NN
Neural networks' **representation capacity** is one of the key properties behind their great success *(alongside the optimizability of their underlying computational operations, the availability of large amounts of data, etc.)*.
But to go step by step: a fully-connected neural network reduced to a single one-node layer (a perceptron) and omitting any non-linearity is equivalent to a linear regression model.  
Thus, in this initial example, we define an arbitrary linear function $y = f_{lin}(X)$, where $X=(x_1,x_2,x_3,x_4) \in \mathbb{R}^4$ and $y \in \mathbb{R}$, by choosing a set of coefficients $c_i, \ i \in \{1,\dots,4\}$ and an intercept $c_0$.

$f_{lin} : \mathbb{R}^4 \to \mathbb{R}$ 

$\ \ \ \ \ \ \ \ X \mapsto c_0 + \sum_{i=1}^4 c_i x_i$

We randomly generate a set of $m$ observations following the joint distribution of $(X, f_{lin}(X))$, with a small Gaussian noise added to the labels. 
This simple notebook demonstrates how we can fit such a continuous function *(=> regression problem)* using a simple perceptron (a neural network with one layer and one node), which implements a multi-linear regression *(with 4 inputs and a non-null intercept in this case)*.

In [None]:
# Fix the global NumPy seed so that the synthetic dataset (and any later code
# drawing from the legacy `np.random` global state) is reproducible when the
# notebook is re-run from a fresh kernel.
SEED = 42
np.random.seed(SEED)

# Data size (number of observations)
m = 4096

# Input features (independent variables), each drawn from its own distribution
x1 = np.random.uniform(low=-10, high=20, size=[m])
x2 = np.random.uniform(low=0, high=10, size=[m])
x3 = np.random.normal(loc=5, scale=10, size=[m])
x4 = np.random.normal(loc=-1, scale=3, size=[m])

# True coefficients (c_1..c_4) & intercept (c_0) of the linear function that
# the model is expected to recover
coefs_lin = np.array([2, -0.5, 3.5, -1.1])
intercept_lin = 6.8


def f_lin(x_1, x_2, x_3, x_4):
    """Evaluate the reference linear function at the given feature values.

    Each input is weighted by the matching entry of the module-level
    `coefs_lin`, the four terms are accumulated from left to right, and
    `intercept_lin` is added last.
    """
    weighted_sum = coefs_lin[0] * x_1
    weighted_sum = weighted_sum + coefs_lin[1] * x_2
    weighted_sum = weighted_sum + coefs_lin[2] * x_3
    weighted_sum = weighted_sum + coefs_lin[3] * x_4
    return weighted_sum + intercept_lin


# Label (dependent variable): noise-free values of the reference function
y = f_lin(x1, x2, x3, x4)

# Perturb the labels with Gaussian noise whose standard deviation is one tenth
# of the standard deviation of the noise-free labels
noise = np.random.randn(m) * np.std(y) / 10
y += noise

# Design matrix: one row per observation, one column per feature
X = np.array([x1, x2, x3, x4]).T

In [None]:
# Train-Test splitting with `test_size=0.25`
# (presumably the fraction of observations held out for testing — confirm in
# `dl_utils.train_test_split`)
split = dutils.train_test_split(X, y, test_size=0.25)
X_train, X_test, y_train, y_test = split

In [None]:
# Multi-linear regression model: a single (output) layer taking the 4 input
# features to 1 output, with an identity activation
output_layer = dl.Dense(in_size=4, out_size=1, activation="identity")
MultiLinRegNN = dl.Sequential([output_layer])

In [None]:
# Compile model with `normalize=True`.
# NOTE(review): later cells read `_in_features_mean` / `_in_features_std` from
# the model to undo a mean/std scaling of the inputs — presumably this flag is
# what enables that scaling; confirm in `dense_layer`.
MultiLinRegNN.compile(normalize=True)

In [None]:
# Parameters of the single layer before any training:
#   weights ~ regression coefficients
#   bias    ~ regression intercept
first_layer = MultiLinRegNN._layers[0]
print(f"Weights before :\n {first_layer._W}")
print(f"Bias before : {first_layer._b}")

In [None]:
# Baseline: evaluate the model on the test data before it has been trained.
# The score is computed with the loss function (TODO : Metric can be different)
MultiLinRegNN.evaluate(X_data=X_test, y_label=y_test)

In [None]:
# Training hyper-parameters, gathered in one place so they are easy to tune
train_settings = dict(
    minibatch=64,
    shuffle=True,
    n_iterations=100,
    lr=0.02,
    min_loss=0.01,
)

# Train the model on the training split and keep the loss history log that
# `train` returns
lossHist = MultiLinRegNN.train(X_data=X_train, y_label=y_train,
                               **train_settings)

In [None]:
# Evaluate the trained model on the data it was fitted on (training split);
# compare with the test-split score to spot over-fitting
MultiLinRegNN.evaluate(X_data=X_train, y_label=y_train)

In [None]:
# Evaluate the trained model on the held-out test split (data not passed to
# `train`)
MultiLinRegNN.evaluate(X_data=X_test, y_label=y_test)

In [None]:
# Model's weights after training (raw values, i.e. before undoing the input
# normalization; the following cell rescales them)
MultiLinRegNN._layers[0]._W

In [None]:
# Rescale the learned weights to account for the internal std normalization
# applied to the input data during training: lay the weights out as a single
# row and divide each one by the std stored by the model for its feature.
weights_row = MultiLinRegNN._layers[0]._W.reshape(1, -1)
scaled_W = weights_row / MultiLinRegNN._in_features_std
scaled_W

In [None]:
# Compare learned weights against true coefficients using the symmetric
# relative difference |a - b| / ((|a| + |b|) / 2).
# The denominator averages the magnitudes rather than the signed values, so it
# cannot collapse towards zero when an estimate has the opposite sign of its
# true coefficient (for same-sign pairs the result is unchanged).
np.abs(coefs_lin - scaled_W) / ((np.abs(coefs_lin) + np.abs(scaled_W)) / 2)

In [None]:
# Model's bias after training (raw value, i.e. before undoing the input
# normalization; the following cell rescales and shifts it)
MultiLinRegNN._layers[0]._b

In [None]:
# Correct the learned bias for the internal mean/std normalization applied to
# the input data during training: with inputs normalized as (x - mean) / std,
# the intercept in original units is the raw bias minus sum(scaled_W * mean).
raw_bias = MultiLinRegNN._layers[0]._b
mean_shift = np.sum(np.multiply(scaled_W, MultiLinRegNN._in_features_mean))
# Keep only the first entry of the corrected bias
scaled_b = (raw_bias - mean_shift)[0]
scaled_b

In [None]:
# Compare learned bias against true intercept using the symmetric relative
# difference |a - b| / ((|a| + |b|) / 2).
# The denominator averages the magnitudes rather than the signed values, so it
# cannot collapse towards zero when the estimate has the opposite sign of the
# true intercept (for same-sign values the result is unchanged).
np.abs(intercept_lin - scaled_b) / ((np.abs(intercept_lin) + np.abs(scaled_b)) / 2)

In [None]:
# Gradient descent loss function evolution during training.
# One point is drawn per entry of the loss history returned by `train`.
# Explicit Figure/Axes objects, axis labels and a title let the figure stand on
# its own; the trailing `;` hides the repr of the last call in the cell output.
fig, ax = plt.subplots()
ax.scatter(x=np.arange(0, lossHist.shape[0]), y=lossHist, c="black")
ax.set_xlabel("Index in loss history")
ax.set_ylabel("Loss")
ax.set_title("Training loss evolution");