In [1]:
import torch
import tensorflow as tf
import numpy as np

tf.keras.config.set_floatx('float64')
torch.set_default_dtype(torch.float64)

In [2]:
def add_to_class(Class):  
    """Register functions as methods in created class."""
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
    return wrapper

# Create model

## Data
$$
\mathbf{X} \in \mathbb{R}^{M,N} \\
\mathbf{y} \in \mathbb{R}^{M}
$$

In [26]:
from sklearn.datasets import make_regression
import random

M, N, U = 100, 4, 1
TRUE_B = random.random()
X, Y, TRUE_W = make_regression(n_samples=M, n_features=N, n_targets=U, n_informative=N-1, bias=TRUE_B, noise=1, coef=True)

if U == 1: Y = Y.reshape((-1, 1))

print(X.shape)
print(Y.shape)

print(TRUE_B)
print(TRUE_W.shape)

(100, 4)
(100, 1)
0.43370068789684035
(4,)


## weights
Parameters trainables:
$$
\mathbf{w} \in \mathbb{R}^{N} \\
b \in \mathbb{R}
$$

In [27]:
class SimpleLinearRegression:
    def __init__(self, number_features : int):
        self.w = torch.randn(number_features, 1)
        self.b = torch.randn(1)

    def copyParams(self, tf_model):
        self.w.copy_(torch.tensor(tf_model.weights[0].numpy()))
        self.b.copy_(torch.tensor(tf_model.weights[1].numpy()))
        print('Done')

## weigthed sum:
$$
\mathbf{\hat{y}} \left( \mathbf{X} \right) = \mathbf{X} \mathbf{w} + b\\
\mathbf{\hat{y}} : \mathbb{R}^{M,N} \rightarrow \mathbb{R}^{M}
$$

In [28]:
@add_to_class(SimpleLinearRegression)
def predict(self, x):
    return torch.tensor(x) @ self.w + self.b

## MSE
Loss function: Mean Squared Error
$$
L\left ( \mathbf{\hat{y}} \right ) = \frac{1}{M} \sum_{i=1}^{M} \left ( \mathbf{\hat{y}}_i - \mathbf{y}_i \right )^{2} \\
L : \mathbb{R}^{M} \rightarrow \mathbb{R}
$$

Vectorized form:
$$
\begin{align}
L\left ( \mathbf{\hat{y}} \right ) &= \frac{1}{M} \left ( \mathbf{e}^{T} \mathbf{e} \right ) \\
\mathbf{e} &:= \mathbf{\hat{y}} - \mathbf{y}
\end{align}
$$

In [29]:
@add_to_class(SimpleLinearRegression)
def evaluate(self, x, y_true):
    e = self.predict(x) - torch.tensor(y_true)
    return ((e.T @ e) / e.numel()).item()

## Gradient
Gradient Descent: **Denomitator layout notation**
$$
\frac{\partial L}{\partial \mathbf{w}} = 
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}}
\frac{\partial L}{\partial \mathbf{\hat{y}}}
$$

and
$$
\frac{\partial L}{\partial b} = 
\frac{\partial \mathbf{\hat{y}}}{\partial b}
\frac{\partial L}{\partial \mathbf{\hat{y}}}
$$

where their shapes are:
$$
\frac{\partial L}{\partial \mathbf{w}} \in \mathbb{R}^{N},
\frac{\partial L}{\partial b} \in \mathbb{R},
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} \in \mathbb{R}^{N,M}, 
\frac{\partial \mathbf{\hat{y}}}{\partial b} \in \mathbb{R}^{1,M},
\frac{\partial L}{\partial \mathbf{\hat{y}}} \in \mathbb{R}^{M}
$$

### weigthed sum derivative
$$
\begin{align}
\mathbf{\hat{y}} &= \mathbf{X} \mathbf{w} + b \\
&=
\begin{bmatrix}
\mathbf{x}_1^{T} \\ 
\mathbf{x}_2^{T} \\ 
\vdots  \\ 
\mathbf{x}_{M}^{T}
\end{bmatrix}
\mathbf{w} + b \\
&=
\begin{bmatrix}
\mathbf{x}_1^{T} \mathbf{w} + b \\ 
\mathbf{x}_2^{T} \mathbf{w} + b \\ 
\vdots  \\ 
\mathbf{x}_{M}^{T} \mathbf{w} + b
\end{bmatrix}
\end{align}
$$

where
$$
\mathbf{x}_{i}^T = \begin{bmatrix}
x_{i,1} & x_{i,2} & \cdots & x_{i,N}
\end{bmatrix}
$$

$
\frac{\partial \mathbf{\hat{y}}}{\partial b}
$

$$
\begin{align}
\frac{\partial \mathbf{\hat{y}}}{\partial b} &=
\begin{bmatrix}
\frac{\partial \hat{y}_1}{\partial b} & \frac{\partial \hat{y}_2}{\partial b} & \cdots & \frac{\partial \hat{y}_M}{\partial b} \\
\end{bmatrix} \\
&= \mathbf{1} \in \mathbb{R}^{1,M}
\end{align}
$$


$
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}}
$

$$
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} =
\begin{bmatrix}
\frac{\partial \hat{y}_1}{\partial w_1} & \frac{\partial \hat{y}_2}{\partial w_1} & \cdots & \frac{\partial \hat{y}_M}{\partial w_1} \\
\frac{\partial \hat{y}_1}{\partial w_2} & \frac{\partial \hat{y}_2}{\partial w_2} & & \frac{\partial \hat{y}_M}{\partial w_2} \\
\vdots  &  & \ddots  & \vdots \\ 
\frac{\partial \hat{y}_1}{\partial w_N} & \frac{\partial \hat{y}_2}{\partial w_N} & \cdots  & \frac{\partial \hat{y}_M}{\partial w_N}
\end{bmatrix}
$$

wherer
$$
\begin{align}
\frac{\partial \hat{y}_i}{\partial w_j} &=
\frac{\partial }{\partial w_j}\left ( x_{i,1} w_{1} + x_{i,2} w_{2} + \cdots + x_{i,j} w_{j} + \cdots + x_{i,N} w_{N} \right ) \\
&= x_{i,j}
\end{align}
$$

then:
$$
\begin{align}
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} &=
\begin{bmatrix}
x_{1,1} & x_{2,1} & \cdots & x_{M,1} \\
x_{1,2} & x_{2,2} & & x_{M,2} \\
\vdots  &  & \ddots  & \vdots \\ 
x_{1,N} & x_{2,N} & \cdots  & x_{M,N}
\end{bmatrix} \\
&= \mathbf{X}^T
\end{align}
$$

### MSE derivavite:
$$
\frac{\partial L}{\partial \mathbf{\hat{y}}} =
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}}
\frac{\partial L}{\partial \mathbf{e}}
$$

where:
$$
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}} \in \mathbb{R}^{M,M}, 
\frac{\partial L}{\partial \mathbf{e}} \in \mathbb{R}^{M}
$$

$
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}}
$
$$
\begin{align}
\mathbf{e} &= 
\begin{bmatrix}
\hat{y}_1 - y_1 \\ 
\hat{y}_2 - y_2 \\ 
\vdots  \\ 
\hat{y}_M - y_M
\end{bmatrix} \\

\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}} &=
\begin{bmatrix}
\frac{\partial e_1}{\partial \hat{y}_1} & \frac{\partial e_2}{\partial \hat{y}_1} & \cdots & \frac{\partial e_M}{\partial \hat{y}_1} \\
\frac{\partial e_1}{\partial \hat{y}_2} & \frac{\partial e_2}{\partial \hat{y}_2} & & \frac{\partial e_M}{\partial \hat{y}_2} \\ 
\vdots  &  & \ddots  & \vdots \\ 
\frac{\partial e_1}{\partial \hat{y}_M} & \frac{\partial e_2}{\partial \hat{y}_M} & \cdots & \frac{\partial e_M}{\partial \hat{y}_M}
\end{bmatrix} \\
&= \boldsymbol{I} \in \mathbb{R}^{M,M}
\end{align}
$$

$
\frac{\partial L}{\partial \mathbf{e}}
$
$$
\begin{align}
L &= \frac{1}{M} \left ( e_{1}^{2} + e_{2}^{2}+ \cdots + e_{M}^{2}\right ) \\
\frac{\partial L}{\partial \mathbf{e}} &= 
\begin{bmatrix}
\frac{\partial L}{\partial e_1} \\ 
\frac{\partial L}{\partial e_2} \\ 
\vdots \\ 
\frac{\partial L}{\partial e_M}
\end{bmatrix} \\
&= \frac{2}{M}
\begin{bmatrix}
e_1 \\ 
e_2 \\ 
\vdots \\ 
e_M
\end{bmatrix}
= \frac{2}{M} \mathbf{e}
\end{align}
$$

$
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}}
\frac{\partial L}{\partial \mathbf{e}}
$
$$
\frac{\partial L}{\partial \mathbf{\hat{y}}} =
\frac{2}{M} \boldsymbol{I} \mathbf{e} =
\frac{2}{M} \mathbf{e} =
\frac{2}{M} \left( \mathbf{\hat{y}} - \mathbf{y} \right)
$$

### summary

$
\frac{\partial L}{\partial \mathbf{w}}
$
$$
\frac{\partial L}{\partial \mathbf{w}} = \frac{2}{M} \mathbf{X}^{T} \left( \mathbf{\hat{y}} - \mathbf{y} \right)
$$

$
\frac{\partial L}{\partial b}
$
$$
\frac{\partial L}{\partial b} = \frac{2}{M} \mathbf{1} \left( \mathbf{\hat{y}} - \mathbf{y} \right)
$$

## Update weights

$$
\begin{align}
\mathbf{w} & \leftarrow \mathbf{w} - \alpha \nabla_{\mathbf{w}} L = \boldsymbol{w} - \alpha \left( \frac{2}{M} \mathbf{X}^{T} \left( \mathbf{\hat{y}} - \mathbf{y} \right) \right)\\
b & \leftarrow b - \alpha \nabla_{b} L = b - \alpha \left(\frac{2}{M} \mathbf{1} \left( \mathbf{\hat{y}} - \mathbf{y} \right) \right)
\end{align}
$$

In [30]:
@add_to_class(SimpleLinearRegression)
def update(self, x, y_true, y_pred, lr : float):
    m = len(y_true)
    e = y_pred - y_true
    self.w -= lr * 2 / m * (x.T @ e)
    self.b -= lr * 2 / m * (torch.ones_like(y_true).T @ e)[0]

In [31]:
@add_to_class(SimpleLinearRegression)
def fit(self, x_train, y_train, epochs : int, lr : float, batch_size : int, x_valid, y_valid):
    for i in range(epochs):
        for batch in range(0, len(x_train), batch_size):
            x_t = torch.tensor(x_train[batch:batch+batch_size])
            y_t = torch.tensor(y_train[batch:batch+batch_size])

            y_p = self.predict(x_t)
            
            self.update(x_t, y_t, y_p, lr)

        loss_v = round(self.evaluate(x_valid, y_valid), 4)
        print('iter: {} - MSEv: {}'.format(i, loss_v))

# Model

### Create train, validation data

In [32]:
X_train, Y_train = X[:85], Y[:85]
X_valid, Y_valid = X[85:], Y[85:]

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

(85, 4) (85, 1)
(15, 4) (15, 1)


## Tensorflow model

In [33]:
LR = 0.001
EPOCHS = 5
BATCH = len(X_train) // 3
'''
BATCH <- M : [x_1, ..., x_M]
BATCH <- a : [x_1, ..., x_a]
'''

TFModel = tf.keras.Sequential([
    tf.keras.layers.Dense(units=U, activation='linear'),
])

TFModel.compile(
    loss = tf.keras.losses.MSE,
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR)
)

TFModel.evaluate(X[:1], Y[:1])

TFModel.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 964ms/step - loss: 6313.1553


## My model scratch

In [34]:
MyModel = SimpleLinearRegression(N)
MyModel.copyParams(TFModel)

Done


## Comparative

### Prediction

In [35]:
tf_predict = TFModel.predict(X_train, batch_size=len(X_train))
my_predict = MyModel.predict(X_train)

print(np.mean(np.abs((tf_predict - my_predict.numpy()) / tf_predict)) * 100)

del tf_predict
del my_predict

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
1.0493574959227039e-14


### MSE loss function

In [36]:
tf_loss = TFModel.evaluate(X_train, Y_train, batch_size=len(X_train))
my_loss = MyModel.evaluate(X_train, Y_train)

print(np.mean(np.abs((tf_loss - my_loss) / tf_loss)) * 100)

del tf_loss
del my_loss

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 9896.7295
1.837970199267847e-14


### Fit

In [37]:
TFModel.fit(X_train, Y_train, batch_size=BATCH, epochs=EPOCHS, shuffle=False,
            validation_data=(X_valid, Y_valid))

Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 154ms/step - loss: 8717.1377 - val_loss: 9188.6982
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 8610.5986 - val_loss: 9067.4307
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 8505.5195 - val_loss: 8947.8994
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 8401.8779 - val_loss: 8830.0781
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 8299.6514 - val_loss: 8713.9404


<keras.src.callbacks.history.History at 0x2c1e8793910>

In [38]:
MyModel.fit(X_train, Y_train, EPOCHS, LR, BATCH, X_valid, Y_valid)

iter: 0 - MSEv: 9188.698
iter: 1 - MSEv: 9067.4306
iter: 2 - MSEv: 8947.8996
iter: 3 - MSEv: 8830.0782
iter: 4 - MSEv: 8713.9402


  return torch.tensor(x) @ self.w + self.b


In [39]:
tf_new_w = TFModel.weights[0].numpy()
my_new_w = MyModel.w.numpy()

print(np.mean(np.abs((tf_new_w - my_new_w) / tf_new_w)) * 100)

del tf_new_w
del my_new_w

tf_new_b = TFModel.weights[1].numpy()
my_new_b = MyModel.b.numpy()

print(np.mean(np.abs((tf_new_b - my_new_b) / tf_new_b)) * 100)

del tf_new_b
del my_new_b

1.1937113788916998e-14
2.851870730117414e-14


# Full train

In [40]:
model = SimpleLinearRegression(N)
print(model.b)
print(model.w)

tensor([-0.5895])
tensor([[-0.0233],
        [-0.7321],
        [-0.5472],
        [ 0.2832]])


In [41]:
model.evaluate(X_valid, Y_valid)

9290.23131847044

In [42]:
model.fit(X_train, Y_train, 15, 0.01, 1, X_valid, Y_valid)

  return torch.tensor(x) @ self.w + self.b


iter: 0 - MSEv: 383.6558
iter: 1 - MSEv: 25.7435
iter: 2 - MSEv: 2.9641
iter: 3 - MSEv: 1.191
iter: 4 - MSEv: 1.0121
iter: 5 - MSEv: 0.9854
iter: 6 - MSEv: 0.9798
iter: 7 - MSEv: 0.9784
iter: 8 - MSEv: 0.9781
iter: 9 - MSEv: 0.978
iter: 10 - MSEv: 0.978
iter: 11 - MSEv: 0.978
iter: 12 - MSEv: 0.978
iter: 13 - MSEv: 0.978
iter: 14 - MSEv: 0.978


In [43]:
print(model.b)
print(model.w)

tensor([0.2503])
tensor([[85.4215],
        [49.3152],
        [35.9923],
        [ 0.1085]])


In [44]:
print(TRUE_B)
print(TRUE_W)

0.43370068789684035
[85.44456309 49.37237061 36.00096697  0.        ]


In [45]:
print(np.mean(np.abs((TRUE_B - model.w.numpy()) / TRUE_B)) * 100)
print(np.mean(np.abs((TRUE_B - model.b.numpy()) / TRUE_B)) * 100)

9785.152469275066
42.27808520735004


In [46]:
model.evaluate(X_valid, Y_valid)

0.9779600866966783