In [1]:
import torch
import tensorflow as tf
import numpy as np

tf.keras.config.set_floatx('float64')
torch.set_default_dtype(torch.float64)

In [2]:
def add_to_class(Class):  
    """Register functions as methods in created class."""
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
    return wrapper

# Create model

## Data
$$
\mathbf{X} \in \mathbb{R}^{M,N} \\
\mathbf{y} \in \mathbb{R}^{M}
$$

In [3]:
from sklearn.datasets import make_regression
import random

M, N, U = 100, 4, 1
TRUE_B = random.random()
X, Y, TRUE_W = make_regression(n_samples=M, n_features=N, n_targets=U, n_informative=N-1, bias=TRUE_B, noise=1, coef=True)

if U == 1: Y = Y.reshape((-1, 1))

print(X.shape)
print(Y.shape)

print(TRUE_B)
print(TRUE_W.shape)

(100, 4)
(100, 1)
0.32968640688761863
(4,)


## weights
Parameters trainables:
$$
\mathbf{w} \in \mathbb{R}^{N} \\
b \in \mathbb{R}
$$

In [4]:
class SimpleLinearRegression:
    def __init__(self, number_features : int):
        self.w = torch.randn(number_features, 1)
        self.b = torch.randn(1)

    def copyParams(self, tf_model):
        self.w.copy_(torch.tensor(tf_model.weights[0].numpy()))
        self.b.copy_(torch.tensor(tf_model.weights[1].numpy()))
        print('Done')

## weigthed sum:
$$
\mathbf{\hat{y}} \left( \mathbf{X} \right) = \mathbf{X} \mathbf{w} + b\\
\mathbf{\hat{y}} : \mathbb{R}^{M,N} \rightarrow \mathbb{R}^{M}
$$

In [5]:
@add_to_class(SimpleLinearRegression)
def predict(self, x):
    return torch.tensor(x) @ self.w + self.b

## MSE
Loss function: Mean Squared Error
$$
L\left ( \mathbf{\hat{y}} \right ) = \frac{1}{M} \sum_{i=1}^{M} \left ( \mathbf{\hat{y}}_i - \mathbf{y}_i \right )^{2} + \lambda \left \| \boldsymbol{w}\right \|_2^2 \\
L : \mathbb{R}^{M} \rightarrow \mathbb{R}
$$

Vectorized form:
$$
\begin{align}
L\left ( \mathbf{\hat{y}} \right ) &= \frac{1}{M} \left ( \mathbf{e}^{T} \mathbf{e} \right )  + \lambda \left \| \boldsymbol{w} \right \|_2^2\\
\mathbf{e} &:= \mathbf{\hat{y}} - \mathbf{y}
\end{align}
$$

Note: $\lambda$ is called "Hyperparameters".

In [6]:
@add_to_class(SimpleLinearRegression)
def __evaluate__(self, x, y_true):
    # original loss
    e = self.predict(x) - torch.tensor(y_true)
    return ((e.T @ e) / e.numel()).item()

@add_to_class(SimpleLinearRegression)
def evaluate(self, x, y_true):
    # new loss
    return (self.__evaluate__(x, y_true) + self.lambd * (self.w**2).sum()).item()

## Gradient
$$
\frac{\partial L}{\partial \boldsymbol{w}} = 
\frac{\partial }{\partial \boldsymbol{w}} \left ({\color{Red} \frac{1}{M} (\boldsymbol{e}^T \boldsymbol{e})} \right)
+ \frac{\partial }{\partial \boldsymbol{w}} \left( {\color{Blue} \lambda \left \| \boldsymbol{w} \right \|_2^2} \right)
$$
where the ${\color{Red} \text{red}}$ part is the original MSE loss function and the ${\color{Blue} \text{blue}}$ is the regularizer.

Therefore:
$$
\frac{\partial }{\partial \boldsymbol{w}} \left ( \frac{1}{M} (\boldsymbol{e}^T \boldsymbol{e}) \right) = \frac{2}{M} \mathbf{X}^{T} \left( \mathbf{\hat{y}} - \mathbf{y} \right)
$$

and:
$$
\frac{\partial }{\partial \boldsymbol{w}} \left( \lambda \left \| \boldsymbol{w} \right \|_2^2 \right) = 2 \lambda \boldsymbol{w}
$$

## Update weights

$$
\begin{align}
\mathbf{w} & \leftarrow \mathbf{w} - \alpha \nabla_{\mathbf{w}} L = \boldsymbol{w} - \alpha \left( \frac{2}{M} \mathbf{X}^{T} \left( \mathbf{\hat{y}} - \mathbf{y} \right) + 2 \lambda \boldsymbol{w} \right)\\
b & \leftarrow b - \alpha \nabla_{b} L = b - \alpha \left(\frac{2}{M} \mathbf{1} \left( \mathbf{\hat{y}} - \mathbf{y} \right) \right)
\end{align}
$$

In [7]:
@add_to_class(SimpleLinearRegression)
def update(self, x, y_true, y_pred, lr : float):
    m = len(y_true)
    e = y_pred - y_true
    self.w -= lr * (2 / m * (x.T @ e) + (2 * self.lambd * self.w))
    self.b -= lr * 2 / m * (torch.ones_like(y_true).T @ e)[0]

In [8]:
@add_to_class(SimpleLinearRegression)
def fit(self, x_train, y_train, epochs : int, lr : float, batch_size : int, x_valid, y_valid):
    self.loss_history = list()
    self.loss_v_history = list()
    for i in range(epochs):
        for batch in range(0, len(x_train), batch_size):
            x_t = torch.tensor(x_train[batch:batch+batch_size])
            y_t = torch.tensor(y_train[batch:batch+batch_size])

            y_p = self.predict(x_t)
            
            self.update(x_t, y_t, y_p, lr)

        self.loss_history.append(self.evaluate(x_train, y_train))
        self.loss_v_history.append(self.evaluate(x_valid, y_valid))
        print('iter: {} - MSE: {} - MSEv: {}'.format(i, round(self.loss_history[-1],4), round(self.loss_v_history[-1],4)))

# Model

### Create train, validation data

In [9]:
X_train, Y_train = X[:85], Y[:85]
X_valid, Y_valid = X[85:], Y[85:]

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

(85, 4) (85, 1)
(15, 4) (15, 1)


## Tensorflow model

In [10]:
LAMBD = 0.03
LR = 0.01
EPOCHS = 5
BATCH = len(X_train) // 3
'''
BATCH <- M : [x_1, ..., x_M]
BATCH <- a : [x_1, ..., x_a]
'''

TFModel = tf.keras.Sequential([
    tf.keras.layers.Dense(units=U, 
                          activation='linear',
                          kernel_regularizer = tf.keras.regularizers.L2(l2=LAMBD)),
])

TFModel.compile(
    loss = tf.keras.losses.MSE,
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR)
)

TFModel.evaluate(X_train[:1], Y_train[:1])

TFModel.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 617ms/step - loss: 5097.5269


## My model scratch

In [11]:
MyModel = SimpleLinearRegression(N)
MyModel.copyParams(TFModel)

MyModel.lambd = LAMBD

Done


## Comparative

### Prediction

In [12]:
tf_predict = TFModel.predict(X_train, batch_size=len(X_train))
my_predict = MyModel.predict(X_train)

print(np.mean(np.abs((tf_predict - my_predict.numpy()) / tf_predict)) * 100)

del tf_predict
del my_predict

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
5.948146695248522e-15


### MSE loss function

In [13]:
tf_loss = TFModel.evaluate(X_train, Y_train, batch_size=len(X_train))
my_loss = MyModel.evaluate(X_train, Y_train)

print(np.mean(np.abs((tf_loss - my_loss) / tf_loss)) * 100)

del tf_loss
del my_loss

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - loss: 2703.1289
1.6822999702075096e-14


### Fit

In [14]:
TFModel.fit(X_train, Y_train, batch_size=BATCH, epochs=EPOCHS, shuffle=False,
            validation_data=(X_valid, Y_valid))

Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - loss: 3079.9163 - val_loss: 1843.1484
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 177ms/step - loss: 2306.4426 - val_loss: 1498.0261
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 1800.2439 - val_loss: 1250.0793
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 1457.3982 - val_loss: 1065.4913
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 1215.9447 - val_loss: 923.3408


<keras.src.callbacks.history.History at 0x1c8565096d0>

In [15]:
MyModel.fit(X_train, Y_train, EPOCHS, LR, BATCH, X_valid, Y_valid)

iter: 0 - MSE: 2124.41 - MSEv: 1843.1484
iter: 1 - MSE: 1725.384 - MSEv: 1498.0261
iter: 2 - MSE: 1439.2224 - MSEv: 1250.0794
iter: 3 - MSE: 1225.627 - MSEv: 1065.4913
iter: 4 - MSE: 1060.0559 - MSEv: 923.3408


  return torch.tensor(x) @ self.w + self.b


In [16]:
tf_new_w = TFModel.weights[0].numpy()
my_new_w = MyModel.w.numpy()

print(np.mean(np.abs((tf_new_w - my_new_w) / tf_new_w)) * 100)

del tf_new_w
del my_new_w

tf_new_b = TFModel.weights[1].numpy()
my_new_b = MyModel.b.numpy()

print(np.mean(np.abs((tf_new_b - my_new_b) / tf_new_b)) * 100)

del tf_new_b
del my_new_b

1.2040176509456703e-14
0.0


# Full train

## $\lambda \to 0$

In [17]:
model1 = SimpleLinearRegression(N)

model1.lambd = 0

print(model1.b)
print(model1.w)

B_copy = model1.b.clone()
W_copy = model1.w.clone()

tensor([-0.5703])
tensor([[ 1.0227],
        [ 0.7532],
        [ 0.1319],
        [-0.0224]])


In [18]:
model1.fit(X_train, Y_train, 15, 0.01, 1, X_valid, Y_valid)

  return torch.tensor(x) @ self.w + self.b


iter: 0 - MSE: 87.785 - MSEv: 66.8598
iter: 1 - MSE: 4.3427 - MSEv: 3.3519
iter: 2 - MSE: 1.1225 - MSEv: 0.7832
iter: 3 - MSE: 0.9854 - MSEv: 0.5888
iter: 4 - MSE: 0.9791 - MSEv: 0.5529
iter: 5 - MSE: 0.9788 - MSEv: 0.5426
iter: 6 - MSE: 0.9788 - MSEv: 0.5393
iter: 7 - MSE: 0.9788 - MSEv: 0.5382
iter: 8 - MSE: 0.9788 - MSEv: 0.5378
iter: 9 - MSE: 0.9788 - MSEv: 0.5377
iter: 10 - MSE: 0.9788 - MSEv: 0.5376
iter: 11 - MSE: 0.9788 - MSEv: 0.5376
iter: 12 - MSE: 0.9788 - MSEv: 0.5376
iter: 13 - MSE: 0.9788 - MSEv: 0.5376
iter: 14 - MSE: 0.9788 - MSEv: 0.5376


In [19]:
my_new_w = model1.w.numpy()

print(np.mean(np.abs((TRUE_W - my_new_w + 1e-16) / (TRUE_W + 1e-16))) * 100)

del my_new_w

my_new_b = model1.b.numpy()

print(np.mean(np.abs((TRUE_B - my_new_b) / TRUE_B)) * 100)

del my_new_b

4.59282878003542e+18
7.2653245400988515


In [20]:
model2 = SimpleLinearRegression(N)

model2.lambd = 3

model2.b.copy_(B_copy)
model2.w.copy_(W_copy)

print(model2.b)
print(model2.w)

tensor([-0.5703])
tensor([[ 1.0227],
        [ 0.7532],
        [ 0.1319],
        [-0.0224]])


In [21]:
model2.fit(X_train, Y_train, 15, 0.01, 1, X_valid, Y_valid)

  return torch.tensor(x) @ self.w + self.b


iter: 0 - MSE: 1346.354 - MSEv: 1885.6678
iter: 1 - MSE: 1352.1553 - MSEv: 1891.811
iter: 2 - MSE: 1353.4237 - MSEv: 1892.919
iter: 3 - MSE: 1353.6516 - MSEv: 1893.1185
iter: 4 - MSE: 1353.6921 - MSEv: 1893.154
iter: 5 - MSE: 1353.6993 - MSEv: 1893.1603
iter: 6 - MSE: 1353.7006 - MSEv: 1893.1614
iter: 7 - MSE: 1353.7008 - MSEv: 1893.1616
iter: 8 - MSE: 1353.7009 - MSEv: 1893.1617
iter: 9 - MSE: 1353.7009 - MSEv: 1893.1617
iter: 10 - MSE: 1353.7009 - MSEv: 1893.1617
iter: 11 - MSE: 1353.7009 - MSEv: 1893.1617
iter: 12 - MSE: 1353.7009 - MSEv: 1893.1617
iter: 13 - MSE: 1353.7009 - MSEv: 1893.1617
iter: 14 - MSE: 1353.7009 - MSEv: 1893.1617


In [22]:
my_new_w = model2.w.numpy()

print(np.mean(np.abs((TRUE_W - my_new_w + 1e-16) / (TRUE_W + 1e-16))) * 100)

del my_new_w

my_new_b = model2.b.numpy()

print(np.mean(np.abs((TRUE_B - my_new_b) / TRUE_B)) * 100)

del my_new_b

1.3985397564566083e+18
1154.1058337864947


In [23]:
model3 = SimpleLinearRegression(N)

model3.lambd = 0.001

model3.b.copy_(B_copy)
model3.w.copy_(W_copy)

print(model3.b)
print(model3.w)

tensor([-0.5703])
tensor([[ 1.0227],
        [ 0.7532],
        [ 0.1319],
        [-0.0224]])


In [24]:
model3.fit(X_train, Y_train, 15, 0.01, 1, X_valid, Y_valid)

  return torch.tensor(x) @ self.w + self.b


iter: 0 - MSE: 88.2303 - MSEv: 69.1165
iter: 1 - MSE: 4.484 - MSEv: 6.0747
iter: 2 - MSE: 1.1522 - MSEv: 3.5678
iter: 3 - MSE: 0.9905 - MSEv: 3.3842
iter: 4 - MSE: 0.9793 - MSEv: 3.3506
iter: 5 - MSE: 0.9781 - MSEv: 3.3409
iter: 6 - MSE: 0.978 - MSEv: 3.3378
iter: 7 - MSE: 0.9779 - MSEv: 3.3367
iter: 8 - MSE: 0.9779 - MSEv: 3.3363
iter: 9 - MSE: 0.9779 - MSEv: 3.3362
iter: 10 - MSE: 0.9779 - MSEv: 3.3362
iter: 11 - MSE: 0.9779 - MSEv: 3.3362
iter: 12 - MSE: 0.9779 - MSEv: 3.3362
iter: 13 - MSE: 0.9779 - MSEv: 3.3362
iter: 14 - MSE: 0.9779 - MSEv: 3.3362


In [25]:
my_new_w = model3.w.numpy()

print(np.mean(np.abs((TRUE_W - my_new_w + 1e-16) / (TRUE_W + 1e-16))) * 100)

del my_new_w

my_new_b = model3.b.numpy()

print(np.mean(np.abs((TRUE_B - my_new_b) / TRUE_B)) * 100)

del my_new_b

4.5878620720961987e+18
8.676180964243684
