In [1]:
import numpy as np
import torch
import tensorflow as tf

In [2]:
# Use double precision in both frameworks so the PyTorch tensors and the
# Keras layers below share the same floating-point dtype.
torch.set_default_dtype(torch.float64)
tf.keras.config.set_floatx('float64')

In [3]:
def add_to_class(Class):
    """Register functions as methods in created class.

    Intended to be used as a decorator factory: ``@add_to_class(SomeClass)``
    attaches the decorated function to ``SomeClass`` under the function's
    own ``__name__``.

    Args:
        Class: The class that receives the decorated function as an attribute.

    Returns:
        A decorator that registers its argument on ``Class`` and hands the
        same object back.
    """
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
        # Return the function so the decorated module-level name stays bound
        # to it instead of being silently replaced by None.
        return obj
    return wrapper

# Create dataset

Dataset
$$
\begin{align*}
\mathbf{X} &\in \mathbb{R}^{M \times N} \\
\mathbf{y} &\in \mathbb{R}^{M}
\end{align*}
$$

In [4]:
from sklearn.datasets import make_regression
import random

M: int = 100 #number of samples
N: int = 4 #number of features
SEED: int = 0 #fixes the synthetic data so a fresh kernel reproduces it

# A private generator keeps the draw reproducible without touching the
# global state of the `random` module.
TRUE_B = random.Random(SEED).random()

# N-1 informative features: one of the N true coefficients is zero.
# coef=True makes make_regression also return the ground-truth weights.
X, Y, TRUE_W = make_regression(n_samples=M, n_features=N, n_targets=1,
                               n_informative=N-1, bias=TRUE_B, noise=1, coef=True,
                               random_state=SEED)

print(X.shape)
print(Y.shape)
print(TRUE_W.shape)

(100, 4)
(100,)
(4,)


# Model

## weights

Trainable parameters
$$
\begin{align*}
\mathbf{w} &\in \mathbb{R}^{N} \\
b &\in \mathbb{R}
\end{align*}
$$

In [5]:
class SimpleLinearRegression:
    """Linear regression with an L2 penalty, stored as plain tensors.

    Attributes set by the constructor:
        w: Weight tensor with ``num_features`` entries.
        b: Bias tensor with a single entry.
        lambd: L2 regularization hyperparameter.
    """

    def __init__(self, num_features: int, penalty: float) -> None:
        """Draw random initial parameters and remember the penalty.

        Args:
            num_features: Number of entries in the weight vector.
            penalty: Penalty hyperparameter for L2 regularization.
        """
        # Weights are sampled before the bias (standard normal for both).
        self.w = torch.randn(num_features)
        self.b = torch.randn(1)
        self.lambd = penalty

    def copy_params(self, tf_model) -> None:
        """Overwrite ``w`` and ``b`` in place with a TensorFlow model's parameters.

        Args:
            tf_model: Object whose ``weights[0]`` is a 2-D kernel (only its
                first column is used) and whose ``weights[1]`` is the bias;
                both must expose ``.numpy()``.

        Returns:
            None
        """
        kernel = tf_model.weights[0].numpy()
        # Column 0 of the kernel becomes the flat weight vector.
        self.w.copy_(torch.tensor(kernel[:, 0]))

        bias = tf_model.weights[1].numpy()
        self.b.copy_(torch.tensor(bias))

## weighted sum

$$
\begin{align*}
\mathbf{\hat{y}}(\mathbf{X}) = \mathbf{X}\mathbf{w} + b \\
\mathbf{\hat{y}} : \mathbb{R}^{M \times N} \rightarrow 
\mathbb{R}^{M}
\end{align*}
$$

In [6]:
@add_to_class(SimpleLinearRegression)
def predict(self, x: torch.Tensor) -> torch.Tensor:
    """Compute the model output ``x w + b`` for a batch of inputs.

    Args:
        x: Input tensor of shape (n_samples, num_features).

    Returns:
        Predicted output tensor of shape (n_samples,).
    """
    weighted_sum = torch.matmul(x, self.w)
    # The single-entry bias broadcasts over all samples.
    return weighted_sum + self.b

## MSE with weight decay

Loss function: Mean Squared Error with weight decay:
$$
\begin{align*}
L(\mathbf{\hat{y}}) &= \frac{1}{M} \sum_{i=1}^{M}(
    \hat{y}_i - y_i)^{2} 
+ \lambda \sum_{j=1}^{N} w_{j}^{2} \\
L &: \mathbb{R}^{M} \rightarrow \mathbb{R}
\end{align*}
$$
Vectorized form:
$$
\begin{align*}
L(\mathbf{\hat{y}}) &= \frac{1}{M} 
    \left\| \mathbf{e} \right\|_{2}^2 
+ \lambda \left\| \boldsymbol{w} \right\|_{2}^{2} \\
&= \frac{1}{M} (\mathbf{e}^T \mathbf{e}) + \lambda (\mathbf{w}^T \mathbf{w}) \\
\mathbf{e} &:= \mathbf{\hat{y}} - \mathbf{y}
\end{align*}
$$
where $\lambda$ is called the regularization (or penalty) hyperparameter. <br>
**Note**: this type of weight decay is called $\mathit{L}_2$ regularization. <br>
**Remark**: the regularization term only affects $\mathbf{w}$, not $b$.

In [7]:
@add_to_class(SimpleLinearRegression)
def evaluate(self, x: torch.Tensor, y_true: torch.Tensor) -> float:
    """Mean squared error of the model's predictions on ``x``.

    Args:
        x: Input tensor of shape (n_samples, num_features).
        y_true: Target tensor of shape (n_samples,).

    Returns:
        The sum of squared residuals divided by ``len(y_true)``, as a
        Python float (no regularization term).
    """
    residual = self.predict(x) - y_true
    squared_error_total = residual.pow(2).sum().item()
    return squared_error_total / len(y_true)

@add_to_class(SimpleLinearRegression)
def weight_decay_loss(self, x: torch.Tensor, y_true: torch.Tensor) -> float:
    """MSE on ``(x, y_true)`` plus the L2 penalty on the weights.

    Args:
        x: Input tensor of shape (n_samples, num_features).
        y_true: Target tensor of shape (n_samples,).

    Returns:
        ``evaluate(x, y_true) + lambd * sum(w**2)`` as a Python float.
        The bias is not part of the penalty.
    """
    mse = self.evaluate(x, y_true)
    squared_weight_norm = self.w.pow(2).sum().item()
    return mse + self.lambd * squared_weight_norm

# Gradient with $\mathit{L}_{2}$ regularization

$$
\frac{\partial L}{\partial \mathbf{w}} =
\frac{\partial}{\partial \mathbf{w}} \left(
    {\color{cyan} \frac{1}{M} (\mathbf{e}^T \mathbf{e})}
\right) 
+ \frac{\partial}{\partial \mathbf{w}} \left(
    {\color{orange} \lambda (\mathbf{w}^T \mathbf{w})}
\right) \\
$$
where the ${\color{cyan} \text{cyan part}}$ is the derivative of the original MSE loss function and the ${\color{orange} \text{orange part}}$ is the derivative of the regularizer. <br>
Therefore
$$
\frac{\partial}{\partial \mathbf{w}} \left(
    \frac{1}{M} (\mathbf{e}^T \mathbf{e})
\right) = \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \mathbf{X}
$$
and
$$
\frac{\partial}{\partial \mathbf{w}} \left(
    \lambda (\mathbf{w}^T \mathbf{w})
\right) = 2\lambda \mathbf{w}
$$

## summary

$$
\frac{\partial L}{\partial \mathbf{w}} =
{\color{cyan} \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \mathbf{X}}
+ {\color{orange} 2\lambda \mathbf{w}}
$$
**Note**: the gradient still satisfies $\frac{\partial L}{\partial \mathbf{w}} \in \mathbb{R}^{N}$.

## Parameters update

$$
\begin{align}
\mathbf{w} &\leftarrow \mathbf{w} - \eta \nabla_{\mathbf{w}} L =
\mathbf{w} - \eta \left(
    \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \mathbf{X}
    + 2\lambda \mathbf{w}
\right) \\
b &\leftarrow b - \eta \nabla_{b} L =
b - \eta \left(
    \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \boldsymbol{1}
\right)
\end{align}
$$

In [8]:
@add_to_class(SimpleLinearRegression)
def update(self, x: torch.Tensor, y_true: torch.Tensor,
           y_pred: torch.Tensor, lr: float) -> None:
    """Apply one gradient-descent step to ``w`` and ``b`` in place.

    Args:
        x: Input tensor of shape (n_samples, num_features).
        y_true: Target tensor of shape (n_samples,).
        y_pred: Predicted output tensor of shape (n_samples,).
        lr: Learning rate.
    """
    n_samples = len(y_true)
    # d(MSE)/d(y_pred): shared by the bias and weight gradients.
    scaled_residual = 2 * (y_pred - y_true) / n_samples

    grad_b = scaled_residual.sum()
    # Data term plus the derivative of lambd * ||w||^2; the bias has no penalty.
    grad_w = torch.matmul(scaled_residual, x) + 2 * self.lambd * self.w

    self.b -= lr * grad_b
    self.w -= lr * grad_w

## Fit (Gradient descent)

In [9]:
@add_to_class(SimpleLinearRegression)
def fit(self, x_train: torch.Tensor, y_train: torch.Tensor,
        epochs: int, lr: float, batch_size: int,
        x_valid: torch.Tensor, y_valid: torch.Tensor) -> None:
    """Train with mini-batch gradient descent, logging the validation loss.

    After every epoch the regularized loss on the validation data is
    appended to ``self.loss_fit`` and printed rounded to 4 decimals.

    Args:
        x_train: Input tensor of shape (n_samples, num_features).
        y_train: Target tensor of shape (n_samples,).
        epochs: Number of passes over the training data.
        lr: Learning rate.
        batch_size: Number of samples per mini-batch; the last batch of an
            epoch may be smaller.
        x_valid: Input tensor of shape (n_valid_samples, num_features).
        y_valid: Target tensor of shape (n_valid_samples,).
    """
    self.loss_fit = []
    n_train = len(y_train)
    for epoch in range(epochs):
        # Batches are consecutive slices, visited in order (no shuffling).
        for start in range(0, n_train, batch_size):
            stop = start + batch_size
            x_batch = x_train[start:stop]
            y_batch = y_train[start:stop]
            self.update(x_batch, y_batch, self.predict(x_batch), lr)

        valid_loss = self.weight_decay_loss(x_valid, y_valid)
        self.loss_fit.append(valid_loss)
        loss_v = round(valid_loss, 4)
        print(f'epoch: {epoch} - MSE_v: {loss_v}')

# Scratch vs TF

## Train and validation data

In [10]:
# Ordered split without shuffling: the first 85 rows train the model and
# the remaining rows (15 of the M = 100 samples) are held out for validation.
X_train, Y_train = torch.tensor(X[:85]), torch.tensor(Y[:85])
X_valid, Y_valid = torch.tensor(X[85:]), torch.tensor(Y[85:])

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

torch.Size([85, 4]) torch.Size([85])
torch.Size([15, 4]) torch.Size([15])


## Hyperparameters

In [11]:
# Shared by the Keras model and the scratch model so both runs are comparable.
LR = 0.001  # learning rate of the gradient-descent step
EPOCHS = 6  # number of passes over the training set
BATCH = len(X_train) // 3  # mini-batch size: 85 // 3 = 28, i.e. 4 batches per epoch (the last one partial)
LAMBD = 0.03  # L2 penalty hyperparameter (lambda)

## models

### TF model

In [12]:
# Single-unit linear Dense layer, i.e. y = Xw + b, with an L2 regularizer
# on the kernel only (no bias regularizer is set).
TFModel = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1, 
                          activation='linear',
                          kernel_regularizer = tf.keras.regularizers.L2(
                              l2=LAMBD
                          ))
])

# Plain SGD on MSE, using the same learning rate as the scratch model.
TFModel.compile(
    loss = tf.keras.losses.MSE,
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR)
)

# NOTE(review): presumably this one-sample evaluation only serves to build
# the layer so that TFModel.weights is populated before copy_params reads
# it — confirm.
TFModel.evaluate(X[:1], Y[:1])

TFModel.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 635ms/step - loss: 855.0676


### Our model

In [13]:
# Start the scratch model from the Keras model's parameters: copy_params
# overwrites the random initial w and b in place.
model = SimpleLinearRegression(N, LAMBD)
model.copy_params(TFModel)

## Comparisons

In [14]:
def error(tensor_true, tensor_pred) -> float:
    """Return the mean relative error, in percent, of a prediction.

    The computation is ``mean(|true - pred| / |true|) * 100`` and is
    dispatched on the argument types:

    * two Python numbers (``float``/``int``): scalar relative error;
    * NumPy array and PyTorch tensor: the array may be 1-D, or have several
      axes, in which case only column 0 is compared (e.g. a kernel of shape
      (n, 1) against a flat weight vector of shape (n,));
    * anything else is treated as a pair of PyTorch tensors.

    Args:
        tensor_true: The reference tensor, array or number.
        tensor_pred: The predicted tensor or number.

    Returns:
        The percentage error as a Python float. A zero entry in
        ``tensor_true`` is not guarded against and yields a non-finite value.
    """
    if isinstance(tensor_true, (float, int)) and isinstance(tensor_pred, (float, int)):
        return float(np.abs(tensor_true - tensor_pred) / np.abs(tensor_true) * 100)

    if isinstance(tensor_true, np.ndarray) and isinstance(tensor_pred, torch.Tensor):
        # Keep the original column-0 behaviour for 2-D input, but also accept
        # flat arrays instead of failing on the `[:, 0]` index.
        reference = tensor_true[:, 0] if tensor_true.ndim > 1 else tensor_true
        # detach() allows tensors that track gradients; cpu() is a no-op on CPU.
        predicted = tensor_pred.detach().cpu().numpy()
        e = np.abs(reference - predicted) / np.abs(reference)
        return float(np.mean(e) * 100)

    e = torch.abs(tensor_true - tensor_pred) / torch.abs(tensor_true)
    # .item() converts the 0-d tensor to the float promised by the signature.
    return (torch.mean(e) * 100).item()

### predict

In [15]:
# Forward pass of both models on the full training set; Keras receives the
# whole set as a single batch. The cell displays the mean relative
# difference between the two outputs, in percent.
tf_predict = TFModel.predict(X_train, batch_size=len(X_train))
predict = model.predict(X_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step


1.2794064997109579e-14

### MSE without regularizer

In [16]:
# Keras' reported loss vs. the scratch model's plain MSE (no penalty term).
# NOTE(review): the Keras loss printed below equals the one in the
# "MSE with regularizer" cell, whose error against weight_decay_loss is 0.0,
# so TFModel.evaluate appears to include the L2 penalty; that would explain
# the small non-zero error here — confirm.
tf_predict = TFModel.evaluate(X_train, Y_train, batch_size=len(X_train))
predict = model.evaluate(X_train, Y_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - loss: 8026.1792


0.0004926256535809091

### MSE with regularizer

In [17]:
# Keras' reported loss vs. the scratch model's MSE plus lambd * sum(w**2).
tf_predict = TFModel.evaluate(X_train, Y_train, batch_size=len(X_train))
predict = model.weight_decay_loss(X_train, Y_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 445ms/step - loss: 8026.1792


0.0

### fit

In [18]:
# shuffle=False keeps the samples in their original order, matching the
# sequential batch slicing done by SimpleLinearRegression.fit.
TFModel.fit(X_train, Y_train, batch_size=BATCH, epochs=EPOCHS,
            shuffle=False, validation_data=(X_valid, Y_valid))

Epoch 1/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 7540.0088 - val_loss: 13256.6025
Epoch 2/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 7465.1602 - val_loss: 13135.8115
Epoch 3/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 7391.1411 - val_loss: 13016.1553
Epoch 4/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 7317.9414 - val_loss: 12897.6221
Epoch 5/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 7245.5508 - val_loss: 12780.2021
Epoch 6/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 7173.9590 - val_loss: 12663.8848


<keras.src.callbacks.history.History at 0x1a769cea050>

In [19]:
# Same data, epochs, learning rate and batch size as the Keras run; fit
# prints the regularized validation loss after each epoch.
model.fit(X_train, Y_train, EPOCHS, LR, BATCH, X_valid, Y_valid)

epoch: 0 - MSE_v: 13256.6023
epoch: 1 - MSE_v: 13135.8117
epoch: 2 - MSE_v: 13016.1553
epoch: 3 - MSE_v: 12897.6225
epoch: 4 - MSE_v: 12780.2025
epoch: 5 - MSE_v: 12663.8848


In [20]:
# Trained Keras kernel vs. the scratch weight vector after fitting
# (mean relative difference in percent).
tf_predict = TFModel.weights[0].numpy()
predict = model.w

error(tf_predict, predict)

4.4091358617918735e-15

In [21]:
# Trained Keras bias vs. the scratch bias, both reduced to scalars so that
# error() takes its plain-number branch.
tf_predict = TFModel.weights[1].numpy()[0]
predict = model.b.item()

error(tf_predict, predict)

0.0

# Different $\lambda$ cases

## creating 3 models with same parameters initialization

Models
1. Underfitting (Excessive $\lambda$)
2. Appropriate weight decay (Medium $\lambda$)
3. Overfitting ($\lambda \rightarrow 0$)