In [1]:
import torch
import tensorflow as tf

In [2]:
# Make 64-bit floats the default floating-point dtype for newly created PyTorch tensors.
torch.set_default_dtype(torch.float64)
# Set the Keras float type to 64-bit as well
# (presumably so results from both frameworks are comparable -- confirm).
tf.keras.config.set_floatx('float64')

In [3]:
def add_to_class(Class):
    """Register functions as methods in created class.

    Meant to be used as a decorator factory: ``@add_to_class(Class)``
    attaches the decorated object to ``Class`` under the object's own
    ``__name__``.

    Args:
        Class: The class that receives the new attribute.

    Returns:
        A decorator that attaches its argument to ``Class`` and returns
        that argument unchanged.
    """
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
        # Hand the object back so the decorated module-level name stays
        # bound to it instead of being rebound to None.
        return obj
    return wrapper

# Create dataset

Dataset
$$
\begin{align*}
\mathbf{X} &\in \mathbb{R}^{M \times N} \\
\mathbf{y} &\in \mathbb{R}^{M}
\end{align*}
$$

In [4]:
from sklearn.datasets import make_regression
import random

M: int = 100  # number of samples
N: int = 4  # number of features
SEED: int = 42  # fixed seed so the dataset is identical on every Restart & Run All

# Draw the ground-truth bias from a dedicated, seeded generator so that the
# global `random` state is left untouched.
TRUE_B = random.Random(SEED).random()

# Synthetic regression problem. With coef=True the ground-truth coefficients
# are returned as a third value; random_state pins the generated samples.
X, Y, TRUE_W = make_regression(n_samples=M,
                               n_features=N,
                               n_targets=1,
                               n_informative=N-1,
                               bias=TRUE_B,
                               noise=1,
                               coef=True,
                               random_state=SEED)

print(X.shape)
print(Y.shape)
print(TRUE_W.shape)

(100, 4)
(100,)
(4,)


# Model

## Weights

Trainable parameters
$$
\begin{align*}
\mathbf{w} &\in \mathbb{R}^{N} \\
b &\in \mathbb{R}
\end{align*}
$$

In [5]:
class SimpleLinearRegression:
    """Linear regression model holding a weight vector ``w`` and a bias ``b``."""

    def __init__(self, num_features: int) -> None:
        """Create randomly initialized parameters.

        Args:
            num_features: Number of input features; ``w`` gets one entry
                per feature.
        """
        # w has shape (num_features,), b has shape (1,).
        self.w = torch.randn(num_features)
        self.b = torch.randn(1)

    def copy_params(self, tf_model) -> None:
        """Copy the parameters from a TensorFlow model to this PyTorch model.

        The values are written in place into the existing ``w`` and ``b``
        tensors. Each source array is reshaped to the destination shape
        first, so a kernel stored as ``(num_features, 1)`` is accepted as
        well as a flat ``(num_features,)`` one.

        Args:
            tf_model: A TensorFlow model from which to copy the parameters.
                ``tf_model.weights[0]`` is used as the weights and
                ``tf_model.weights[1]`` as the bias; both must expose
                ``.numpy()``.

        Returns:
            None

        Raises:
            RuntimeError: If a source array does not contain the same
                number of elements as the parameter it is copied into.
        """
        kernel = torch.tensor(tf_model.weights[0].numpy())
        bias = torch.tensor(tf_model.weights[1].numpy())
        # copy_ cannot squeeze a trailing axis by itself, so align the
        # shapes explicitly before writing into the parameters.
        self.w.copy_(kernel.reshape(self.w.shape))
        self.b.copy_(bias.reshape(self.b.shape))

## weighted sum

$$
\begin{align*}
\mathbf{\hat{y}}(\mathbf{X}) = \mathbf{X}\mathbf{w} + b \\
\mathbf{\hat{y}} : \mathbb{R}^{M \times N} \rightarrow \mathbb{R}^{M}
\end{align*}
$$

In [6]:
@add_to_class(SimpleLinearRegression)
def predict(self, x: torch.Tensor) -> torch.Tensor:
    """Predict the output for input x.

    Computes the weighted sum of the inputs with ``self.w`` and adds
    ``self.b``.

    Args:
        x: Input tensor of shape (n_samples, num_features).

    Returns:
        y_pred: Predicted output tensor of shape (n_samples,).
    """
    # torch.dot only accepts two 1-D tensors; a matrix-vector product is
    # required for a 2-D batch of samples.
    y_pred = torch.matmul(x, self.w) + self.b
    return y_pred

## MSE

Loss function: Mean Squared Error:
$$
\begin{align*}
L(\mathbf{\hat{y}}) &= \frac{1}{M} \sum_{i=1}^{M}(
    \hat{y}_i - y_i)^{2} \\
L &: \mathbb{R}^{M} \rightarrow \mathbb{R}
\end{align*}
$$
Vectorized form:
$$
\begin{align*}
L(\mathbf{\hat{y}}) &= \frac{1}{M} 
    \left\| \mathbf{e} \right\|_{2}^2 \\
\mathbf{e} &:= \mathbf{\hat{y}} - \mathbf{y}
\end{align*}
$$

In [7]:
@add_to_class(SimpleLinearRegression)
def evaluate(self, x: torch.Tensor, y_true: torch.Tensor) -> float:
    """Evaluate the model on input x and target y_true.

    Args:
        x: Input tensor of shape (n_samples, num_features).
        y_true: Target tensor of shape (n_samples,).

    Returns:
        loss: Mean squared error between predictions and true values,
            i.e. the squared L2 norm of the error divided by the number
            of samples.
    """
    y_pred = self.predict(x)
    e = y_pred - y_true
    # MSE is ||e||_2^2 / M; the norm has to be squared before the division.
    loss = torch.linalg.vector_norm(e, ord=2) ** 2
    return loss.item() / len(y_true)

## Gradient

Using **Numerator layout notation**.
By the chain rule, the gradients of the loss with respect to the parameters are:
$$
\frac{\partial L}{\partial \mathbf{w}} =
\frac{\partial L}{\partial \mathbf{\hat{y}}}
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}}
$$
and
$$
\frac{\partial L}{\partial b} =
\frac{\partial L}{\partial \mathbf{\hat{y}}}
\frac{\partial \mathbf{\hat{y}}}{\partial b}
$$
where their shapes are:
$$
\frac{\partial L}{\partial \mathbf{w}} \in \mathbb{R}^{N},
\frac{\partial L}{\partial b} \in \mathbb{R},
\frac{\partial L}{\partial \mathbf{\hat{y}}} \in \mathbb{R}^{M},
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} \in \mathbb{R}^{M \times N},
\frac{\partial \mathbf{\hat{y}}}{\partial b} \in \mathbb{R}^{M}
$$

### weighted sum derivative

$$
\begin{align*}
\mathbf{\hat{y}} &= \mathbf{X} \mathbf{w} + b \\
&= \begin{bmatrix}
        \mathbf{x}_{1}^{T} \\
        \mathbf{x}_{2}^{T} \\
        \vdots \\
        \mathbf{x}_{M}^{T} \\
    \end{bmatrix} \mathbf{w} + b \\
&= \begin{bmatrix}
        \mathbf{x}_{1}^{T} \mathbf{w} + b \\
        \mathbf{x}_{2}^{T} \mathbf{w} + b \\
        \vdots \\
        \mathbf{x}_{M}^{T} \mathbf{w} + b \\
    \end{bmatrix}
\end{align*}
$$
where
$$
\mathbf{x}_{i}^{T} = \begin{bmatrix}
        x_{i1} & x_{i2} & \cdots & x_{iN}
    \end{bmatrix}
$$

#### With respect to $b$

$$
\begin{align*}
\frac{\partial \mathbf{\hat{y}}}{\partial b} &= \begin{bmatrix}
    \frac{\partial \hat{y}_{1}}{\partial b} \\
    \frac{\partial \hat{y}_{2}}{\partial b} \\
    \vdots \\
    \frac{\partial \hat{y}_{M}}{\partial b} \\
\end{bmatrix} \in \mathbb{R}^{M} \\
&= \boldsymbol{1} 
\end{align*}
$$

#### With respect to $\mathbf{w}$

$$
\begin{align*}
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} = 
\begin{bmatrix}
    \frac{\partial \hat{y}_{1}}{\partial w_{1}} &
    \frac{\partial \hat{y}_{1}}{\partial w_{2}} &
    \cdots &
    \frac{\partial \hat{y}_{1}}{\partial w_{N}} \\
    \frac{\partial \hat{y}_{2}}{\partial w_{1}} &
    \frac{\partial \hat{y}_{2}}{\partial w_{2}} &
    \cdots &
    \frac{\partial \hat{y}_{2}}{\partial w_{N}} \\
    \vdots & \vdots & \ddots & \vdots \\
    \frac{\partial \hat{y}_{M}}{\partial w_{1}} &
    \frac{\partial \hat{y}_{M}}{\partial w_{2}} &
    \cdots &
    \frac{\partial \hat{y}_{M}}{\partial w_{N}} \\
\end{bmatrix}
\end{align*}
$$
where
$$
\begin{align*}
\frac{\partial \hat{y}_{i}}{\partial w_{j}} &=
\frac{\partial}{\partial w_{j}} \left(
    x_{i1}w_{1} + x_{i2}w_{2} + 
    \cdots + x_{ij}w_{j} + \cdots +
    x_{iN}w_{N}
\right) \\
&= x_{ij}
\end{align*}
$$
therefore
$$
\begin{align*}
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} &= 
\begin{bmatrix}
    x_{11} & x_{12} & \cdots & x_{1N} \\
    x_{21} & x_{22} & \cdots & x_{2N} \\
    \vdots & \vdots & \ddots & \vdots \\
    x_{M1} & x_{M2} & \cdots & x_{MN} \\
\end{bmatrix} \\
&= \mathbf{X}
\end{align*}
$$

### MSE derivative

$$
\frac{\partial L}{\partial \mathbf{\hat{y}}} =
\frac{\partial L}{\partial \mathbf{e}}
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}}
$$
where
$$
\frac{\partial L}{\partial \mathbf{e}} \in \mathbb{R}^{M},
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}} \in \mathbb{R}^{M \times M}
$$

#### With respect to $\mathbf{e}$

$$
\begin{align*}
L &= \frac{1}{M} \left\| \mathbf{e} \right\|_{2}^{2} \\
&= \frac{1}{M} \left( \mathbf{e}^{T} \mathbf{e} \right) \\
&= \frac{1}{M} \left( e_{1}^{2} + e_{2}^{2} + \cdots + e_{M}^{2} \right)
\end{align*}
$$
then
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{e}} &= \begin{bmatrix}
    \frac{\partial L}{\partial e_{1}} &
    \frac{\partial L}{\partial e_{2}} &
    \cdots &
    \frac{\partial L}{\partial e_{M}} \\
\end{bmatrix}^*
\end{align*}
$$
\* **Remark**: In numerator layout this derivative is a row vector of shape $1 \times M$, i.e. an element of $\mathbb{R}^{1 \times M}$. The leading axis of size 1 carries no information, so we drop it and treat the result as a vector in $\mathbb{R}^{M}$:
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{e}} &= \begin{bmatrix}
    \frac{\partial L}{\partial e_{1}} &
    \frac{\partial L}{\partial e_{2}} &
    \cdots &
    \frac{\partial L}{\partial e_{M}} \\
\end{bmatrix} =
\begin{bmatrix}
    \frac{\partial L}{\partial e_{1}} \\
    \frac{\partial L}{\partial e_{2}} \\
    \vdots \\
    \frac{\partial L}{\partial e_{M}} \\
\end{bmatrix} \in \mathbb{R}^{M}
\end{align*}
$$
therefore
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{e}} &= \frac{2}{M} \begin{bmatrix}
    e_{1} \\
    e_{2} \\
    \vdots \\
    e_{M}
\end{bmatrix} \\
&= \frac{2}{M} \mathbf{e}
\end{align*}
$$

#### With respect to $\mathbf{\hat{y}}$

$$
\begin{align*}
\mathbf{e} &= \begin{bmatrix}
    \hat{y}_{1} - y_{1} \\
    \hat{y}_{2} - y_{2} \\
    \vdots \\
    \hat{y}_{M} - y_{M}
\end{bmatrix}
\end{align*}
$$
then
$$
\begin{align*}
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}} &= \begin{bmatrix}
    \frac{\partial e_{1}}{\partial \hat{y}_{1}} &
    \frac{\partial e_{1}}{\partial \hat{y}_{2}} &
    \cdots &
    \frac{\partial e_{1}}{\partial \hat{y}_{M}} \\
    \frac{\partial e_{2}}{\partial \hat{y}_{1}} &
    \frac{\partial e_{2}}{\partial \hat{y}_{2}} &
    \cdots &
    \frac{\partial e_{2}}{\partial \hat{y}_{M}} \\
    \vdots & \vdots & \ddots & \vdots \\
    \frac{\partial e_{M}}{\partial \hat{y}_{1}} &
    \frac{\partial e_{M}}{\partial \hat{y}_{2}} &
    \cdots &
    \frac{\partial e_{M}}{\partial \hat{y}_{M}} \\
\end{bmatrix} \\
&= \boldsymbol{I}
\end{align*}
$$
because the $i$-th entry of $\mathbf{e}$ depends only on the $i$-th entry of $\mathbf{\hat{y}}$ and on no other entry of $\mathbf{\hat{y}}$:
$$
\frac{\partial e_{i}}{\partial \hat{y}_{j}} = \begin{cases}
    1 & \text{ if } i=j \\
    0 & \text{ if } i \neq j\\
\end{cases}
$$

#### MSE derivative

$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{\hat{y}}} &=
{\color{cyan}\frac{\partial L}{\partial \mathbf{e}}}
{\color{orange}\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}}} \\
&= {\color{cyan}\frac{2}{M} \mathbf{e}}
{\color{orange}\boldsymbol{I}} \\
&= {\color{cyan}\frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y})}
\end{align*}
$$

### summary

$$
\begin{align*}
\nabla_{\mathbf{w}} L =
\frac{\partial L}{\partial \mathbf{w}} &=
{\color{cyan}\frac{\partial L}{\partial \mathbf{\hat{y}}}}
{\color{orange}\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}}} \\
&= {\color{cyan}\frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y})}
{\color{orange}\mathbf{X}}
\end{align*}
$$
and
$$
\begin{align*}
\nabla_{b} L =
\frac{\partial L}{\partial b} &=
{\color{cyan}\frac{\partial L}{\partial \mathbf{\hat{y}}}}
{\color{magenta}\frac{\partial \mathbf{\hat{y}}}{\partial b}} \\
&= {\color{cyan}\frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y})}
{\color{magenta}\boldsymbol{1}}
\end{align*}
$$

## Parameters update

$$
\begin{align}
\mathbf{w} &\leftarrow \mathbf{w} - \alpha \nabla_{\mathbf{w}} L =
\mathbf{w} - \alpha \left(
    \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \mathbf{X}
\right) \\
b &\leftarrow b - \alpha \nabla_{b} L =
b - \alpha \left(
    \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \boldsymbol{1}
\right)
\end{align}
$$

In [10]:
@add_to_class(SimpleLinearRegression)
def update(self, x: torch.Tensor, y_true: torch.Tensor,
           y_pred: torch.Tensor, lr: float) -> None:
    """Update the model parameters using gradient descent.

    Applies one step ``w <- w - lr * dL/dw`` and ``b <- b - lr * dL/db``
    for the mean squared error loss, modifying ``self.w`` and ``self.b``
    in place.

    Args:
        x: Input tensor of shape (n_samples, num_features).
        y_true: Target tensor of shape (n_samples,).
        y_pred: Predicted output tensor of shape (n_samples,).
        lr: Learning rate.
    """
    # dL/dy_hat = 2 * (y_hat - y) / M
    delta = 2 * (y_pred - y_true) / len(y_true)
    # dL/dw = delta @ X. torch.dot rejects a 2-D x, so use the
    # vector-matrix product, which yields one entry per feature.
    self.w -= lr * torch.matmul(delta, x)
    # dL/db = delta . 1, i.e. the sum of delta.
    self.b -= lr * delta.sum()

In [None]:
@add_to_class(SimpleLinearRegression)
def fit(self)