In [44]:
import numpy as np
import torch
import tensorflow as tf

In [45]:
torch.set_default_dtype(torch.float64)
tf.keras.config.set_floatx('float64')

In [46]:
def add_to_class(Class):  
    """Register functions as methods in created class."""
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
    return wrapper

# Create dataset

Dataset
$$
\begin{align*}
\mathbf{X} &\in \mathbb{R}^{M \times N} \\
\mathbf{y} &\in \mathbb{R}^{M}
\end{align*}
$$

In [47]:
from sklearn.datasets import make_regression
import random

M: int = 100 #number of samples
N: int = 4 #number of features

TRUE_B = random.random()

X, Y, TRUE_W = make_regression(n_samples=M, n_features=N, n_targets=1,
                               n_informative=N-1, bias=TRUE_B, noise=1, coef=True)

print(X.shape)
print(Y.shape)
print(TRUE_W.shape)

(100, 4)
(100,)
(4,)


# Model

## weights

Trainables parameters
$$
\begin{align*}
\mathbf{w} &\in \mathbb{R}^{N} \\
b &\in \mathbb{R}
\end{align*}
$$

In [48]:
class SimpleLinearRegression:
    def __init__(self, num_features: int) -> None:
        self.w = torch.randn(num_features)
        self.b = torch.randn(1)

    def copy_params(self, tf_model) -> None:
        """Copy the parameters from a TensorFlow model to this PyTorch model.

        Args:
            tf_model: A TensorFlow model from which to copy the parameters.

        Returns:
            None
        """
        self.w.copy_(torch.tensor(tf_model.weights[0].numpy()[:,0]))
        self.b.copy_(torch.tensor(tf_model.weights[1].numpy()))

## weighted sum

$$
\begin{align*}
\mathbf{\hat{y}}(\mathbf{X}) = \mathbf{X}\mathbf{w} + b \\
\mathbf{\hat{y}} : \mathbb{R}^{M \times N} \rightarrow 
\mathbb{R}^{M}
\end{align*}
$$

In [49]:
@add_to_class(SimpleLinearRegression)
def predict(self, x: torch.Tensor) -> torch.Tensor:
    """Predict the output for input x.

    Args:
        x: Input tensor of shape (n_samples, num_features).

    Returns:
        y_pred: Predicted output tensor of shape (n_samples,).
    """
    y_pred = torch.matmul(x, self.w) + self.b
    return y_pred

## MSE

Loss function: Mean Squared Error:
$$
\begin{align*}
L(\mathbf{\hat{y}}) &= \frac{1}{M} \sum_{i=1}^{M}(
    \hat{y}_i - \mathbf{\mathbb{y}}_i)^{2} \\
L &: \mathbb{R}^{M} \rightarrow \mathbb{R}
\end{align*}
$$
Vectorized form:
$$
\begin{align*}
L(\mathbf{\hat{y}}) &= \frac{1}{M} 
    \left\| \mathbf{e} \right\|_{2}^2 \\
\mathbf{e} &:= \mathbf{\hat{y}} - \mathbf{y}
\end{align*}
$$

In [50]:
@add_to_class(SimpleLinearRegression)
def evaluate(self, x: torch.Tensor, y_true: torch.Tensor) -> float:
    """Evaluate the model on input x and target y_true using MSE.

    Args:
        x: Input tensor of shape (n_samples, num_features).
        y_true: Target tensor of shape (n_samples,).

    Returns:
        loss: MSE loss between predictions and true values.
    """
    y_pred = self.predict(x)
    e = y_pred - y_true
    loss = torch.linalg.vector_norm(e, ord=2)**2
    return loss.item() / len(y_true)

## Gradient

Using **Numerator layout notation**.
Gradient descent is:
$$
\frac{\partial L}{\partial \mathbf{w}} =
\frac{\partial L}{\partial \mathbf{\hat{y}}}
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}}
$$
and
$$
\frac{\partial L}{\partial b} =
\frac{\partial L}{\partial \mathbf{\hat{y}}}
\frac{\partial \mathbf{\hat{y}}}{\partial b}
$$
where their shaper are:
$$
\frac{\partial L}{\partial \mathbf{w}} \in \mathbb{R}^{N},
\frac{\partial L}{\partial b} \in \mathbb{R},
\frac{\partial L}{\partial \mathbf{\hat{y}}} \in \mathbb{R}^{M},
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} \in \mathbb{R}^{M \times N},
\frac{\partial \mathbf{\hat{y}}}{\partial b} \in \mathbb{R}^{M}
$$

### weighted sum derivative

$$
\begin{align*}
\mathbf{\hat{y}} &= \mathbf{X} \mathbf{w} + b \\
&= \begin{bmatrix}
        \mathbf{x}_{1}^{T} \\
        \mathbf{x}_{2}^{T} \\
        \vdots \\
        \mathbf{x}_{M}^{T} \\
    \end{bmatrix} \mathbf{w} + b \\
&= \begin{bmatrix}
        \mathbf{x}_{1}^{T} \mathbf{w} + b \\
        \mathbf{x}_{2}^{T} \mathbf{w} + b \\
        \vdots \\
        \mathbf{x}_{M}^{T} \mathbf{w} + b \\
    \end{bmatrix}
\end{align*}
$$
where
$$
\mathbf{x}_{i}^{T} = \begin{bmatrix}
        x_{i1} & x_{i2} & \cdots & x_{iN}
    \end{bmatrix}
$$

#### respect to $b$

$$
\begin{align*}
\frac{\partial \mathbf{\hat{y}}}{\partial b} &= \begin{bmatrix}
    \frac{\partial \hat{y}_{1}}{\partial b} \\
    \frac{\partial \hat{y}_{2}}{\partial b} \\
    \vdots \\
    \frac{\partial \hat{y}_{M}}{\partial b} \\
\end{bmatrix} \in \mathbb{R}^{M} \\
&= \boldsymbol{1} 
\end{align*}
$$

#### respecto to $\mathbf{w}$

$$
\begin{align*}
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} = 
\begin{bmatrix}
    \frac{\partial \hat{y}_{1}}{\partial w_{1}} &
    \frac{\partial \hat{y}_{1}}{\partial w_{2}} &
    \cdots &
    \frac{\partial \hat{y}_{1}}{\partial w_{N}} \\
    \frac{\partial \hat{y}_{2}}{\partial w_{1}} &
    \frac{\partial \hat{y}_{2}}{\partial w_{2}} &
    \cdots &
    \frac{\partial \hat{y}_{2}}{\partial w_{N}} \\
    \vdots & \vdots & \ddots & \vdots \\
    \frac{\partial \hat{y}_{M}}{\partial w_{1}} &
    \frac{\partial \hat{y}_{M}}{\partial w_{2}} &
    \cdots &
    \frac{\partial \hat{y}_{M}}{\partial w_{N}} \\
\end{bmatrix}
\end{align*}
$$
where
$$
\begin{align*}
\frac{\partial \hat{y}_{i}}{\partial w_{j}} &=
\frac{\partial}{\partial w_{j}} \left(
    x_{i1}w_{1} + x_{i2}w_{2} + 
    \cdots + x_{ij}w_{j} + \cdots +
    x_{iN}w_{N}
\right) \\
&= x_{ij}
\end{align*}
$$
therefore
$$
\begin{align*}
\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}} &= 
\begin{bmatrix}
    x_{11} & x_{12} & \cdots & x_{1N} \\
    x_{21} & x_{22} & \cdots & x_{2N} \\
    \vdots & \vdots & \ddots & \vdots \\
    x_{M1} & x_{M2} & \cdots & x_{MN} \\
\end{bmatrix} \\
&= \mathbf{X}
\end{align*}
$$

### MSE derivative

$$
\frac{\partial L}{\partial \mathbf{\hat{y}}} =
\frac{\partial L}{\partial \mathbf{e}}
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}}
$$
where
$$
\frac{\partial L}{\partial \mathbf{e}} \in \mathbb{R}^{M},
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}} \in \mathbb{R}^{M \times M}
$$

#### respecto to $\mathbf{e}$

$$
\begin{align*}
L &= \frac{1}{M} \left\| \mathbf{e} \right\|_{2}^{2} \\
&= \frac{1}{M} \left( \mathbf{e}^{T} \mathbf{e} \right) \\
&= \frac{1}{M} \left( e_{1}^{2} + e_{2}^{2} + \cdots + e_{M}^{2} \right)
\end{align*}
$$
then
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{e}} &= \begin{bmatrix}
    \frac{\partial L}{\partial e_{1}} &
    \frac{\partial L}{\partial e_{2}} &
    \cdots &
    \frac{\partial L}{\partial e_{M}} \\
\end{bmatrix}^*
\end{align*}
$$
***Remark**: This looks like a matrix (or row vector) of shape $\mathbb{R}^{1 \times M}$, but the first axis does not add new information, therefore we drop this axis.
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{e}} &= \begin{bmatrix}
    \frac{\partial L}{\partial e_{1}} &
    \frac{\partial L}{\partial e_{2}} &
    \cdots &
    \frac{\partial L}{\partial e_{M}} \\
\end{bmatrix} =
\begin{bmatrix}
    \frac{\partial L}{\partial e_{1}} \\
    \frac{\partial L}{\partial e_{2}} \\
    \vdots \\
    \frac{\partial L}{\partial e_{M}} \\
\end{bmatrix} \in \mathbb{R}^{M}
\end{align*}
$$
therefore
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{e}} &= \frac{2}{M} \begin{bmatrix}
    e_{1} \\
    e_{2} \\
    \vdots \\
    e_{M}
\end{bmatrix} \\
&= \frac{2}{M} \mathbf{e}
\end{align*}
$$

#### respect to $\mathbf{\hat{y}}$

$$
\begin{align*}
\mathbf{e} &= \begin{bmatrix}
    \hat{y}_{1} - y_{1} \\
    \hat{y}_{2} - y_{2} \\
    \vdots \\
    \hat{y}_{M} - y_{M}
\end{bmatrix}
\end{align*}
$$
then
$$
\begin{align*}
\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}} &= \begin{bmatrix}
    \frac{\partial e_{1}}{\partial \hat{y}_{1}} &
    \frac{\partial e_{1}}{\partial \hat{y}_{2}} &
    \cdots &
    \frac{\partial e_{1}}{\partial \hat{y}_{M}} \\
    \frac{\partial e_{2}}{\partial \hat{y}_{1}} &
    \frac{\partial e_{2}}{\partial \hat{y}_{2}} &
    \cdots &
    \frac{\partial e_{2}}{\partial \hat{y}_{M}} \\
    \vdots & \vdots & \ddots & \vdots \\
    \frac{\partial e_{M}}{\partial \hat{y}_{1}} &
    \frac{\partial e_{M}}{\partial \hat{y}_{2}} &
    \cdots &
    \frac{\partial e_{M}}{\partial \hat{y}_{M}} \\
\end{bmatrix} \\
&= \boldsymbol{I}
\end{align*}
$$
because the ith row of $\mathbf{e}$ only depends on the ith row of $\mathbf{y}$, it does not depend on any other row of $\mathbf{y}$.
$$
\frac{\partial e_{i}}{\partial \hat{y}_{j}} = \begin{cases}
    1 & \text{ if } i=j \\
    0 & \text{ if } i \neq j\\
\end{cases}
$$

#### MSE derivative

$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{\hat{y}}} &=
{\color{cyan}\frac{\partial L}{\partial \mathbf{e}}}
{\color{orange}\frac{\partial \mathbf{e}}{\partial \mathbf{\hat{y}}}} \\
&= {\color{cyan}\frac{2}{M} \mathbf{e}}
{\color{orange}\boldsymbol{I}} \\
&= {\color{cyan}\frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y})}
\end{align*}
$$

### summary

$$
\begin{align*}
\nabla_{\mathbf{w}} L =
\frac{\partial L}{\partial \mathbf{w}} &=
{\color{cyan}\frac{\partial L}{\partial \mathbf{\hat{y}}}}
{\color{orange}\frac{\partial \mathbf{\hat{y}}}{\partial \mathbf{w}}} \\
&= {\color{cyan}\frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y})}
{\color{orange}\mathbf{X}}
\end{align*}
$$
and
$$
\begin{align*}
\nabla_{b} L =
\frac{\partial L}{\partial b} &=
{\color{cyan}\frac{\partial L}{\partial \mathbf{\hat{y}}}}
{\color{magenta}\frac{\partial \mathbf{\hat{y}}}{\partial b}} \\
&= {\color{cyan}\frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y})}
{\color{magenta}\boldsymbol{1}}
\end{align*}
$$

## Parameters update

$$
\begin{align}
\mathbf{w} &\leftarrow \mathbf{w} - \eta \nabla_{\mathbf{w}} L =
\mathbf{w} - \eta \left(
    \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \mathbf{X}
\right) \\
b &\leftarrow b - \eta \nabla_{b} L =
b - \eta \left(
    \frac{2}{M} (\mathbf{\hat{y}} - \mathbf{y}) \boldsymbol{1}
\right)
\end{align}
$$

In [51]:
@add_to_class(SimpleLinearRegression)
def update(self, x: torch.Tensor, y_true: torch.Tensor,
           y_pred: torch.Tensor, lr: float) -> None:
    """Update the model parameters.

    Args:
        x: Input tensor of shape (n_samples, num_features).
        y_true: Target tensor of shape (n_samples,).
        y_pred: Predicted output tensor of shape (n_samples,).
        lr: Learning rate.
    """
    delta = 2 * (y_pred - y_true) / len(y_true)
    self.w -= lr * torch.matmul(delta, x)
    #self.b -= lr * torch.dot(delta, torch.ones_like(y_true))
    self.b -= lr * delta.sum()

## Fit (Gradient descent)

We have assumed that we will use the entire dataset to update our parameters, 
but we can use only a fraction of the observations/examples in our dataset to update our parameters. <br>
There are mainly 3 ways to use Gradient descent (GD).
- batch GD
- stochastic GD (SGD)
- mini-batch GD

### batch GD

The batch GD uses all observations/examples to update our parameters:
$$
\begin{array}{l}
\textbf{Algorithm 1: batch Gradient Descent} \\
\textbf{for } t = 1 \text{ to } T \textbf{ do}\\
\quad \mathbf{\theta} \leftarrow \text{update}(\mathbf{X}, \mathbf{y}; \mathbf{\theta}) \\
\textbf{end for}
\end{array}
$$
where $T$ is the number of epochs. <br>
**Remark**: $\mathbf{\theta}$ is an arbitrary parameter, for this model we have to update $\mathbf{w}$ and $b$.

### stochastic GD (SGD)

The SGD for each epoch, we update our parameters for each observation/example that is in our dataset:
$$
\begin{array}{l}
\textbf{Algorithm 2: stochastic Gradient Descent (SGD)} \\
\textbf{for } t = 1 \text{ to } T \textbf{ do}\\
\quad \textbf{for } m = 1 \text{ to } M \textbf{ do} \\
\quad \quad \mathbf{\theta} \leftarrow \text{update}(\mathbf{X}_{m,:}, \mathbf{y}_{m,:}; \mathbf{\theta}) \\
\textbf{end for}
\end{array}
$$
where $\mathbf{X}_{m,:}$ and $\mathbf{y}_{m,:}$ are the mth observation/example of our dataset. <br>
**Note**: $\mathbf{X}_{m,:} \in \mathbb{R}^{1 \times N}$ and $\mathbf{y}_{m,:} \in \mathbb{R}^{1}$.

### mini-batch GD

The mini-batch GD is intermediate between SGD and batch GD since a fragment of 
the dataset larger than SGD but smaller than batch GD is used to update our parameters per epoch:
$$
\begin{array}{l}
\textbf{Algorithm 3: mini-batch Gradient Descent} \\
\textbf{for } t = 1 \text{ to } T \textbf{ do} \\
\quad m \leftarrow 1 \\
\quad \tilde{m} \leftarrow \mathcal{B} \\
\quad \textbf{while } m < M \textbf{ do} \\
\quad \quad \mathbf{\theta} \leftarrow \text{update}(\mathbf{X}_{m:\tilde{m},:}, \mathbf{y}_{m:\tilde{m},:}; \mathbf{\theta}) \\
\quad \quad m \leftarrow m + \mathcal{B} \\
\quad \quad \tilde{m} \leftarrow \tilde{m} + \mathcal{B} \\
\textbf{end for}
\end{array}
$$
where $\mathcal{B}$ is the number of minibatch that we want. <br>
where $\mathbf{X}_{m:\tilde{m},:}$ and $\mathbf{y}_{m:\tilde{m},:}$ are the $m$-th to $\tilde{m}$-th observations/examples. <br>
**Note**: If $\mathcal{B}=1$, then mini-batch GD becomes SGD. 
And if $\mathcal{B}=M$, then mini-batch GD becomes batch GD.

In [52]:
@add_to_class(SimpleLinearRegression)
def fit(self, x_train: torch.Tensor, y_train: torch.Tensor, 
        epochs: int, lr: float, batch_size: int, 
        x_valid: torch.Tensor, y_valid: torch.Tensor) -> None:
    """fit the model using gradient descent.

    Args:
        x_train: Input tensor of shape (n_samples, num_features).
        y_train: Target tensor of shape (n_samples,).
        epochs: Number of epochs to train.
        lr: learning rate (0, 1).
        batch_size: Int number of batch.
        x_valid: Input tensor of shape (n_valid_samples, num_features).
        y_valid: Input tensor of shape (n_valid_samples,).
    """
    for epoch in range(epochs):
        loss = 0
        num_batch = 0
        for batch in range(0, len(y_train), batch_size):
            num_batch += 1
            x_b = x_train[batch:batch+batch_size]
            y_b = y_train[batch:batch+batch_size]

            y_pred = self.predict(x_b)
            loss += self.evaluate(x_b, y_b)

            self.update(x_b, y_b, y_pred, lr)

        loss = round(loss / num_batch, 4)
        loss_v = round(self.evaluate(x_valid, y_valid), 4)
        print(f'epoch: {epoch} - MSE: {loss} - MSE_v: {loss_v}')

# Scratch vs TF

## Train and validation data

In [53]:
X_train, Y_train = torch.tensor(X[:85]), torch.tensor(Y[:85])
X_valid, Y_valid = torch.tensor(X[85:]), torch.tensor(Y[85:])

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

torch.Size([85, 4]) torch.Size([85])
torch.Size([15, 4]) torch.Size([15])


## Hyperparameters

In [54]:
LR = 0.001
EPOCHS = 6
BATCH = len(X_train) // 3

## models

### TF model

In [55]:
TFModel = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1, activation='linear')
])

TFModel.compile(
    loss = tf.keras.losses.MSE,
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR)
)

TFModel.evaluate(X[:1], Y[:1])

TFModel.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 355ms/step - loss: 40809.1172


### Our model

In [56]:
model = SimpleLinearRegression(N)
model.copy_params(TFModel)

## Comparisons

In [57]:
def error(tensor_true, tensor_pred) -> float:
    """
     Calculates the percentage error between two tensors or floats.

     If the arguments are simple floats or ints, calculate the percentage error between them.
     If the arguments are Numpy ndarray and PyTorch tensor, calculate the percentage error between them.
     If the argumens are PyTorch tensors, calculate the percentage error between them.

     Args:
         tensor_true: The true tensor or true float.
         pred_tensor: The predicted tensor or the predicted float.

     Returns:
         The percentage error between the tensors or floats.
     """
    if isinstance(tensor_true, (float, int)) and isinstance(tensor_pred, (float, int)):
        return np.abs(tensor_true - tensor_pred) / np.abs(tensor_true) * 100
    elif type(tensor_true) is np.ndarray and type(tensor_pred) is torch.Tensor:
        e = np.abs(tensor_true[:,0] - tensor_pred.numpy()) / np.abs(tensor_true[:,0])
        return np.mean(e) * 100
    e = torch.abs(tensor_true - tensor_pred) / torch.abs(tensor_true)
    return torch.mean(e) * 100

### predict

In [58]:
tf_predict = TFModel.predict(X_train, batch_size=len(X_train))
predict = model.predict(X_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step


1.650808568743399e-14

### MSE

In [59]:
tf_predict = TFModel.evaluate(X_train, Y_train, batch_size=len(X_train))
predict = model.evaluate(X_train, Y_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 16131.5088


3.382800883432671e-14

### fit

In [60]:
TFModel.fit(X_train, Y_train, batch_size=BATCH, epochs=EPOCHS,
            shuffle=False, validation_data=(X_valid, Y_valid))

Epoch 1/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 14411.8545 - val_loss: 16204.4268
Epoch 2/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 14255.2051 - val_loss: 16088.6729
Epoch 3/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 14100.7217 - val_loss: 15973.4121
Epoch 4/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 13948.3633 - val_loss: 15858.6504
Epoch 5/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 13798.0869 - val_loss: 15744.3896
Epoch 6/6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 13649.8555 - val_loss: 15630.6338


<keras.src.callbacks.history.History at 0x2116c6d2e50>

In [61]:
TFModel.history.history['loss']

[12944.80773781999,
 12791.005804509703,
 12639.642102655596,
 12490.66350789638,
 12344.018326801,
 12199.656254338679]

In [62]:
model.fit(X_train, Y_train, EPOCHS, LR, BATCH, X_valid, Y_valid)

epoch: 0 - MSE: 12944.8077 - MSE_v: 16204.4264
epoch: 1 - MSE: 12791.0058 - MSE_v: 16088.6725
epoch: 2 - MSE: 12639.6421 - MSE_v: 15973.4124
epoch: 3 - MSE: 12490.6635 - MSE_v: 15858.65
epoch: 4 - MSE: 12344.0183 - MSE_v: 15744.3893
epoch: 5 - MSE: 12199.6563 - MSE_v: 15630.6339


In [63]:
tf_predict = TFModel.weights[0].numpy()
predict = model.w

error(tf_predict, predict)

0.0

In [64]:
tf_predict = TFModel.weights[1].numpy()[0]
predict = model.b.item()

error(tf_predict, predict)

0.0

# Full train

In [65]:
model2 = SimpleLinearRegression(N)

print(model2.b)
print(model2.w)

tensor([1.5672])
tensor([ 0.8851,  0.0267, -0.2394,  1.3298])


In [66]:
model2.evaluate(X_valid, Y_valid)

16355.892941456297

In [67]:
error(TRUE_W[None,:] + 1e-100, model2.w + 1e-100), error(TRUE_B, model2.b.item())

(99.45714892299446, 726.8472640135958)

In [68]:
model2.fit(X_train, Y_train, 100, 0.001, 1, X_valid, Y_valid)

epoch: 0 - MSE: 14104.3652 - MSE_v: 12716.8702
epoch: 1 - MSE: 10658.6472 - MSE_v: 9897.6332
epoch: 2 - MSE: 8106.0311 - MSE_v: 7709.0072
epoch: 3 - MSE: 6197.727 - MSE_v: 6007.5814
epoch: 4 - MSE: 4759.9822 - MSE_v: 4683.6717
epoch: 5 - MSE: 3669.6193 - MSE_v: 3652.8621
epoch: 6 - MSE: 2838.1062 - MSE_v: 2849.9034
epoch: 7 - MSE: 2201.0192 - MSE_v: 2224.2168
epoch: 8 - MSE: 1710.9636 - MSE_v: 1736.5232
epoch: 9 - MSE: 1332.7393 - MSE_v: 1356.2843
epoch: 10 - MSE: 1039.9887 - MSE_v: 1059.7408
epoch: 11 - MSE: 812.8367 - MSE_v: 828.4007
epoch: 12 - MSE: 636.2072 - MSE_v: 647.8683
epoch: 13 - MSE: 498.6064 - MSE_v: 506.9349
epoch: 14 - MSE: 391.2338 - MSE_v: 396.8726
epoch: 15 - MSE: 307.3263 - MSE_v: 310.8841
epoch: 16 - MSE: 241.67 - MSE_v: 243.6749
epoch: 17 - MSE: 190.2344 - MSE_v: 191.1204
epoch: 18 - MSE: 149.8963 - MSE_v: 150.0064
epoch: 19 - MSE: 118.2306 - MSE_v: 117.8274
epoch: 20 - MSE: 93.3507 - MSE_v: 92.6298
epoch: 21 - MSE: 73.7865 - MSE_v: 72.8898
epoch: 22 - MSE: 58.3906

In [69]:
error(TRUE_W[None,:] + 1e-100, model2.w + 1e-100), error(TRUE_B, model2.b.item())

(32.90080475829641, 43.1821057699658)