In [1]:
import numpy as np
import torch
import tensorflow as tf

In [2]:
torch.set_default_dtype(torch.float64)
tf.keras.config.set_floatx('float64')

In [3]:
def add_to_class(Class):  
    """Register functions as methods in created class."""
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
    return wrapper

# Dataset

## create dataset

In [4]:
from sklearn.datasets import make_classification

M: int = 100
N: int = 5
N_CLASS: int = 3

X, Y = make_classification(n_samples=M, n_features=N, n_classes=N_CLASS,
                           n_informative=N-1, n_redundant=0)

print(X.shape)
print(Y.shape)

(100, 5)
(100,)


## one hot encoding

In [5]:
Y_hat = torch.nn.functional.one_hot(torch.tensor(Y).long(), 3)
Y_hat.shape

torch.Size([100, 3])

## split dataset int train and valid

In [6]:
X_train, X_valid = torch.tensor(X[:85]), torch.tensor(X[85:])
Y_train, Y_valid = Y_hat[:85], Y_hat[85:]

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

torch.Size([85, 5]) torch.Size([85, 3])
torch.Size([15, 5]) torch.Size([15, 3])


# Neural Network

In [7]:
class Layer:
    def __init__(self):
        self.is_trainable = False

    def forward(self):
        pass

    def construct(self):
        pass

    def backpro(self):
        pass

## perceptron or full conect layer

Parameters of the $l$-th layer:
$$
\begin{align*}
\mathbf{W}^{(l)} &\in \mathbb{R}^{Q^{(l-1)} \times Q^{(l)}} \\
\mathbf{b}^{(l)} &\in \mathbb{R}^{Q^{(l)}}
\end{align*}
$$
where $Q^{(l)}$ is the number of units of $l$-th layer,
and $Q^{(l-1)}$ is the number of units from past layer.

Weighted sum <br>
$$
\mathbf{Z}^{(l)}(\sigma^{(l-1)}) = 
\sigma^{(l-1)} \mathbf{W}^{(l)} + \mathbf{b}^{(l)} \\
\mathbf{Z}^{(l)} : \mathbb{R}^{M \times Q^{(l-1)}} \rightarrow
\mathbb{R}^{M \times Q^{(l)}}
$$
where output from past layer $\sigma^{(l-1)} \in \mathbb{R}^{M \times Q^{(l-1)}}$

the activation function $l$-th
$$
\sigma^{(l)}(\mathbf{Z}^{(l)}) = f(\mathbf{Z}^{(l)}) \\
\sigma^{(l)} : \mathbb{R}^{M \times Q^{(l)}} \rightarrow
\mathbb{R}^{M \times Q^{(l)}}
$$
where $f$ is an arbitrary activation function.

In [8]:
class ActivationFunction(Layer):
    def __call__(self, z: torch.Tensor) -> torch.Tensor:
        pass

    def backpro(self, delta: torch.Tensor, 
                a: torch.Tensor) -> torch.Tensor:
        pass

In [9]:
class DenseLayer(Layer):
    def __init__(self, units: int, act_f: ActivationFunction):
        self.is_trainable = True
        self.units = units
        self.actf = act_f

    def copy_param(self, b: np.array, w: np.array):
        self.w.copy_(torch.tensor(w))
        self.b.copy_(torch.tensor(b))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = torch.matmul(x, self.w) + self.b
        return self.actf(z)

    def construct(self, x: torch.Tensor) -> torch.Tensor:
        n = x.shape[-1]
        self.w = torch.randn(n, self.units)
        self.b = torch.randn(self.units)
        return self.forward(x)
    
    def numel(self):
        return self.b.shape, self.w.shape, self.b.numel() + self.w.numel()

Backpropagation
$$
\frac{\partial L}{\partial \mathbf{W^{(l)}}} = \delta 
\frac{\partial \sigma^{(l)}}{\partial \mathbf{Z^{(l)}}}
\frac{\partial \mathbf{Z^{(l)}}}{\partial \mathbf{W^{(l)}}}
$$
and
$$
\frac{\partial L}{\partial \mathbf{b^{(l)}}} = \delta 
\frac{\partial \sigma^{(l)}}{\partial \mathbf{Z^{(l)}}}
\frac{\partial \mathbf{Z^{(l)}}}{\partial \mathbf{b^{(l)}}}
$$

New derivative
$$
\frac{\partial \mathbf{Z^{(l)}}}{\partial \sigma^{(l-1)}} \in
\mathbb{R}^{(M \times Q^{(l)}) \times (M \times Q^{(l-1)})}
$$
where
$$
\sigma^{(l-1)} \mathbf{W}^{(l)} + \mathbf{b}^{(l)} =
\begin{bmatrix}
    \sigma_{1}^T \mathbf{w}_{1} + b_{1}^{(l)} &
    \sigma_{1}^T \mathbf{w}_{2} + b_{2}^{(l)} &
    \cdots &
    \sigma_{1}^T \mathbf{w}_{Q^{(l)}} + b_{Q^{(l)}}^{(l)} \\
    \sigma_{2}^T \mathbf{w}_{1} + b_{1}^{(l)} &
    \sigma_{2}^T \mathbf{w}_{2} + b_{2}^{(l)} &
    \cdots &
    \sigma_{2}^T \mathbf{w}_{Q^{(l)}} + b_{Q^{(l)}}^{(l)} \\
    \vdots & \vdots & \ddots & \vdots \\
    \sigma_{M}^T \mathbf{w}_{1} + b_{1}^{(l)} &
    \sigma_{M}^T \mathbf{w}_{2} + b_{2}^{(l)} &
    \cdots &
    \sigma_{M}^T \mathbf{w}_{Q^{(l)}} + b_{Q^{(l)}}^{(l)}
\end{bmatrix}
$$
where $\sigma_{p}^T = \begin{bmatrix} \sigma^{(l-1)}_{p1} & \sigma^{(l-1)}_{p2} & \cdots & \sigma^{(l-1)}_{pQ^{(l-1)}}\end{bmatrix}$ and $\mathbf{w}_{q} = \begin{bmatrix} w_{1q}^{(l)} & w_{2q}^{(l)} & \cdots & w_{Q^{(l-1)}q}^{(l)}\end{bmatrix}^T$. <br>
therefore
$$
\frac{\partial (\sigma^{(l-1)} \mathbf{W}^{(l)} + \mathbf{b}^{(l)})_{pq}}
{\partial \sigma^{(l-1)}_{ij}} =
\frac{\partial \sigma_{p}^T \mathbf{w}_{q}}{\partial \sigma^{(l-1)}_{ij}} = \begin{cases}
    w_{jq} & \text{ if } p = i \\
    0 & \text{ if } p \neq i
\end{cases}
$$
for all $p,i = 1, ..., M$, $q = 1, ..., Q^{(l)}$ adnd $j = 1, ..., Q^{(l-1)}$.

Vectorized form:
$$
\frac{\partial \mathbf{Z^{(l)}}}{\partial \sigma^{(l-1)}} =
\mathbf{W}^{(l)^T} \otimes \mathbb{I}
$$
where $\otimes$ is Kronecker product.

In [10]:
@add_to_class(DenseLayer)
def backpro(self, delta: torch.Tensor, out : torch.Tensor,
            input_: torch.Tensor, lr: float):
    m = len(input_)

    # activation derivative
    delta = self.actf.backpro(delta, out)

    # weighted sum derivative
    identity = torch.eye(self.units)
    w_der = torch.kron(input_.unsqueeze(1).unsqueeze(3), 
                       identity.unsqueeze(0).unsqueeze(2))
    w_der = torch.einsum('pq,pqij->ij', delta, w_der)
    self.w -= lr * w_der
    self.b -= lr * delta.sum(axis=0)

    # rest of layers
    identity = torch.eye(m).unsqueeze(1).unsqueeze(3)
    w_der = torch.kron(self.w.T.unsqueeze(0).unsqueeze(2),
                       identity)
    delta = torch.einsum('pq,pqij->ij', delta, w_der)
    return delta

## activation functions

### softmax function

In [11]:
class Softmax(ActivationFunction):
    def __call__(self, z):
        z_exp = torch.exp(z)
        return z_exp / z_exp.sum(1, keepdims=True)

    def backpro(self, delta, a):
        m, q = a.shape
        diag_a = torch.diag_embed(a)
        outer_a = torch.einsum('ij,ik->ijk', a, a)
            
        sus_a = diag_a - outer_a
            
        soft_der = torch.zeros((m, q, m, q), dtype=a.dtype)
        idx = torch.arange(m)
        soft_der[idx, :, idx, :] = sus_a
        return torch.einsum('pq,pqij->ij', delta, soft_der)

### sigmoid function

sigmoid funcion
$$
\text{sigmoid}(\mathbf{Z}) = \frac{1}{1 + \exp{(-\mathbf{Z})}} \\
\text{sigmoid} : \mathbb{R}^{M \times Q} \rightarrow
\mathbb{R}^{M \times Q}
$$
where $\exp(-\mathbf{Z})$ is element-wise power $\exp(-z_{ij})$.

sigmoid derivative
$$
\frac{\partial \text{sigmoid}}{\partial \mathbf{Z}} \in
\mathbb{R}^{(M \times Q) \times (M \times Q)}
$$
where
$$
\text{sigmoid}(\mathbf{Z}) = \begin{bmatrix}
    \text{sigmoid}(z_{11}) & 
    \text{sigmoid}(z_{12}) &
    \cdots &
    \text{sigmoid}(z_{1Q}) \\
    \text{sigmoid}(z_{21}) & 
    \text{sigmoid}(z_{22}) &
    \cdots &
    \text{sigmoid}(z_{2Q}) \\
    \vdots & \vdots & \ddots & \vdots \\
    \text{sigmoid}(z_{M1}) & 
    \text{sigmoid}(z_{M2}) &
    \cdots &
    \text{sigmoid}(z_{MQ}) \\
\end{bmatrix}
$$
therefore
$$
\frac{\partial \text{sigmoid}(z_{pq})}{\partial z_{ij}} =
\begin{cases}
    \text{sigmoid}(z_{pq})
    (1 - \text{sigmoid}(z_{pq})) & \text{ if } p=i, q=j \\ 
    0 & \text{ otherwise} 
\end{cases}
$$
for all $p,i = 1, ..., M$ and $q,j = 1, ..., Q$.

In [12]:
class Sigmoid(ActivationFunction):
    def __call__(self, z):
        return 1 / (1 + torch.exp(-z))
    
    def backpro(self, delta, a):
        m, n = a.shape
        der = a * (1 - a)
        result = torch.zeros((m, n, m, n), dtype=a.dtype)
        idx = torch.arange(m), torch.arange(n)
        result[idx[0][:, None], idx[1], idx[0][:, None], idx[1]] = der
        return torch.einsum('pq,pqij->ij', delta, result)

### ReLU

ReLU function
$$
\text{ReLU}(\mathbf{Z}) = \max(\mathbf{Z}, 0) \\
\text{ReLU} : \mathbb{R}^{M \times Q} \rightarrow
\mathbb{R}^{M \times Q}
$$
where $\max(\mathbf{Z}, 0)$ is element-wise $\max(z_{ij}, 0)$.

ReLU derivative
$$
\frac{\partial \text{ReLU}}{\partial \mathbf{Z}} \in
\mathbb{R}^{(M \times Q) \times (M \times Q)}
$$
then
$$
\frac{\partial \max(z_{ij},0)}{\partial z_{ij}} =
\begin{cases}
    1 & \text{ if } z_{ij} > 0 \\
    0 & \text{ otherwise}
\end{cases}
$$
therefore
$$
\frac{\partial \text{ReLU}_{pq}}{\partial z_{ij}} =
\frac{\partial \max(z_{pq}, 0)}{\partial z_{ij}} =
\begin{cases}
    \frac{\partial \max(z_{ij},0)}{\partial z_{ij}} & \text{ if } p=i, q=j \\
    0 & \text{ otherwise}
\end{cases}
$$

In [13]:
class Relu(ActivationFunction):
    def __call__(self, z):
        return torch.max(z, torch.zeros_like(z))

    def backpro(self, delta, a):
        m, n = a.shape
    
        #relu_der = (a > 0).float()
        relu_der = (a > 0).double()
        result = torch.zeros((m, n, m, n), dtype=a.dtype)
        
        idx = torch.arange(m), torch.arange(n)
        result[idx[0][:, None], idx[1], idx[0][:, None], idx[1]] = relu_der
        return torch.einsum('pq,pqij->ij', delta, result)

## Loss functions

In [14]:
class Loss(Layer):
    def __call__(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> float:
        pass

    def backpro(self, y_pred: torch.Tensor, 
                y_true: torch.Tensor) -> torch.Tensor:
        pass

### MSE

In [15]:
class MSE(Loss):
    def __call__(self, y_pred, y_true):
        e = y_pred - y_true
        return (e**2).sum().item() / len(y_true)
    
    def backpro(self, y_pred, y_true):
        return 2 * (y_pred - y_true) / len(y_true)

### CE

In [16]:
class CE(Loss):
    def __call__(self, y_pred, y_true):
        loss = y_true * torch.log(y_pred)
        return - loss.sum().item() / len(y_true)

    def backpro(self, y_pred, y_true):
        return -(y_true / y_pred) / len(y_true)

## Other useful layer

In [17]:
class InputLayer(Layer):
    def __init__(self, n_features: int):
        self.n_features = n_features
        self.m_samples = 10

    def construct(self):
        x_fake = torch.randn(self.m_samples, self.n_features)
        return x_fake

## Sequential

In [18]:
class Model:
    def __init__(self, layers: list[Layer], loss: Loss):
        self.layers = layers[1:]
        self.loss = loss

        data = layers[0].construct()
        for l in self.layers:
            data = l.construct(data)

    def summary(self):
        num_params = 0
        for i,l in enumerate(self.layers):
            if not l.is_trainable: continue
            numel = l.numel()
            print(f'layer[{i}] - {numel[:-1]} - total: {numel[-1]}')
            num_params += numel[-1]
        print('\nTotal parameters:')
        print(f'\ttotal: {num_params}')

    def copy_params(self, tf_weights: list):
        for l in self.layers:
            if not l.is_trainable: continue
            l.copy_param(tf_weights.pop(1).numpy(),
                         tf_weights.pop(0).numpy())


### Forward propagation algorithm

$$
\begin{array}{l}
\sigma^{(0)} := \mathbf{X} \\
\textbf{for } l=1 \text{ to } L \textbf{ do} \\
\quad \mathbf{Z}^{(l)} = 
\sigma^{(l-1)} \mathbf{W}^{(l)} + \mathbf{b}^{(l)} \\
\quad \sigma^{(l)} = f(\mathbf{Z}^{(l)}) \\
\textbf{end for}
\end{array}
$$
where $L$ is the number of layers in the Model. <br>
Consequently, the prediction is:
$$
\mathbf{\hat{Y}} = \sigma^{(L)}
$$

In [19]:
@add_to_class(Model)
def predict(self, x: torch.Tensor) -> torch.Tensor:
    out = x.clone()
    for l in self.layers:
        out = l.forward(out)
    return out

In [20]:
@add_to_class(Model)
def __forward__(self, x: torch.Tensor) -> torch.Tensor:
    self.outs = [x.clone()]
    for l in self.layers:
        self.outs.append(l.forward(self.outs[-1]))

### Evaluate

In [21]:
@add_to_class(Model)
def evaluate(self, x: torch.Tensor, y: torch.Tensor) -> float:
    y_pred = self.predict(x)
    return self.loss(y_pred, y)

### Backpropagation

$$
\begin{array}{l}
\delta := \frac{\partial L}{\partial \sigma^{(L)}} \\
\textbf{for } l = L \text{ to } 1 \textbf{ do} \\
\quad \delta := \delta
\frac{\partial \sigma^{(l)}}{\partial \mathbf{Z}^{(l)}} \\
\quad
\frac{\partial L}{\partial \theta^{(l)}} =
\delta \frac{\partial \mathbf{Z}^{(l)}}{\partial \theta^{(l)}} \\
\quad \delta := \delta
\frac{\partial \mathbf{Z}^{(l)}}{\partial \sigma^{(l-1)}} \\
\textbf{end for}
\end{array}
$$
where $\theta^{(l)} = (\mathbf{b}^{(l)}, \mathbf{W}^{(l)})$.

In [22]:
@add_to_class(Model)
def update(self, x: torch.Tensor, y_true: torch.Tensor,
           lr: float):
    delta = self.loss.backpro(self.outs[-1], y_true)

    for l in reversed(self.layers):
        delta = l.backpro(delta, self.outs.pop(), self.outs[-1], lr)

### Fit

In [40]:
@add_to_class(Model)
def fit(self, x_train: torch.Tensor, y_train: torch.Tensor,
        epochs: int, lr: float, batch_size: int,
        x_valid: torch.Tensor, y_valid: torch.Tensor):
    for epoch in range(epochs):
        loss = 0
        num_batch = 0
        for batch in range(0, len(y_train), batch_size):
            num_batch += 1
            x_b = x_train[batch:batch+batch_size]
            y_b = y_train[batch:batch+batch_size]

            self.__forward__(x_b)
            loss += self.loss(self.outs[-1], y_b)

            self.update(x_b, y_b, lr)

        loss = round(loss / num_batch, 4)
        loss_v = round(self.evaluate(x_valid, y_valid), 4)
        print(f'epoch: {epoch} - L: {loss} - L_v: {loss_v}')

# Scratch vs TF

## hyperparameters

In [41]:
LR = 0.01
EPOCHS = 16
BATCH = len(X_train) // 3

## models

### TF model

In [42]:
TFModel = tf.keras.Sequential([
    tf.keras.layers.Dense(units=10, activation='sigmoid'),
    tf.keras.layers.Dense(units=7, activation='relu'),
    tf.keras.layers.Dense(units=N_CLASS, activation='softmax')
])

TFModel.compile(
    loss = tf.keras.losses.CategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.SGD(learning_rate=LR),
    metrics = [tf.keras.metrics.Accuracy()]
)

TFModel.evaluate(X_train[:1], Y_train[:1])

TFModel.summary()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 1.0874


### Scratch model

In [43]:
model = Model([InputLayer(N),
               DenseLayer(10, Sigmoid()),
               DenseLayer(7, Relu()),
               DenseLayer(N_CLASS, Softmax())],
               CE())

model.copy_params(TFModel.weights[:])
model.summary()

layer[0] - (torch.Size([10]), torch.Size([5, 10])) - total: 60
layer[1] - (torch.Size([7]), torch.Size([10, 7])) - total: 77
layer[2] - (torch.Size([3]), torch.Size([7, 3])) - total: 24

Total parameters:
	total: 161


## comparison

In [44]:
def error(tensor_true, tensor_pred) -> float:
    """
     Calculates the percentage error between two tensors or floats.

     If the arguments are simple floats or ints, calculate the percentage error between them.
     If the arguments are Numpy ndarray and PyTorch tensor, calculate the percentage error between them.
     If the argumens are PyTorch tensors, calculate the percentage error between them.

     Args:
         tensor_true: The true tensor or true float.
         pred_tensor: The predicted tensor or the predicted float.

     Returns:
         The percentage error between the tensors or floats.
     """
    if isinstance(tensor_true, (float, int)) and isinstance(tensor_pred, (float, int)):
        return np.abs(tensor_true - tensor_pred) / np.abs(tensor_true) * 100
    elif type(tensor_true) is np.ndarray and type(tensor_pred) is torch.Tensor:
        e = np.abs(tensor_true - tensor_pred.numpy()) / np.abs(tensor_true)
        return np.mean(e) * 100
    e = torch.abs(tensor_true - tensor_pred) / torch.abs(tensor_true)
    return torch.mean(e) * 100

### predict

In [45]:
tf_predict = TFModel.predict(X_valid, batch_size=len(X_valid))
predict = model.predict(X_valid)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step


8.406321093568787e-15

### loss

In [46]:
tf_predict = TFModel.evaluate(X_train, Y_train, batch_size=len(X_train))[0]
predict = model.evaluate(X_train, Y_train)

error(tf_predict, predict)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.0000e+00 - loss: 1.1178


3.972987550739853e-14

### fit

In [47]:
TFModel.fit(X_train, Y_train, batch_size=BATCH, epochs=EPOCHS,
            shuffle=False, validation_data=(X_valid, Y_valid))

Epoch 1/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 93ms/step - accuracy: 0.0000e+00 - loss: 1.1153 - val_accuracy: 0.0000e+00 - val_loss: 1.1336
Epoch 2/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.0000e+00 - loss: 1.1140 - val_accuracy: 0.0000e+00 - val_loss: 1.1336
Epoch 3/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.0000e+00 - loss: 1.1127 - val_accuracy: 0.0000e+00 - val_loss: 1.1336
Epoch 4/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0000e+00 - loss: 1.1116 - val_accuracy: 0.0000e+00 - val_loss: 1.1336
Epoch 5/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0000e+00 - loss: 1.1104 - val_accuracy: 0.0000e+00 - val_loss: 1.1333
Epoch 6/16
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.0000e+00 - loss: 1.1092 - val_accuracy: 0.0000e+00 - val_loss: 1.133

<keras.src.callbacks.history.History at 0x246bda19ad0>

In [48]:
model.fit(X_train, Y_train, EPOCHS, LR, BATCH, X_valid, Y_valid)

epoch: 0 - L: 1.1128 - L_v: 1.1336
epoch: 1 - L: 1.1098 - L_v: 1.1336
epoch: 2 - L: 1.1066 - L_v: 1.1336
epoch: 3 - L: 1.1037 - L_v: 1.1336
epoch: 4 - L: 1.1009 - L_v: 1.1333
epoch: 5 - L: 1.0981 - L_v: 1.1331
epoch: 6 - L: 1.0954 - L_v: 1.1329
epoch: 7 - L: 1.0927 - L_v: 1.1327
epoch: 8 - L: 1.0901 - L_v: 1.1326
epoch: 9 - L: 1.0876 - L_v: 1.1325
epoch: 10 - L: 1.0852 - L_v: 1.1324
epoch: 11 - L: 1.0829 - L_v: 1.1324
epoch: 12 - L: 1.0807 - L_v: 1.1323
epoch: 13 - L: 1.0786 - L_v: 1.1323
epoch: 14 - L: 1.0765 - L_v: 1.1323
epoch: 15 - L: 1.0745 - L_v: 1.1324
