In [1]:
import numpy
import torch
import tensorflow as tf

In [2]:
# Run both frameworks in double precision so PyTorch tensors and Keras
# weights share the same dtype.
torch.set_default_dtype(torch.float64)
tf.keras.config.set_floatx('float64')

# Seed the random number generators so that "Restart Kernel -> Run All"
# reproduces the same parameter initialisation on every run.
# `set_random_seed` seeds Python's `random`, NumPy and the Keras backend.
torch.manual_seed(0)
tf.keras.utils.set_random_seed(0)

In [3]:
def add_to_class(Class):
    """Return a decorator that registers a function as a method of ``Class``.

    Usage::

        @add_to_class(MyClass)
        def method(self, ...):
            ...

    Args:
        Class: The class that receives the decorated function as an
            attribute named after the function (``obj.__name__``).

    Returns:
        A decorator that attaches the function to ``Class`` and returns the
        function unchanged.
    """
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
        # Return the function so the decorated module-level name stays bound
        # to it instead of being rebound to None.
        return obj
    return wrapper

# Dataset

## create dataset

In [4]:
from sklearn.datasets import make_classification

M: int = 100        # number of samples
N: int = 5          # number of features
N_CLASS: int = 3    # number of classes
DATA_SEED: int = 0  # fixed seed so the generated dataset is identical on every run

# Synthetic classification problem: N-1 informative features, no redundant ones.
X, Y = make_classification(n_samples=M, n_features=N, n_classes=N_CLASS,
                           n_informative=N-1, n_redundant=0,
                           random_state=DATA_SEED)

print(X.shape)
print(Y.shape)

(100, 5)
(100,)


## one hot encoding

In [5]:
# One-hot encode the integer class labels. The width is taken from N_CLASS so
# it always matches the number of classes the dataset was generated with.
# NOTE: despite its name, Y_hat holds the encoded *labels*, not predictions.
Y_hat = torch.nn.functional.one_hot(torch.tensor(Y).long(), num_classes=N_CLASS)
Y_hat.shape

torch.Size([100, 3])

## split dataset into train and valid

In [6]:
# The first N_TRAIN samples are used for training; the remainder is held out
# for validation.
N_TRAIN: int = 85

X_train, X_valid = torch.tensor(X[:N_TRAIN]), torch.tensor(X[N_TRAIN:])
Y_train, Y_valid = Y_hat[:N_TRAIN], Y_hat[N_TRAIN:]

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

torch.Size([85, 5]) torch.Size([85, 3])
torch.Size([15, 5]) torch.Size([15, 3])


# Model

## weights and bias

Trainable parameters
$$
\begin{align*}
\mathbf{W} &\in \mathbb{R}^{N \times Q} \\
\mathbf{b} &\in \mathbb{R}^{Q}
\end{align*}
$$
where $N$ is the number of features and $Q$ is the number of classes.

In [7]:
class SoftmaxClassifier:
    """Linear multiclass classifier whose parameters are plain torch tensors.

    Further methods are attached to this class elsewhere in the notebook with
    the ``add_to_class`` decorator.
    """

    def __init__(self, n_features: int, n_classes: int):
        """Draw the initial parameters from a standard normal distribution.

        Args:
            n_features: Number of input features (rows of the weight matrix).
            n_classes: Number of classes (columns of the weight matrix).
        """
        # Weight matrix, shape (n_features, n_classes).
        self.w = torch.randn(n_features, n_classes)
        # Bias vector, shape (n_classes,).
        self.b = torch.randn(n_classes)

    def copy_params(self, tf_model) -> None:
        """Copy the parameters from a TensorFlow model to this PyTorch model.

        Args:
            tf_model: A TensorFlow model from which to copy the parameters.
                Assumes ``tf_model.weights[0]`` is the kernel with shape
                (n_features, n_classes) and ``tf_model.weights[1]`` the bias
                with shape (n_classes,), e.g. a single dense layer
                (TODO confirm against the caller).

        Returns:
            None

        Raises:
            ValueError: If the shapes of the TensorFlow parameters differ
                from the shapes of ``self.w`` / ``self.b``.
        """
        # Take the whole kernel: every class column is needed, not just the
        # first one.
        kernel = torch.tensor(tf_model.weights[0].numpy())
        bias = torch.tensor(tf_model.weights[1].numpy())

        # Check shapes explicitly; `copy_` would otherwise broadcast a
        # mismatching-but-compatible source without any warning.
        if kernel.shape != self.w.shape or bias.shape != self.b.shape:
            raise ValueError(
                "parameter shapes do not match: "
                f"kernel {tuple(kernel.shape)} vs w {tuple(self.w.shape)}, "
                f"bias {tuple(bias.shape)} vs b {tuple(self.b.shape)}"
            )

        self.w.copy_(kernel)
        self.b.copy_(bias)

## weighted sum and softmax function

weighted sum
$$
\mathbf{Z}(\mathbf{X}) = \mathbf{X} \mathbf{W} + \mathbf{b} \\
\mathbf{Z} : \mathbb{R}^{M \times N} \rightarrow \mathbb{R}^{M \times Q}
$$

softmax function
$$
\sigma(\mathbf{Z}_{i,:})_{j} = \frac{\exp(\mathbf{Z}_{i,:})_{j}}
{\sum_{k=1}^{Q}(\exp(\mathbf{Z}_{i,:})_{k})}
$$
then:
$$
\sigma(\mathbf{Z}_{i,:}) = \begin{bmatrix}
    \sigma(\mathbf{Z}_{i,:})_{1} &
    \sigma(\mathbf{Z}_{i,:})_{2} &
    \cdots &
    \sigma(\mathbf{Z}_{i,:})_{Q}
\end{bmatrix}
$$
therefore:
$$
\sigma(\mathbf{Z}) = \begin{bmatrix}
    \sigma(\mathbf{Z}_{1,:}) \\
    \sigma(\mathbf{Z}_{2,:}) \\
    \vdots \\
    \sigma(\mathbf{Z}_{M,:})
\end{bmatrix} \\
\sigma(\mathbf{Z}) : \mathbb{R}^{M \times Q} \rightarrow \mathbb{R}^{M \times Q}
$$

In [8]:
@add_to_class(SoftmaxClassifier)
def predict(self, x: torch.Tensor) -> torch.Tensor:
    """Compute class probabilities with a numerically stable softmax.

    Args:
        x: Input batch of shape (M, N), one sample per row.

    Returns:
        Tensor of shape (M, Q); each row is the softmax of that sample's
        logits and sums to 1.
    """
    # weighted sum
    z = torch.matmul(x, self.w) + self.b
    # softmax function
    # Softmax is invariant to adding a constant to every logit of a row, so
    # subtract the row maximum first: the largest exponent becomes exp(0) = 1
    # and exp() can no longer overflow to inf (which would yield nan).
    z_shifted = z - torch.amax(z, dim=1, keepdim=True)
    z_exp = torch.exp(z_shifted)
    y_pred = z_exp / z_exp.sum(dim=1, keepdim=True)
    return y_pred

## CE

Loss function: cross-entropy loss:
$$
L(\mathbf{\hat{Y}}) = - \frac{1}{M} \sum_{i=1}^{M} \sum_{k=1}^{Q}(
    Y_{ik} \log(\hat{Y}_{ik})
) \\
L(\mathbf{\hat{Y}}) : \mathbb{R}^{M \times Q} \rightarrow \mathbb{R}
$$
**Remark**: in this case $\mathbf{\hat{Y}} = \sigma(\mathbf{Z})$. Cross-entropy does not require softmax; any $\mathbf{\hat{Y}}$ whose rows are probability distributions can be used.<br>
Vectorized form:
$$
L(\mathbf{\hat{Y}}) = - \frac{1}{M} \sum_{i=1}^{M} \left(
    \mathbf{y}_{i,:}^T \log(\mathbf{\hat{y}}_{i,:})
\right)
$$
or
$$
L(\mathbf{\hat{Y}}) = - \frac{1}{M} \sum \left(
    \mathbf{Y} \odot \log(\mathbf{\hat{Y}})
\right)
$$

In [9]:
@add_to_class(SoftmaxClassifier)
def evaluate(self, x: torch.Tensor, y_true: torch.Tensor) -> float:
    """Return the mean cross-entropy loss of the model on a batch.

    Args:
        x: Input batch of shape (M, N).
        y_true: Target distribution per sample (e.g. one-hot labels), shape
            (M, Q). Integer tensors are accepted and converted to the dtype
            of the predictions.

    Returns:
        The cross-entropy summed over all samples and classes, divided by
        the number of samples, as a Python float.
    """
    y_pred = self.predict(x)
    # xlogy(y, p) equals y * log(p) but is defined as 0 where y == 0, so a
    # non-target probability that underflowed to 0 contributes 0 instead of
    # 0 * -inf = nan.
    loss = torch.xlogy(y_true.to(y_pred.dtype), y_pred)
    return - loss.sum().item() / len(y_true)

## Gradient

By the chain rule, the gradients used by gradient descent are:
$$
\frac{\partial L}{\partial \mathbf{W}} =
\frac{\partial L}{\partial \sigma}
\frac{\partial \sigma}{\partial \mathbf{Z}}
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}}
$$
and
$$
\frac{\partial L}{\partial \mathbf{b}} =
\frac{\partial L}{\partial \sigma}
\frac{\partial \sigma}{\partial \mathbf{Z}}
\frac{\partial \mathbf{Z}}{\partial \mathbf{b}}
$$
where their shapes are:
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{W}} &\in \mathbb{R}^{N \times Q} \\
\frac{\partial L}{\partial \mathbf{b}} &\in \mathbb{R}^{Q} \\
\frac{\partial L}{\partial \sigma} &\in \mathbb{R}^{M \times Q} \\
\frac{\partial \sigma}{\partial \mathbf{Z}} &\in \mathbb{R}^{(M \times Q) \times (M \times Q)} \\
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}} &\in \mathbb{R}^{(M \times Q) \times (N \times Q)} \\
\frac{\partial \mathbf{Z}}{\partial \mathbf{b}} &\in \mathbb{R}^{(M \times Q) \times Q}
\end{align*}
$$

### weighted sum derivative

#### $\frac{\partial \mathbf{Z}}{\partial \mathbf{W}}$
$$
\mathbf{X} = \begin{bmatrix}
    x_{11} & x_{12} & \cdots & x_{1N} \\
    x_{21} & x_{22} & \cdots & x_{2N} \\
    \vdots & \vdots & \ddots & \vdots \\
    x_{M1} & x_{M2} & \cdots & x_{MN}
\end{bmatrix} = \begin{bmatrix}
    \mathbf{x}_{1}^T \\
    \mathbf{x}_{2}^T \\
    \vdots \\
    \mathbf{x}_{M}^T
\end{bmatrix} \\
\mathbf{W} = \begin{bmatrix}
    w_{11} & w_{12} & \cdots & w_{1Q} \\
    w_{21} & w_{22} & \cdots & w_{2Q} \\
    \vdots & \vdots & \ddots & \vdots \\
    w_{N1} & w_{N2} & \cdots & w_{NQ} 
\end{bmatrix} = \begin{bmatrix}
    \mathbf{w}_{1} &
    \mathbf{w}_{2} &
    \cdots &
    \mathbf{w}_{Q}
\end{bmatrix}
$$
where $\mathbf{x}_{p}^T = \begin{bmatrix} x_{p1} & x_{p2} & \cdots & x_{pN} \end{bmatrix}$ 
and $\mathbf{w}_{q} = \begin{bmatrix} w_{1q} & w_{2q} & \cdots & w_{Nq} \end{bmatrix}^T$
, then
$$
\mathbf{XW} = \begin{bmatrix}
    \mathbf{x}_{1}^T \mathbf{w}_{1} & \mathbf{x}_{1}^T \mathbf{w}_{2} & \cdots & \mathbf{x}_{1}^T \mathbf{w}_{Q} \\
    \mathbf{x}_{2}^T \mathbf{w}_{1} & \mathbf{x}_{2}^T \mathbf{w}_{2} & \cdots & \mathbf{x}_{2}^T \mathbf{w}_{Q} \\
    \vdots & \vdots & \ddots & \vdots \\
    \mathbf{x}_{M}^T \mathbf{w}_{1} & \mathbf{x}_{M}^T \mathbf{w}_{2} & \cdots & \mathbf{x}_{M}^T \mathbf{w}_{Q}
\end{bmatrix}
$$
therefore:
$$
\frac{\partial \mathbf{XW}_{pq}}{\partial w_{ij}} = 
\frac{\partial \mathbf{x}_{p}^T\mathbf{w}_{q}}{\partial w_{ij}} = \begin{cases}
    x_{pi} & \text{ if } q=j \\ 
    0 & \text{ if } q\neq j 
\end{cases}
$$
for all $p = 1, ..., M$, $q, j = 1, ..., Q$ and $i = 1, ..., N$. <br>
Vectorized form:
$$
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}} = 
\mathbb{I} \otimes \mathbf{X}
$$
where $\otimes$ is Kronecker product.

#### $\frac{\partial \mathbf{Z}}{\partial \mathbf{b}}$
$$
\mathbf{XW + b} = \begin{bmatrix}
    \mathbf{x}_{1}^T \mathbf{w}_{1}+b_1 & \mathbf{x}_{1}^T \mathbf{w}_{2}+b_2 & \cdots & \mathbf{x}_{1}^T \mathbf{w}_{Q}+b_Q \\
    \mathbf{x}_{2}^T \mathbf{w}_{1}+b_1 & \mathbf{x}_{2}^T \mathbf{w}_{2}+b_2 & \cdots & \mathbf{x}_{2}^T \mathbf{w}_{Q}+b_Q  \\
    \vdots & \vdots & \ddots & \vdots \\
    \mathbf{x}_{M}^T \mathbf{w}_{1}+b_1 & \mathbf{x}_{M}^T \mathbf{w}_{2}+b_2 & \cdots & \mathbf{x}_{M}^T \mathbf{w}_{Q}+b_Q 
\end{bmatrix}
$$
therefore:
$$
\frac{\partial \mathbf{(XW+b)}_{pq}}{\partial b_{i}} = 
\frac{\partial (\mathbf{x}_{p}^T\mathbf{w}_{q} + b_{q})}{\partial b_{i}} = \begin{cases}
    1 & \text{ if } q=i \\ 
    0 & \text{ if } q\neq i
\end{cases}
$$
for all $p = 1, ..., M$ and $q,i = 1, ..., Q$.

### softmax derivative summary

First case:
$$
\frac{\partial \sigma(\mathbf{Z})_{p,:}}{\partial \mathbf{Z}_{i=p,:}} = \text{diag}(\sigma(\mathbf{Z}_{p,:})) - \sigma(\mathbf{Z}_{p,:}) \sigma(\mathbf{Z}_{p,:})^T
$$

Second case:
$$
\frac{\partial \sigma(\mathbf{Z})_{p,:}}{\partial \mathbf{Z}_{i\neq p,:}} = \mathbf{0}
$$
For the full derivation, see [Softmax Function and Gradient](softmax_function_and_gradient.ipynb).

### cross-entropy derivative

We will use this CE:
$$
\begin{align*}
L(\mathbf{\hat{Y}}) &= - \frac{1}{M} \sum_{i=1}^{M} \left(
    \mathbf{y}_{i,:}^T \log(\mathbf{\hat{y}}_{i,:})
\right) \\
&= -\frac{1}{M} \left(
    \mathbf{y}_{1,:}^T \log(\mathbf{\hat{y}}_{1,:}) +
    \mathbf{y}_{2,:}^T \log(\mathbf{\hat{y}}_{2,:}) +
    ... + 
    \mathbf{y}_{M,:}^T \log(\mathbf{\hat{y}}_{M,:})
\right)
\end{align*}
$$
$$
\begin{align*}
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p,:}} &= \begin{bmatrix}
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p1}} & 
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p2}} &
    \cdots &
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{pQ}}
\end{bmatrix} \in \mathbb{R}^{1 \times Q} \\
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{Y}}} &= \begin{bmatrix}
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{1,:}} \\
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{2,:}} \\
    \vdots \\
    \frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{M,:}}
\end{bmatrix} \in \mathbb{R}^{M \times Q}
\end{align*}
$$
then
$$
\begin{align*}
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{y}}_{p,:}} &=
-\frac{1}{M} \frac{\partial}{\partial \mathbf{\hat{y}}_{p,:}} \left(
    \mathbf{y}_{1,:}^T \log(\mathbf{\hat{y}}_{1,:}) +
    ... +
    \mathbf{y}_{p,:}^T \log(\mathbf{\hat{y}}_{p,:}) +
    ... + 
    \mathbf{y}_{M,:}^T \log(\mathbf{\hat{y}}_{M,:})
\right) \\
&= -\frac{1}{M} \frac{\partial}{\partial \mathbf{\hat{y}}_{p,:}} \left(
    \mathbf{y}_{p,:}^T \log(\mathbf{\hat{y}}_{p,:})
\right) \\
&= -\frac{1}{M} \left(
    \mathbf{y}_{p,:} \oslash \mathbf{\hat{y}}_{p,:}
\right)
\end{align*}
$$
where $\oslash$ denotes element-wise division. <br>
Therefore
$$
\begin{align*}
\frac{\partial L(\mathbf{\hat{Y}})}{\partial \mathbf{\hat{Y}}} &= -\frac{1}{M} \begin{bmatrix}
    \mathbf{y}_{1,:} \oslash \mathbf{\hat{y}}_{1,:} \\
    \mathbf{y}_{2,:} \oslash \mathbf{\hat{y}}_{2,:} \\
    \vdots \\
    \mathbf{y}_{M,:} \oslash \mathbf{\hat{y}}_{M,:}
\end{bmatrix} \\
&= -\frac{1}{M} \mathbf{Y} \oslash \mathbf{\hat{Y}}
\end{align*}
$$

### pull it all together

$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{Z}} &=
\frac{\partial L}{\partial \sigma}
\frac{\partial \sigma}{\partial \mathbf{Z}} \\
&\in \mathbb{R}^{{\color{Cyan} (M \times Q)} \times ({\color{Cyan} M \times Q} \times {\color{Orange} M \times Q})} \\
&\in \mathbb{R}^{\color{Orange} M \times Q}
\end{align*}
$$

#### $\frac{\partial L}{\partial \mathbf{W}}$
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{W}} &=
\frac{\partial L}{\partial \mathbf{Z}} 
\frac{\partial \mathbf{Z}}{\partial \mathbf{W}} \\
&\in \mathbb{R}^{{\color{Orange} (M \times Q)} \times ({\color{Orange} M \times Q} \times {\color{Magenta} N \times Q})} \\
&\in \mathbb{R}^{\color{Magenta} N \times Q}
\end{align*}
$$

#### $\frac{\partial L}{\partial \mathbf{b}}$
$$
\begin{align*}
\frac{\partial L}{\partial \mathbf{b}} &=
\frac{\partial L}{\partial \mathbf{Z}} 
\frac{\partial \mathbf{Z}}{\partial \mathbf{b}} \\
&\in \mathbb{R}^{{\color{Orange} (M \times Q)} \times ({\color{Orange} M \times Q} \times {\color{Magenta} Q})} \\
&\in \mathbb{R}^{\color{Magenta} Q}
\end{align*}
$$

In [10]:
@add_to_class(SoftmaxClassifier)
def update(self, x: torch.Tensor, y_true: torch.Tensor,
           y_pred: torch.Tensor, lr: float) -> None:
    m, n = x.shape
    m, n_classes = y_true.shape

    

    # weighted sum
    identity = torch.eye(n_classes)
    ## weight grad
    torch.kron(x.unsqueeze(1).unsqueeze(3), identity.unsqueeze(0).unsqueeze(2))
    ## bias grad
    identity.unsqueeze(0).expand(m, -1, -1)