In [1]:
import torch
import autograd.numpy as np
from autograd import jacobian
from tensorflow.keras.layers import Softmax
from tensorflow.keras.config import set_floatx

In [2]:
# Run both frameworks in double precision so the TensorFlow reference and the
# PyTorch implementations are compared at float64 resolution.
set_floatx('float64')
torch.set_default_dtype(torch.float64)

# Every input in this notebook is drawn with torch.randint; seeding PyTorch's
# global RNG makes "Restart Kernel -> Run All" reproduce the same inputs.
SEED = 0
torch.manual_seed(SEED)

# Errors
Mean Absolute Percentage Error (MAPE)

In [3]:
def error(a_true: np.ndarray, a_pred: torch.Tensor) -> float:
    """Return the mean absolute percentage error of ``a_pred`` against ``a_true``.

    Args:
        a_true: Reference values as a NumPy array.
        a_pred: Predicted values as a PyTorch tensor that broadcasts against
            ``a_true``.

    Returns:
        ``mean(|a_true - a_pred| / |a_true|) * 100``.

    Note:
        Entries where ``a_true`` is zero divide by zero, so the result is
        ``inf`` or ``nan`` in that case.
    """
    # Tensor.numpy() raises for tensors that require grad or live on an
    # accelerator; detach() and cpu() are no-ops for plain CPU tensors.
    pred = a_pred.detach().cpu().numpy()
    e = np.abs(a_true - pred) / np.abs(a_true)
    return np.mean(e) * 100

# Softmax function

## one example/entry

In [4]:
# Number of classes, i.e. the width of every row of logits.
Q = 5

# One example: Q random logits in [-10, 10] on a 0.5 grid.
Z = torch.randint(low=-20, high=21, size=(1, Q)) / 2
Z

tensor([[-4.0000, -0.5000,  7.0000,  0.5000,  5.5000]])

We will denote $\text{softmax}$ by $\sigma$:
$$
\sigma(\mathbf{z})_{j} = \frac{\exp(z_{j})}{\sum_{k=1}^{Q} \exp(z_{k})}
$$
where $Q$ is the number of classes and $j = 1, \dots, Q$.

In [5]:
# Reference implementation: a Keras Softmax layer built with its default
# arguments; the same layer object is reused for the batched case further down.
SOFTMAX = Softmax()
tf_soft_1 = SOFTMAX(Z)
tf_soft_1.numpy()

array([[1.36317782e-05, 4.51422496e-04, 8.16191019e-01, 1.22709357e-03,
        1.82116833e-01]])

In [6]:
# our softmax function
def softmax_1(z: torch.Tensor) -> torch.Tensor:
    """Softmax of a single example.

    The normalization runs over *all* elements of ``z``, so this is only a
    per-example softmax when ``z`` holds one example (e.g. shape ``(1, Q)``).

    Args:
        z: Logits of one example.

    Returns:
        A tensor with the same shape as ``z`` whose elements sum to 1.
    """
    if z.numel() == 0:
        # max() is undefined on an empty tensor; there is nothing to normalize.
        return torch.exp(z)
    # Softmax is invariant to adding a constant to every logit. Subtracting the
    # maximum keeps every exponent <= 0, so exp() cannot overflow to inf (which
    # would otherwise turn the result into NaN for large logits).
    exp = torch.exp(z - z.max())
    return exp / exp.sum()

In [7]:
# evaluate our implementation on the same single example given to the Keras layer
my_soft_1 = softmax_1(Z)
my_soft_1.numpy()

array([[1.36317782e-05, 4.51422496e-04, 8.16191019e-01, 1.22709357e-03,
        1.82116833e-01]])

In [8]:
# percentage error of our softmax against the TensorFlow reference
error(tf_soft_1.numpy(), my_soft_1)

7.607709770028611e-15

## multiple examples/entries

In [9]:
# Number of examples/entries, i.e. rows of the batch.
M = 100

# A batch of M examples, each with Q random logits in [-10, 10] on a 0.5 grid.
Z = torch.randint(low=-20, high=21, size=(M, Q)) / 2
Z.shape

torch.Size([100, 5])

$$
\mathbf{Z} = \begin{bmatrix}
    \mathbf{z}_{1,:} \\
    \mathbf{z}_{2,:} \\
    \vdots \\
    \mathbf{z}_{M,:}
\end{bmatrix}
$$
then the softmax is applied to each row independently:
$$
\sigma(\mathbf{Z}) = \begin{bmatrix}
    \sigma(\mathbf{z}_{1,:}) \\
    \sigma(\mathbf{z}_{2,:}) \\
    \vdots \\
    \sigma(\mathbf{z}_{M,:}) \\
\end{bmatrix}
$$

In [10]:
# TensorFlow reference on the whole batch; the result keeps the (M, Q) shape of Z
tf_soft_2 = SOFTMAX(Z)
tf_soft_2.shape

TensorShape([100, 5])

In [11]:
# our function
def softmax_2(z: torch.Tensor) -> torch.Tensor:
    """Row-wise softmax: every slice along the last axis is normalized on its own.

    Args:
        z: Logits with the classes on the last axis, e.g. shape ``(M, Q)``.

    Returns:
        A tensor with the same shape as ``z`` whose slices along the last axis
        each sum to 1.
    """
    if z.numel() == 0:
        # max() over an empty axis is undefined; there is nothing to normalize.
        return torch.exp(z)
    # Softmax is invariant to a per-row shift. Subtracting each row's maximum
    # keeps every exponent <= 0, so exp() cannot overflow to inf (which would
    # otherwise turn that row into NaN).
    shifted = z - z.max(dim=-1, keepdim=True).values
    exp = torch.exp(shifted)
    return exp / exp.sum(dim=-1, keepdim=True)

# apply our row-wise softmax to the whole batch
my_soft_2 = softmax_2(Z)
my_soft_2.shape

torch.Size([100, 5])

In [12]:
# percentage error of softmax_2 against the TensorFlow reference
error(tf_soft_2.numpy(), my_soft_2)

1.2248504348326294e-14

# Gradient

## derivative of one softmax output with respect to one example

In [21]:
# Index j of the softmax output that will be differentiated.
N_FEATURE = 3

# A fresh single example: Q random logits in [-10, 10] on a 0.5 grid.
Z = torch.randint(low=-20, high=21, size=(1, Q)) / 2

def der_soft_1(z):
    """Softmax output number ``N_FEATURE`` of the single example in row 0 of ``z``.

    Uses the ``np`` imported at the top of the notebook (autograd's numpy) so
    that ``jacobian`` can differentiate it. Returns a scalar.
    """
    weights = np.exp(z)
    picked = weights[0, N_FEATURE]
    return picked / np.sum(weights)

# let autograd differentiate der_soft_1 with respect to its input; the result
# has the shape of z because der_soft_1 returns a scalar
gradient = jacobian(der_soft_1)
grad = gradient(Z.numpy())
print(grad.shape)
print(grad)

(1, 5)
[[-3.04374453e-07 -3.42528322e-14 -8.38139170e-12  3.05137337e-07
  -7.54468837e-10]]


$$
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial \mathbf{z}}
\in \mathbb{R} \times (1 \times Q) \Leftrightarrow 1 \times Q
$$
because $\mathbf{z} \in 1 \times Q$ and $\sigma(\mathbf{z})_{j} \in \mathbb{R}$. <br>
Its Jacobian in **numerator layout** is then:
$$
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial \mathbf{z}} =
\begin{bmatrix}
    \frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{1}} &
    \frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{2}} &
    \cdots &
    \frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{Q}}
\end{bmatrix}
$$
There are two different types of derivative:
1. $\frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{i=j}}$

2. $\frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{i\neq j}}$

First case:
$$
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{i=j}} = 
\sigma(\mathbf{z})_{j} (1 - \sigma(\mathbf{z})_j)
$$

Second case:
$$
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{i\neq j}} =
-\sigma(\mathbf{z})_{j} \sigma(\mathbf{z})_{i}
$$

Therefore:
$$
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial \mathbf{z}} =
\begin{bmatrix}
    -\sigma(\mathbf{z})_{j} \sigma(\mathbf{z})_{1} &
    \cdots &
    \sigma(\mathbf{z})_{j}(1 - \sigma(\mathbf{z})_j) &
    \cdots &
    -\sigma(\mathbf{z})_{j} \sigma(\mathbf{z})_{Q}
\end{bmatrix}
$$
or, in vectorized form (the scalar $\sigma(\mathbf{z})_j$ times a row vector):
$$
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial \mathbf{z}} =
\sigma(\mathbf{z})_j
\begin{bmatrix}
    -\sigma(\mathbf{z})_{1} &
    \cdots &
    1 - \sigma(\mathbf{z})_{j} &
    \cdots &
    -\sigma(\mathbf{z})_{Q}
\end{bmatrix}
$$

In [27]:
def my_der_soft_1(z, j):
    """Analytic gradient of softmax output ``j`` with respect to the example ``z``.

    Implements ``sigma_j * [-sigma_1, ..., 1 - sigma_j, ..., -sigma_Q]``.

    Args:
        z: Logits of one example, indexed as ``[0, j]`` (shape ``(1, Q)``).
        j: Index of the softmax output being differentiated.

    Returns:
        A tensor shaped like ``z`` holding the derivative with respect to each logit.
    """
    probs = softmax_1(z)
    # plain Python float, used as the scalar factor in front of the bracket
    pivot = probs[0, j].item()
    # bracket starts as -sigma_i everywhere ...
    bracket = -probs
    # ... and becomes 1 - sigma_j at position j
    bracket[0, j] = bracket[0, j] + 1
    return pivot * bracket

# analytic gradient for the same example and output index used with autograd
my_grad = my_der_soft_1(Z, N_FEATURE)
my_grad.numpy()

array([[-3.04374453e-07, -3.42528322e-14, -8.38139170e-12,
         3.05137337e-07, -7.54468837e-10]])

In [28]:
# percentage error of the analytic gradient against autograd's
error(grad, my_grad)

7.16346590378772e-15

## derivative of multiple softmax outputs with respect to one example/entry

In [29]:
# Drop the previous section's autograd results so they cannot be reused by accident.
try:
    del gradient, grad
except NameError:
    # The names are already absent (e.g. this cell was run on its own): nothing
    # to clean up. Only NameError is expected from `del`; anything else should
    # surface instead of being silently swallowed.
    pass

In [30]:
# A fresh single example: Q random logits in [-10, 10] on a 0.5 grid.
Z = torch.randint(low=-20, high=21, size=(1, Q)) / 2

def der_soft_2(z):
    """Softmax of ``z`` normalized over all of its elements.

    Uses the ``np`` imported at the top of the notebook (autograd's numpy) so
    that ``jacobian`` can differentiate it. The result has the shape of ``z``.
    """
    weights = np.exp(z)
    total = np.sum(weights)
    return weights / total

# full Jacobian of the softmax vector; autograd returns it with shape
# (output shape) + (input shape)
gradient = jacobian(der_soft_2)
grad = gradient(Z.numpy())
print(grad.shape)
grad

(1, 5, 1, 5)


array([[[[ 1.35476576e-01, -1.58367140e-02, -4.78227326e-04,
          -1.17018368e-01, -2.14326618e-03]],

        [[-1.58367140e-02,  8.84019584e-02, -2.90059535e-04,
          -7.09752282e-02, -1.29995665e-03]],

        [[-4.78227326e-04, -2.90059535e-04,  2.95080833e-03,
          -2.14326618e-03, -3.92552894e-05]],

        [[-1.17018368e-01, -7.09752282e-02, -2.14326618e-03,
           1.99742315e-01, -9.60545261e-03]],

        [[-2.14326618e-03, -1.29995665e-03, -3.92552894e-05,
          -9.60545261e-03,  1.30879307e-02]]]])

$$
\frac{\partial \sigma(\mathbf{z})}{\partial \mathbf{z}} \in
(1 \times Q) \times (1 \times Q)
$$
but, to simplify, we will use a lighter notation:
$$
\Rightarrow
\frac{\partial \sigma(\mathbf{z})}{\partial \mathbf{z}} \in 
Q \times Q
$$
i.e. we ignore the axes of size 1 for now. <br>
The derivative has the form:
$$
\begin{align*}
\frac{\partial \sigma(\mathbf{z})}{\partial \mathbf{z}} &=
\begin{bmatrix}
    \frac{\partial \sigma(\mathbf{z})_{1}}{\partial \mathbf{z}} \\
    \frac{\partial \sigma(\mathbf{z})_{2}}{\partial \mathbf{z}} \\
    \vdots \\
    \frac{\partial \sigma(\mathbf{z})_{Q}}{\partial \mathbf{z}}
\end{bmatrix} \\
&= \begin{bmatrix}
    \frac{\partial \sigma(\mathbf{z})_{1}}{\partial z_{1}} &
    \frac{\partial \sigma(\mathbf{z})_{1}}{\partial z_{2}} &
    \cdots &
    \frac{\partial \sigma(\mathbf{z})_{1}}{\partial z_{Q}} \\
    \frac{\partial \sigma(\mathbf{z})_{2}}{\partial z_{1}} &
    \frac{\partial \sigma(\mathbf{z})_{2}}{\partial z_{2}} &
    \cdots &
    \frac{\partial \sigma(\mathbf{z})_{2}}{\partial z_{Q}} \\
    \vdots & \vdots & \ddots & \vdots \\
    \frac{\partial \sigma(\mathbf{z})_{Q}}{\partial z_{1}} &
    \frac{\partial \sigma(\mathbf{z})_{Q}}{\partial z_{2}} &
    \cdots &
    \frac{\partial \sigma(\mathbf{z})_{Q}}{\partial z_{Q}} \\
\end{bmatrix}
\end{align*}
$$
then, using these derivatives:
$$
\begin{align*}
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{i=j}} &= 
\sigma(\mathbf{z})_{j} (1 - \sigma(\mathbf{z})_j) \\
\frac{\partial \sigma(\mathbf{z})_{j}}{\partial z_{i\neq j}} &=
-\sigma(\mathbf{z})_{j} \sigma(\mathbf{z})_{i}
\end{align*}
$$
therefore:
$$
\frac{\partial \sigma(\mathbf{z})}{\partial \mathbf{z}} =
\begin{bmatrix}
    \sigma(\mathbf{z})_{1} (1 - \sigma(\mathbf{z})_1) &
    -\sigma(\mathbf{z})_{1} \sigma(\mathbf{z})_{2} &
    \cdots &
    -\sigma(\mathbf{z})_{1} \sigma(\mathbf{z})_{Q} \\
    -\sigma(\mathbf{z})_{2} \sigma(\mathbf{z})_{1} &
    \sigma(\mathbf{z})_{2} (1 - \sigma(\mathbf{z})_2) &
    \cdots &
    -\sigma(\mathbf{z})_{2} \sigma(\mathbf{z})_{Q} \\
    \vdots & \vdots & \ddots & \vdots \\
    -\sigma(\mathbf{z})_{Q} \sigma(\mathbf{z})_{1} &
    -\sigma(\mathbf{z})_{Q} \sigma(\mathbf{z})_{2} &
    \cdots &
    \sigma(\mathbf{z})_{Q} (1 - \sigma(\mathbf{z})_Q)
\end{bmatrix}
$$
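or, in vectorized form (with $\sigma(\mathbf{z})$ a $1 \times Q$ row vector, so that $\sigma(\mathbf{z})^T \sigma(\mathbf{z})$ is the $Q \times Q$ outer product):
$$
\frac{\partial \sigma(\mathbf{z})}{\partial \mathbf{z}} =
\text{diag}(\sigma(\mathbf{z})) - \sigma(\mathbf{z})^T \sigma(\mathbf{z})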
$$

In [42]:
def my_der_soft_2(z):
    """Analytic softmax Jacobian ``diag(s) - outer(s, s)`` for one example.

    Args:
        z: Logits of one example with a leading axis of size 1 (shape ``(1, Q)``).

    Returns:
        The ``(Q, Q)`` matrix whose entry ``[j, i]`` is the derivative of
        softmax output ``j`` with respect to logit ``i``.
    """
    # torch.diag builds a diagonal matrix only from a 1-D input, and torch.outer
    # requires 1-D inputs, so take row 0 to drop the leading axis.
    probs = softmax_1(z)[0, :]
    diagonal = torch.diag(probs)
    cross = torch.outer(probs, probs)
    return diagonal - cross

# analytic Jacobian for the same example that was given to autograd
my_grad = my_der_soft_2(Z)
my_grad.numpy()

array([[ 1.35476576e-01, -1.58367140e-02, -4.78227326e-04,
        -1.17018368e-01, -2.14326618e-03],
       [-1.58367140e-02,  8.84019584e-02, -2.90059535e-04,
        -7.09752282e-02, -1.29995665e-03],
       [-4.78227326e-04, -2.90059535e-04,  2.95080833e-03,
        -2.14326618e-03, -3.92552894e-05],
       [-1.17018368e-01, -7.09752282e-02, -2.14326618e-03,
         1.99742315e-01, -9.60545261e-03],
       [-2.14326618e-03, -1.29995665e-03, -3.92552894e-05,
        -9.60545261e-03,  1.30879307e-02]])

In [44]:
# index away the two axes of size 1 so autograd's (1, Q, 1, Q) Jacobian lines
# up with our (Q, Q) matrix
error(grad[0,:,0,:], my_grad)

6.896657617298521e-15

## derivative of multiple softmax outputs with respect to multiple examples

### Problem statement

$$
\mathbf{Z} \in M \times Q
$$
where $M$ is the number of examples. Then the softmax function is:
$$
\sigma(\mathbf{Z}) = \begin{bmatrix}
    \sigma(\mathbf{Z}_{1,:}) \\
    \sigma(\mathbf{Z}_{2,:}) \\
    \vdots \\
    \sigma(\mathbf{Z}_{M,:})
\end{bmatrix} \in M \times Q
$$
where
$$
\mathbf{Z}_{p,:} = \begin{bmatrix}
    Z_{p1} & Z_{p2} & \cdots & Z_{pQ}
\end{bmatrix} \in 1 \times Q
$$
for all $p = 1, \dots, M$. Therefore
$$
\sigma(\mathbf{Z}_{p,:}) = \begin{bmatrix}
    \sigma(\mathbf{Z}_{p,:})_{1} & 
    \sigma(\mathbf{Z}_{p,:})_{2} & 
    \cdots & 
    \sigma(\mathbf{Z}_{p,:})_{Q}
\end{bmatrix} \in 1 \times Q
$$

### derivative

In [45]:
# Drop the previous section's autograd results so they cannot be reused by accident.
try:
    del gradient, grad
except NameError:
    # The names are already absent (e.g. this cell was run on its own): nothing
    # to clean up. Only NameError is expected from `del`; anything else should
    # surface instead of being silently swallowed.
    pass

In [46]:
# A fresh batch of M examples, each with Q random logits in [-10, 10] on a 0.5 grid.
Z = torch.randint(low=-20, high=21, size=(M, Q)) / 2

def der_soft_3(z):
    """Row-wise softmax: each slice of ``z`` along the last axis is normalized on its own.

    Uses the ``np`` imported at the top of the notebook (autograd's numpy) so
    that ``jacobian`` can differentiate it. The result has the shape of ``z``.
    """
    weights = np.exp(z)
    # keepdims leaves a trailing axis of size 1 so the division broadcasts per row
    row_totals = np.sum(weights, axis=-1, keepdims=True)
    return weights / row_totals

# Jacobian of every softmax entry with respect to every input entry; autograd
# returns it with shape (output shape) + (input shape), i.e. (M, Q, M, Q)
gradient = jacobian(der_soft_3)
grad = gradient(Z.numpy())
print(grad.shape)

(100, 5, 100, 5)
