In [2]:
import tensorflow as tf
import torch
import numpy as np

# Use float64 in both frameworks so Keras and PyTorch results can be compared
# at double precision.
tf.keras.config.set_floatx('float64')
torch.set_default_dtype(torch.float64)

# Seed the random generators used by later cells (np.random.random, torch.rand)
# so that Restart & Run All reproduces the same inputs every time.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Softmax function

## One example/entry

In [3]:
# Number of classes, i.e. the width of every logit row in this notebook.
N = 4
# One example: a single row of N logits drawn uniformly from [0, 3).
X = 3 * np.random.random(size=(1, N))
X, X.dtype

(array([[1.61124441, 1.9986418 , 2.17121789, 2.18079716]]), dtype('float64'))

In [4]:
# Reference implementation: the Keras Softmax layer. It is reused by the
# comparison cells below as the ground truth for the hand-written versions.
SOFTMAX = tf.keras.layers.Softmax()
SOFTMAX(X).numpy()

array([[0.16691024, 0.24588245, 0.29219741, 0.2950099 ]])

Softmax
$$
\text{softmax}(\boldsymbol{o})_j = \frac{\exp(o_j)}{\sum_{k=1}^{q} \exp(o_k)}
$$
where $q$ is the number of classes

In [5]:
def softmax_1(o):
    """Softmax of a single example, normalized over ALL elements of ``o``.

    Args:
        o: torch.Tensor of logits holding one example (e.g. shape ``(1, q)``).

    Returns:
        torch.Tensor with the same shape as ``o`` whose entries sum to 1.
    """
    if o.numel() == 0:
        # Nothing to normalize; keep the empty result the plain formula gives.
        return torch.exp(o)
    # Softmax is invariant to adding a constant to every logit, so shift by the
    # maximum: the largest exponent becomes exp(0) = 1, which avoids overflow
    # (inf / inf = nan) for large logits and 0 / 0 for very negative ones.
    expo = torch.exp(o - o.max())
    return expo / expo.sum()

# Apply the hand-written softmax to the same example X used with the Keras layer.
softmax_1(torch.tensor(X)).numpy()

array([[0.16691024, 0.24588245, 0.29219741, 0.2950099 ]])

In [6]:
# Mean relative error (in percent) of softmax_1 against the Keras layer on X.
tf_res = SOFTMAX(X).numpy()
my_res = softmax_1(torch.tensor(X)).numpy()

rel_err = np.abs((tf_res - my_res) / tf_res)
print(rel_err.mean() * 100)

# Remove the temporaries so they do not leak into later cells.
del tf_res, my_res, rel_err

4.7041769595653436e-15


## Softmax over matrices

In [7]:
# Batch of M examples, each a row of N logits drawn uniformly from [0, 4).
M = 100
X2 = 4 * np.random.random(size=(M, N))
X2.shape, X2.dtype

((100, 4), dtype('float64'))

In [8]:
def softmax_2(o):
    """Row-wise softmax: normalizes over the last dimension of ``o``.

    Args:
        o: torch.Tensor of logits, e.g. shape ``(m, q)`` with one example per row.

    Returns:
        torch.Tensor with the same shape as ``o``; each slice along the last
        dimension sums to 1.
    """
    if o.numel() == 0:
        # Nothing to normalize; keep the empty result the plain formula gives.
        return torch.exp(o)
    # Shift each row by its own maximum. Softmax is unchanged by a per-row
    # constant, and the shift keeps exp() from overflowing to inf (-> nan)
    # or underflowing every entry to 0 (-> 0 / 0).
    row_max = torch.amax(o, dim=-1, keepdim=True)
    exp = torch.exp(o - row_max)
    return exp / torch.sum(exp, dim=-1, keepdim=True)

In [9]:
# Mean relative error (in percent) of softmax_2 against the Keras layer on the batch X2.
tf_res = SOFTMAX(X2).numpy()
my_res = softmax_2(torch.tensor(X2)).numpy()

rel_err = np.abs((tf_res - my_res) / tf_res)
print(rel_err.mean() * 100)

# Remove the temporaries so they do not leak into later cells.
del tf_res, my_res, rel_err

1.1082225706535181e-14


# Softmax derivative

## $\frac{\partial \text{softmax}(o)_j}{\partial \boldsymbol{o}}$

The derivative of one softmax output with respect to the input

In [10]:
POINT = 2 # Index j of the softmax output to differentiate (feature number 2)

# Random logits in [1, 5) with gradient tracking switched on.
X3 = torch.rand((1, N)) * 4 + 1
X3.requires_grad_(True)

# Y is softmax(X3)[0, POINT]; keepdim=True makes the denominator (1, 1), so Y is (1, 1).
Y = torch.exp(X3[0,POINT]) / torch.sum(torch.exp(X3), dim=1, keepdim=True)
# Y is not a 0-dim scalar, so backward() is given an explicit upstream gradient of ones.
Y.backward(torch.ones_like(Y))

# Gradient of softmax(X3)[0, POINT] with respect to X3 (same shape as X3).
gradient = X3.grad

gradient.shape

torch.Size([1, 4])

$$
\frac{\partial \text{softmax}(o)_j}{\partial \boldsymbol{o}} \in \mathbb{R}^{1 \times q}
$$

because $\boldsymbol{o} \in \mathbb{R}^{1 \times q}$ and $\text{softmax}(o)_j \in \mathbb{R}$
<br> <br>
Then, the Jacobian in **denominator layout** is:
$$
\frac{\partial \text{softmax}(o)_j}{\partial \boldsymbol{o}} = \begin{bmatrix}
\frac{\partial \text{softmax}(o)_j}{\partial o_1} & \cdots & \frac{\partial \text{softmax}(o)_j}{\partial o_j} & \cdots & \frac{\partial \text{softmax}(o)_j}{\partial o_q}
\end{bmatrix}
$$

There are two different types of derivatives:
1. $\frac{\partial \text{softmax}(o)_j}{\partial o_{i=j}}$
2. $\frac{\partial \text{softmax}(o)_j}{\partial o_{i\neq j}}$

$\frac{\partial \text{softmax}(o)_j}{\partial o_{i=j}}$

$$
\frac{\partial \text{softmax}(o)_j}{\partial o_{i=j}} = \text{softmax}(o)_j\left (1-\text{softmax}(o)_j \right)
$$

$\frac{\partial \text{softmax}(o)_j}{\partial o_{i\neq j}}$

$$
\frac{\partial \text{softmax}(o)_j}{\partial o_{i\neq j}} = -\text{softmax}(o)_j \text{softmax}(o)_i
$$

Therefore:
$$
\frac{\partial \text{softmax}(o)_j}{\partial \boldsymbol{o}} = \begin{bmatrix}
-\text{softmax}(o)_j \text{softmax}(o)_1 & \cdots & \text{softmax}(o)_j\left (1-\text{softmax}(o)_j \right) & \cdots & -\text{softmax}(o)_j \text{softmax}(o)_q
\end{bmatrix}
$$

Vectorized form:
$$
\frac{\partial \text{softmax}(o)_j}{\partial \boldsymbol{o}} = \text{softmax}(o)_j \cdot \begin{bmatrix}
-\text{softmax}(o)_1 & \cdots & 1-\text{softmax}(o)_j & \cdots & -\text{softmax}(o)_q
\end{bmatrix}
$$

In [11]:
# Autograd gradient of softmax(X3)[0, POINT] with respect to X3, for comparison
# with the analytic formula.
gradient.numpy()

array([[-0.01709643, -0.1193966 ,  0.15512242, -0.01862939]])

In [12]:
def softmax_derivative_1(o, point):
    """Analytic gradient of softmax(o)[0, point] with respect to every entry of ``o``.

    Implements softmax(o)_j * [-softmax(o)_1, ..., 1 - softmax(o)_j, ..., -softmax(o)_q]
    with j = ``point``.

    Args:
        o: torch.Tensor holding one row of logits (indexed as ``[0, point]``).
        point: column index j of the softmax output being differentiated.

    Returns:
        torch.Tensor with the same shape as ``o``.
    """
    probs = softmax_1(o)
    # softmax(o)_j as a plain Python float, read before the bracket is built.
    p_j = probs[0, point].item()
    # Bracketed factor: -softmax(o)_i everywhere, then 1 - softmax(o)_j at j.
    bracket = -probs
    bracket[0, point] += 1
    # Scale the whole bracket by the scalar softmax(o)_j.
    return p_j * bracket

# Analytic gradient for the same X3 and POINT; X3 tracks gradients, so the result
# is detached from the autograd graph before converting to NumPy.
softmax_derivative_1(X3, POINT).detach().numpy()

array([[-0.01709643, -0.1193966 ,  0.15512242, -0.01862939]])

In [13]:
# Mean relative error (in percent) of the analytic gradient against autograd.
my_grad = softmax_derivative_1(X3, POINT).detach().numpy()

rel_err = np.abs((gradient.numpy() - my_grad) / gradient.numpy())
print(rel_err.mean() * 100)

# Remove the temporaries so they do not leak into later cells.
del my_grad, rel_err

4.65587947277312e-15


## $\frac{\partial \text{softmax}(\boldsymbol{o})}{\partial \boldsymbol{o}}$

The derivative of all softmax outputs with respect to the input

In [24]:
# Fresh random logits in [1, 5) with gradient tracking switched on.
X4 = torch.rand((1, N)) * 4 + 1
X4.requires_grad_(True)

# Full softmax over the row: Y2 has the same (1, N) shape as X4.
Y2 = torch.exp(X4) / torch.sum(torch.exp(X4), dim=1, keepdim=True)
# NOTE(review): backward() with an all-ones upstream gradient does not produce the
# Jacobian. It produces d(sum_j softmax(X4)_j) / dX4, and because the softmax
# outputs always sum to 1 that quantity is zero up to rounding.
Y2.backward(torch.ones_like(Y2))

# NOTE: this rebinds `gradient`, the name earlier cells use for the single-output case.
gradient = X4.grad

gradient.shape

torch.Size([1, 4])

In [25]:
# Entries are ~0 (rounding noise): this is the gradient of the SUM of all softmax
# outputs, which is constant (1), not the Jacobian of softmax.
gradient.numpy()

array([[5.55111512e-17, 5.55111512e-17, 0.00000000e+00, 1.38777878e-17]])

In [None]:
# TODO: unfinished cell. The original line `t1 = ` had no right-hand side, which is
# a SyntaxError and stops Restart & Run All. Fill in the intended computation here.

# Appendix A

$$
\begin{align}
\boldsymbol{o}_i \in \mathbb{R} \\
\text{softmax}(\boldsymbol{o})_j \in \mathbb{R}
\end{align} 
$$

then
$$
\frac{\partial \text{softmax}(o)_j}{\partial o_{i=j}} \in \mathbb{R}
$$

Derivative for this case $i=j$:
$$
\begin{align}
\frac{\partial \text{softmax}(o)_j}{\partial o_{i=j}} &= \frac{\partial}{\partial o_j} \left(\frac{\exp(o_j)}{\sum_{k=1}^{q} \exp(o_k)} \right) \\
&= \frac{\frac{\partial}{\partial o_j}(\exp{(o_j)}) \sum_{k=1}^{q} \exp{(o_k)} -\exp{(o_j)} \frac{\partial}{\partial o_j} (\sum_{k=1}^{q} \exp{(o_k)})}{\left( \sum_{k=1}^{q} \exp{(o_k)} \right)^2} \\
&= \frac{\exp{(o_j)} (\sum_{k=1}^{q} \exp{(o_k)}) -\exp{(o_j)}^2}{\left( \sum_{k=1}^{q} \exp{(o_k)} \right)^2} \\
&= \frac{\exp{(o_j)} \left( (\sum_{k=1}^{q} \exp{(o_k)}) -\exp{(o_j)} \right)}{\left( \sum_{k=1}^{q} \exp{(o_k)} \right)^2} \\
&= \frac{\exp{(o_j)}}{\sum_{k=1}^{q} \exp{(o_k)}} \left( \frac{\sum_{k=1}^{q} \exp{(o_k)} -\exp{(o_j)}}{\sum_{k=1}^{q} \exp{(o_k)}} \right)\\
&= \text{softmax}(o)_j \left(\frac{\sum_{k=1}^{q} \exp{(o_k)}}{\sum_{k=1}^{q} \exp{(o_k)}} -\frac{\exp{(o_j)}}{\sum_{k=1}^{q} \exp{(o_k)}}\right)\\
&= \text{softmax}(o)_j \left(1 -\frac{\exp{(o_j)}}{\sum_{k=1}^{q} \exp{(o_k)}}\right)\\
&= \text{softmax}(o)_j\left (1-\text{softmax}(o)_j \right)
\end{align}
$$

$$
\frac{\partial \text{softmax}(o)_j}{\partial o_{i\neq j}} \in \mathbb{R}
$$

Derivative for this case $i\neq j$:
$$
\begin{align}
\frac{\partial \text{softmax}(o)_j}{\partial o_{i\neq j}} &= \frac{\partial}{\partial o_i} \left(\frac{\exp(o_j)}{\sum_{k=1}^{q} \exp(o_k)} \right) \\
&= \frac{\frac{\partial}{\partial o_i}(\exp{(o_j)}) \sum_{k=1}^{q} \exp{(o_k)} -\exp{(o_j)} \frac{\partial}{\partial o_i} (\sum_{k=1}^{q} \exp{(o_k)})}{\left( \sum_{k=1}^{q} \exp{(o_k)} \right)^2} \\
&= -\frac{\exp{(o_j)} \exp{(o_i)}}{\left( \sum_{k=1}^{q} \exp{(o_k)} \right)^2} \\
&= -\frac{\exp{(o_j)}}{\sum_{k=1}^{q} \exp{(o_k)}} \frac{\exp{(o_i)}}{\sum_{k=1}^{q} \exp{(o_k)}} \\
&= -\text{softmax}(o)_j \text{softmax}(o)_i
\end{align}
$$