In [23]:
from tensorflow.keras.layers import Softmax
from tensorflow.keras.config import set_floatx
import torch
import autograd.numpy as np
from autograd import jacobian

# Seed PyTorch's global RNG so the torch.randint inputs used throughout the
# notebook are identical on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Use double precision in both frameworks so the Keras reference and the
# hand-written PyTorch implementations can be compared at float64 accuracy.
set_floatx('float64')
torch.set_default_dtype(torch.float64)

In [62]:
# English translation of the Spanish TODO note below:
# - change the definition of softmax
# -- change the summation upper limit ^q => ^Q
'''
TODO
- cambiar la definición de softmax
-- cambiar sumatoria ^q => ^Q
'''

'\nTODO\n- cambiar la definición de softmax\n-- cambiar sumatoria ^q => ^Q\n'

In [24]:
def error(a_true, a_pred):
    """Mean relative error between two tensors, expressed as a percentage.

    Both arguments must expose ``.numpy()``. ``a_true`` is the reference: the
    element-wise absolute difference is divided by ``|a_true|``, so zeros in
    ``a_true`` lead to inf/nan entries.
    """
    reference = a_true.numpy()
    relative = np.abs(reference - a_pred.numpy()) / np.abs(reference)
    return np.mean(relative) * 100

def error_g(a_true, a_pred):
    """Mean relative error (in percent) of a tensor against a reference array.

    ``a_true`` is used directly in the arithmetic (the notebook passes the
    autograd Jacobian, a numpy array); ``a_pred`` must expose ``.numpy()``.
    The denominator is ``|a_true|``, so zeros in it lead to inf/nan entries.
    """
    deviation = np.abs(a_true - a_pred.numpy())
    relative = deviation / np.abs(a_true)
    return np.mean(relative) * 100

# Softmax function

## one example/entry

In [25]:
# M: number of examples, Q: number of classes (width of the softmax input).
M, Q = 100, 5
# One example of shape (1, Q): integers drawn from [-10, 10] and halved,
# i.e. multiples of 0.5 between -5 and 5.
X = torch.randint(-10, 11, (1, Q)) / 2
X

tensor([[ 3.0000, -3.0000,  4.0000, -1.5000, -4.5000]])

Softmax
$$
\text{softmax}(\boldsymbol{o})_j = \frac{\exp(o_j)}{\sum_{k=1}^{Q} \exp(o_k)}
$$
where $Q$ is the number of classes

In [26]:
# Reference implementation: the Keras Softmax layer, called directly on the
# torch tensor X. Its output is the baseline that the hand-written versions
# are compared against with error().
SOFTMAX = Softmax()
tf_soft = SOFTMAX(X)
tf_soft.numpy()

array([[2.67922497e-01, 6.64113473e-04, 7.28288856e-01, 2.97635009e-03,
        1.48183746e-04]])

In [27]:
def softmax_1(x):
    """Softmax of a single example, normalised over ALL elements of ``x``.

    Intended for one example (e.g. shape ``(1, Q)``); for a batch use the
    row-wise ``softmax`` instead.

    Args:
        x: torch tensor of logits.

    Returns:
        Tensor with the same shape as ``x`` whose entries sum to 1.
    """
    # softmax is invariant to adding a constant to every logit. Subtracting
    # the maximum keeps every exponent <= 0, so torch.exp cannot overflow to
    # inf (which would turn the result into nan via inf / inf).
    shifted = x - x.max()
    exp = torch.exp(shifted)
    return exp / exp.sum()

my_soft = softmax_1(X)
my_soft.numpy()

array([[2.67922497e-01, 6.64113473e-04, 7.28288856e-01, 2.97635009e-03,
        1.48183746e-04]])

In [28]:
# Error
error(tf_soft, my_soft)

1.2886447193544136e-14

## multiple examples/entries

In [29]:
X = torch.randint(-10, 11, (M, Q)) / 2
X.shape

torch.Size([100, 5])

In [30]:
tf_soft = SOFTMAX(X)
tf_soft.shape

TensorShape([100, 5])

In [31]:
def softmax(x):
    """Row-wise softmax over the last axis of ``x``.

    Args:
        x: torch tensor of logits, e.g. shape ``(M, Q)`` with one example per
            row.

    Returns:
        Tensor with the same shape as ``x``; every slice along the last axis
        sums to 1.
    """
    # softmax is invariant to adding a per-row constant. Subtracting each
    # row's maximum keeps every exponent <= 0, so torch.exp cannot overflow
    # to inf (which would turn the row into nan via inf / inf).
    shifted = x - x.amax(dim=-1, keepdim=True)
    exp = torch.exp(shifted)
    return exp / exp.sum(dim=-1, keepdim=True)

my_soft = softmax(X)
my_soft.shape

torch.Size([100, 5])

In [32]:
# error
error(tf_soft, my_soft)

1.1781507582645494e-14

# Derivative

## using jacobian

### $\frac{\partial \text{softmax}(o)_j}{\partial \boldsymbol{o}}$

In [51]:
# Zero-based index j of the softmax component being differentiated: it is
# used as exp[0, NUM_Q] in soft() and as the j argument of softmax_der_1().
NUM_Q = 3

# Fresh single example of shape (1, Q).
X = torch.randint(-10, 11, (1, Q)) / 2

def soft(x):
    """Component ``NUM_Q`` of the softmax of the first row of ``x``.

    Written with autograd's numpy so that ``jacobian`` can differentiate it.
    The numerator is taken from row 0; the denominator is the per-row sum of
    exponentials, so the result has one entry per row of ``x``.
    """
    exps = np.exp(x)
    numerator = exps[0, NUM_Q]
    denominator = np.sum(exps, axis=-1)
    return numerator / denominator

# The Jacobian from autograd is in numerator layout (output axes first);
# .T reverses all axes, which puts it in denominator layout.
gradient = jacobian(soft)
grad = gradient(X.numpy()).T  # .T: numerator layout -> denominator layout
print(grad.shape)
grad = grad[:,:,0]  # drop the trailing singleton axis -> (Q, 1)
print(grad.shape)
grad

(5, 1, 1)
(5, 1)


array([[-0.00406404],
       [-0.00406404],
       [-0.00055001],
       [ 0.14326054],
       [-0.13458246]])

$$
\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial \boldsymbol{o}} \in (1, Q) \times (1) \Leftrightarrow (Q,1)
$$
because $\boldsymbol{o} \in (1, Q)$ and $\text{softmax}(\boldsymbol{o})_j \in \mathbb{R}$
<br> <br>
Then, the jacobian in **Denominator layout** is:
$$
\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial \boldsymbol{o}} = \begin{bmatrix}
\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_1} & \cdots & \frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_j} & \cdots & \frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_Q}
\end{bmatrix}^T
$$
There are two different types of derivatives:
1. $\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_{i=j}}$

2. $\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_{i\neq j}}$

$\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_{i=j}}$
$$
\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_{i=j}} = \text{softmax}(\boldsymbol{o})_j\left (1-\text{softmax}(\boldsymbol{o})_j \right)
$$

$\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_{i\neq j}}$
$$
\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial o_{i\neq j}} = -\text{softmax}(\boldsymbol{o})_j \text{softmax}(\boldsymbol{o})_i
$$
Therefore:
$$
\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial \boldsymbol{o}} = \begin{bmatrix}
-\text{softmax}(\boldsymbol{o})_j \text{softmax}(\boldsymbol{o})_{1} & \cdots & \text{softmax}(\boldsymbol{o})_j\left (1-\text{softmax}(\boldsymbol{o})_j \right) & \cdots & -\text{softmax}(\boldsymbol{o})_j \text{softmax}(\boldsymbol{o})_Q
\end{bmatrix}^T
$$
Vectorized form:
$$
\frac{\partial \text{softmax}(\boldsymbol{o})_j}{\partial \boldsymbol{o}} = \text{softmax}(\boldsymbol{o})_j \odot \begin{bmatrix}
-\text{softmax}(\boldsymbol{o})_{1} & \cdots & 1-\text{softmax}(\boldsymbol{o})_j & \cdots & -\text{softmax}(\boldsymbol{o})_Q
\end{bmatrix}^T
$$

In [52]:
def softmax_der_1(x, j):
    """Gradient of ``softmax(x)_j`` with respect to ``x`` in denominator layout.

    Implements ``softmax_j * [-softmax_1, ..., 1 - softmax_j, ..., -softmax_Q]^T``.
    For ``x`` of shape ``(1, Q)`` the result has shape ``(Q, 1)``.
    """
    probs = softmax(x).T            # column of softmax values
    p_j = probs[j, 0].item()        # softmax(o)_j as a Python float
    bracket = probs.neg()           # -softmax(o)_i for every i
    bracket[j, 0] += 1              # entry j becomes 1 - softmax(o)_j
    return p_j * bracket

my_gradient = softmax_der_1(X, NUM_Q)
my_gradient.numpy()

array([[-0.00406404],
       [-0.00406404],
       [-0.00055001],
       [ 0.14326054],
       [-0.13458246]])

In [53]:
# error
error_g(grad, my_gradient)

2.047897635964624e-14

### $\frac{\partial \text{softmax}(\boldsymbol{o})}{\partial \boldsymbol{o}}$

In [58]:
# Remove the previous section's results (presumably so stale values cannot be
# reused by accident in this section).
del gradient, grad, X, my_gradient

# Fresh single example of shape (1, Q).
X = torch.randint(-10, 11, (1, Q)) / 2

def soft_numpy(x):
    """Softmax of a single example, written with (autograd) numpy.

    The exponentials are normalised by their sum over ALL elements of ``x``,
    so this is only a proper softmax for one example, e.g. shape ``(1, Q)``.
    """
    exps = np.exp(x)
    total = np.sum(exps)
    return exps / total

# Full Jacobian of the single-example softmax. .T reverses all axes, which
# turns the output-axes-first result into denominator layout.
gradient = jacobian(soft_numpy)
grad = gradient(X.numpy()).T
print(grad.shape)

# Drop the two singleton (example) axes, leaving the (Q, Q) matrix.
grad = grad[:,0,:,0]
grad.shape

(5, 1, 5, 1)


(5, 5)

Using the Jacobian matrix in **Denominator layout** and the derivatives we found above, this derivative is:
$$
\frac{\partial \text{soft}(\boldsymbol{o})}{\partial \boldsymbol{o}} = \begin{bmatrix}
\frac{\partial \text{soft}(\boldsymbol o)_1}{\partial \boldsymbol o} & \frac{\partial \text{soft}(\boldsymbol o)_2}{\partial \boldsymbol o} & \cdots & \frac{\partial \text{soft}(\boldsymbol o)_Q}{\partial \boldsymbol o}
\end{bmatrix} =
\begin{bmatrix}
\frac{\partial \text{soft}(\boldsymbol{o})_1}{\partial \boldsymbol{o}_1} & \frac{\partial \text{soft}(\boldsymbol{o})_2}{\partial \boldsymbol{o}_1} & \cdots  & \frac{\partial \text{soft}(\boldsymbol{o})_Q}{\partial \boldsymbol{o}_1}\\ 
\frac{\partial \text{soft}(\boldsymbol{o})_1}{\partial \boldsymbol{o}_2} & \frac{\partial \text{soft}(\boldsymbol{o})_2}{\partial \boldsymbol{o}_2} & \cdots & \frac{\partial \text{soft}(\boldsymbol{o})_Q}{\partial \boldsymbol{o}_2}\\ 
\vdots & \vdots & \ddots & \vdots\\ 
\frac{\partial \text{soft}(\boldsymbol{o})_1}{\partial \boldsymbol{o}_Q} & \frac{\partial \text{soft}(\boldsymbol{o})_2}{\partial \boldsymbol{o}_Q} & \cdots & \frac{\partial \text{soft}(\boldsymbol{o})_Q}{\partial \boldsymbol{o}_Q}
\end{bmatrix} \in (Q,Q)
$$
**Remark:** we abbreviate "softmax" to "soft". <br>
And using these derivatives:
$$
\begin{align}
\frac{\partial \text{soft}(\boldsymbol o)_j}{\partial o_{i=j}} &= \text{soft}(\boldsymbol o)_j\left (1-\text{soft}(\boldsymbol o)_j \right) \\
\frac{\partial \text{soft}(\boldsymbol o)_j}{\partial o_{i\neq j}} &= -\text{soft}(\boldsymbol o)_j \text{soft}(\boldsymbol o)_i
\end{align}
$$
Therefore:
$$
\frac{\partial \text{soft}(\boldsymbol{o})}{\partial \boldsymbol{o}} = \begin{bmatrix}
\text{soft}(\boldsymbol{o})_1(1-\text{soft}(\boldsymbol{o})_1) & -\text{soft}(\boldsymbol{o})_1\text{soft}(\boldsymbol{o})_2 & \cdots & -\text{soft}(\boldsymbol{o})_1\text{soft}(\boldsymbol{o})_Q\\ 
-\text{soft}(\boldsymbol{o})_2\text{soft}(\boldsymbol{o})_1 & \text{soft}(\boldsymbol{o})_2(1-\text{soft}(\boldsymbol{o})_2) & \cdots & -\text{soft}(\boldsymbol{o})_2\text{soft}(\boldsymbol{o})_Q\\ 
\vdots & \vdots & \ddots & \vdots\\ 
-\text{soft}(\boldsymbol{o})_Q\text{soft}(\boldsymbol{o})_1 & -\text{soft}(\boldsymbol{o})_Q\text{soft}(\boldsymbol{o})_2 & \cdots & \text{soft}(\boldsymbol{o})_Q(1-\text{soft}(\boldsymbol{o})_Q)
\end{bmatrix}
$$
Vectorized form:
$$
\begin{align}
\frac{\partial \text{soft}(\boldsymbol{o})}{\partial \boldsymbol{o}} &= \begin{bmatrix}
\text{soft}(\boldsymbol{o})_1 & 0 & \cdots & 0\\ 
0 & \text{soft}(\boldsymbol{o})_2 & \cdots & 0\\ 
\vdots & \vdots & \ddots & \vdots\\ 
0 & 0 & \cdots & \text{soft}(\boldsymbol{o})_Q
\end{bmatrix} - \begin{bmatrix}
\text{soft}(\boldsymbol{o})_1^2 & \text{soft}(\boldsymbol{o})_1\text{soft}(\boldsymbol{o})_2 & \cdots & \text{soft}(\boldsymbol{o})_1\text{soft}(\boldsymbol{o})_Q\\ 
\text{soft}(\boldsymbol{o})_2\text{soft}(\boldsymbol{o})_1 & \text{soft}(\boldsymbol{o})_2^2 & \cdots & \text{soft}(\boldsymbol{o})_2\text{soft}(\boldsymbol{o})_Q\\ 
\vdots & \vdots & \ddots & \vdots\\ 
\text{soft}(\boldsymbol{o})_Q\text{soft}(\boldsymbol{o})_1 & \text{soft}(\boldsymbol{o})_Q\text{soft}(\boldsymbol{o})_2 & \cdots & \text{soft}(\boldsymbol{o})_Q^2
\end{bmatrix} \\
&= \text{diag}(\text{soft}(\boldsymbol{o})) - \text{soft}(\boldsymbol{o})^T\text{soft}(\boldsymbol{o})
\end{align}
$$


In [59]:
grad

array([[ 1.77839290e-02, -3.64423465e-06, -3.28043549e-04,
        -6.58893081e-03, -1.08633104e-02],
       [-3.64423465e-06,  2.01165356e-04, -3.64423465e-06,
        -7.31964096e-05, -1.20680477e-04],
       [-3.28043549e-04, -3.64423465e-06,  1.77839290e-02,
        -6.58893081e-03, -1.08633104e-02],
       [-6.58893081e-03, -7.31964096e-05, -6.58893081e-03,
         2.31446480e-01, -2.18195422e-01],
       [-1.08633104e-02, -1.20680477e-04, -1.08633104e-02,
        -2.18195422e-01,  2.40042723e-01]])

In [60]:
def softmax_der_2(x):
    """Jacobian of the softmax of the first row of ``x``.

    Computes ``diag(s) - s^T s`` where ``s`` is the softmax of row 0; any
    further rows of ``x`` are ignored. The result is a square matrix with one
    row/column per class.
    """
    probs = softmax(x)[0]
    outer = probs[:, None] * probs[None, :]   # s_i * s_j for every pair
    return torch.diag(probs) - outer

my_gradient = softmax_der_2(X)
my_gradient

tensor([[ 1.7784e-02, -3.6442e-06, -3.2804e-04, -6.5889e-03, -1.0863e-02],
        [-3.6442e-06,  2.0117e-04, -3.6442e-06, -7.3196e-05, -1.2068e-04],
        [-3.2804e-04, -3.6442e-06,  1.7784e-02, -6.5889e-03, -1.0863e-02],
        [-6.5889e-03, -7.3196e-05, -6.5889e-03,  2.3145e-01, -2.1820e-01],
        [-1.0863e-02, -1.2068e-04, -1.0863e-02, -2.1820e-01,  2.4004e-01]])

In [61]:
error_g(grad, my_gradient)

1.0853235392070533e-14

### $\frac{\mathrm{d} \text{softmax}(\boldsymbol{O})}{\mathrm{d} \boldsymbol{O}}$

#### Problem Statement

$$
\boldsymbol{O} \in (M,Q)
$$
where $M$ is the number of examples/entries. Then the softmax function is:
$$
\text{soft}(\boldsymbol{O}) = \begin{bmatrix}
\text{soft}(\boldsymbol{O}_{1,:})\\ 
\text{soft}(\boldsymbol{O}_{2,:})\\ 
\vdots\\ 
\text{soft}(\boldsymbol{O}_{M,:})
\end{bmatrix} \in (M,Q)
$$
where
$$
\boldsymbol{O}_{p,:} = \begin{bmatrix}
O_{p,1} & O_{p,2} & \cdots & O_{p,Q}
\end{bmatrix} \in (1,Q)
$$
for all $p=1,...,M$ <br>
therefore
$$
\text{soft}(\boldsymbol{O}_{p,:}) = \begin{bmatrix}
\text{soft}(\boldsymbol{O}_{p,:})_1 & \text{soft}(\boldsymbol{O}_{p,:})_2 & \cdots & \text{soft}(\boldsymbol{O}_{p,:})_Q
\end{bmatrix}\in (1,Q)
$$

#### Derivative

In [40]:
# Remove the previous section's results (presumably so stale values cannot be
# reused by accident in this section).
del gradient, grad, X, my_gradient

# Batch of M examples, shape (M, Q).
X = torch.randint(-10, 11, (M, Q)) / 2

def soft_numpy(x):
    """Row-wise softmax written with (autograd) numpy.

    Each slice along the last axis is normalised by its own sum of
    exponentials, so a batch of shape ``(M, Q)`` gives one softmax per row.
    Note: this redefines the single-example ``soft_numpy`` from earlier in
    the notebook.
    """
    exps = np.exp(x)
    row_totals = np.sum(exps, axis=-1, keepdims=True)
    return exps / row_totals

# Jacobian of the batched softmax. It is kept in numerator layout here (no
# transpose, unlike the single-example cases); the printed shape is (M, Q, M, Q).
gradient = jacobian(soft_numpy) # this uses numerator layout
grad = gradient(X.numpy())
print(grad.shape)

(100, 5, 100, 5)


First, we need to know its shape:
$$
\begin{align}
\frac{\mathrm{d} {\color{Cyan} \text{soft}(\boldsymbol{O})}}{\mathrm{d} {\color{Orange} \boldsymbol{O}}} &\in {\color{Orange} (M,Q)} \times {\color{Cyan} (M,Q)}\\
\frac{\partial {\color{Cyan} \text{soft}(\boldsymbol{O}_{p,:})}}{\partial {\color{Orange} \boldsymbol{O}}} &\in {\color{Orange} (M,Q)} \times {\color{Cyan} (1,Q)} \\
\frac{\partial {\color{Cyan} \text{soft}(\boldsymbol{O}_{p,:})}}{\partial {\color{Orange} \boldsymbol{O}_{q,:}}} &\in {\color{Orange} (1,Q)} \times {\color{Cyan} (1,Q)} \Leftrightarrow (Q,Q)
\end{align}
$$
for all $p,q = 1, ..., M$. The last derivative $\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q,:}}$ is similar to our $\frac{\partial \text{soft}(\boldsymbol{o})}{\partial \boldsymbol{o}}$ but with some extra axes. <br>
Then, we can write the Jacobian in **Denominator layout** as:
$$
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}} = \begin{bmatrix}
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{1,:}} & \frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{2,:}} & \cdots & \frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{M,:}}
\end{bmatrix}^T \in (M,Q) \times (1,Q)
$$
and
$$
\frac{\mathrm{d} \text{soft}(\boldsymbol{O})}{\mathrm{d} \boldsymbol{O}} = \begin{bmatrix}
\frac{\partial \text{soft}(\boldsymbol{O}_{1,:})}{\partial \boldsymbol{O}} & \frac{\partial \text{soft}(\boldsymbol{O}_{2,:})}{\partial \boldsymbol{O}} & \cdots & \frac{\partial \text{soft}(\boldsymbol{O}_{M,:})}{\partial \boldsymbol{O}}
\end{bmatrix} \in (M,Q) \times (M,Q)
$$
for all $p,q = 1, ..., M$. <br>
But how do we compute $\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q,:}}$? There are two cases:
1. $\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q=p,:}}$

2. $\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q \neq p,:}}$

$\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q=p,:}}$
$$
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q=p,:}}=\text{diag}(\text{soft}(\boldsymbol{O}_{p,:})) - \text{soft}(\boldsymbol{O}_{p,:})^T\text{soft}(\boldsymbol{O}_{p,:})
$$
$\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q \neq p,:}}$
$$
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q \neq p,:}}=\boldsymbol{0}
$$

In [41]:
def softmax_der_3(x):
    """Jacobian of the row-wise softmax of ``x``, built block by block.

    Visits every pair of examples (p, q): the block for p == q is the
    single-example Jacobian from ``softmax_der_2``; blocks with p != q are
    left at zero. For ``x`` of shape ``(M, Q)`` the result has shape
    ``(M, Q, M, Q)``.
    """
    n_rows, n_cols = x.shape
    jac = torch.zeros((n_rows, n_cols, n_rows, n_cols))
    for p in range(n_rows):
        for q in range(n_rows):
            if p != q:
                # cross-example block: stays zero from the initialisation
                continue
            jac[p, :, q, :] = softmax_der_2(x[None, p, :])
    return jac

my_gradient = softmax_der_3(X)
my_gradient.shape

torch.Size([100, 5, 100, 5])

In [42]:
# error_g divides by |a_true|, and the blocks of the Jacobian with p != q are
# exactly zero, so both operands are shifted by 1e-32 to avoid 0 / 0.
error_g(grad + 1e-32, my_gradient + 1e-32)

1.963567018821016e-16

In [43]:
def softmax_der_4(x):
    """Jacobian of the row-wise softmax of ``x``, filling only diagonal blocks.

    Slightly faster than ``softmax_der_3``: it loops once over the examples
    and writes the single-example Jacobian from ``softmax_der_2`` into block
    (p, p); every other block stays zero. For ``x`` of shape ``(M, Q)`` the
    result has shape ``(M, Q, M, Q)``.
    """
    n_rows, n_cols = x.shape
    jac = torch.zeros((n_rows, n_cols, n_rows, n_cols))
    for p, row in enumerate(x):
        jac[p, :, p, :] = softmax_der_2(row[None, :])
    return jac

my_gradient = softmax_der_4(X)
my_gradient.shape

torch.Size([100, 5, 100, 5])

In [44]:
# error_g divides by |a_true|, and the blocks of the Jacobian with p != q are
# exactly zero, so both operands are shifted by 1e-32 to avoid 0 / 0.
error_g(grad + 1e-32, my_gradient + 1e-32)

1.963567018821016e-16

# Appendix A

$$
\begin{align}
o_i \in \mathbb{R} \\
\text{softmax}(\boldsymbol{o})_j \in \mathbb{R}
\end{align} 
$$

then
$$
\frac{\partial \text{softmax}(\boldsymbol o)_j}{\partial o_{i=j}} \in \mathbb{R}
$$

Derivative for this case $i=j$:
$$
\begin{align}
\frac{\partial \text{softmax}(\boldsymbol o)_j}{\partial o_{i=j}} &= \frac{\partial}{\partial o_j} \left(\frac{\exp(o_j)}{\sum_{k=1}^{Q} \exp(o_k)} \right) \\
&= \frac{\frac{\partial}{\partial o_j}(\exp{(o_j)}) \sum_{k=1}^{Q} \exp{(o_k)} -\exp{(o_j)} \frac{\partial}{\partial o_j} (\sum_{k=1}^{Q} \exp{(o_k)})}{\left( \sum_{k=1}^{Q} \exp{(o_k)} \right)^2} \\
&= \frac{\exp{(o_j)} (\sum_{k=1}^{Q} \exp{(o_k)}) -\exp{(o_j)}^2}{\left( \sum_{k=1}^{Q} \exp{(o_k)} \right)^2} \\
&= \frac{\exp{(o_j)} \left( (\sum_{k=1}^{Q} \exp{(o_k)}) -\exp{(o_j)} \right)}{\left( \sum_{k=1}^{Q} \exp{(o_k)} \right)^2} \\
&= \frac{\exp{(o_j)}}{\sum_{k=1}^{Q} \exp{(o_k)}} \left( \frac{\sum_{k=1}^{Q} \exp{(o_k)} -\exp{(o_j)}}{\sum_{k=1}^{Q} \exp{(o_k)}} \right)\\
&= \text{softmax}(\boldsymbol o)_j \left(\frac{\sum_{k=1}^{Q} \exp{(o_k)}}{\sum_{k=1}^{Q} \exp{(o_k)}} -\frac{\exp{(o_j)}}{\sum_{k=1}^{Q} \exp{(o_k)}}\right)\\
&= \text{softmax}(\boldsymbol o)_j \left(1 -\frac{\exp{(o_j)}}{\sum_{k=1}^{Q} \exp{(o_k)}}\right)\\
&= \text{softmax}(\boldsymbol o)_j\left (1-\text{softmax}(\boldsymbol o)_j \right)
\end{align}
$$
Derivative for this case $i\neq j$:
$$
\begin{align}
\frac{\partial \text{softmax}(\boldsymbol o)_j}{\partial o_{i\neq j}} &= \frac{\partial}{\partial o_i} \left(\frac{\exp(o_j)}{\sum_{k=1}^{Q} \exp(o_k)} \right) \\
&= \frac{\frac{\partial}{\partial o_i}(\exp{(o_j)}) \sum_{k=1}^{Q} \exp{(o_k)} -\exp{(o_j)} \frac{\partial}{\partial o_i} (\sum_{k=1}^{Q} \exp{(o_k)})}{\left( \sum_{k=1}^{Q} \exp{(o_k)} \right)^2} \\
&= -\frac{\exp{(o_j)} \exp{(o_i)}}{\left( \sum_{k=1}^{Q} \exp{(o_k)} \right)^2} \\
&= -\frac{\exp{(o_j)}}{\sum_{k=1}^{Q} \exp{(o_k)}} \frac{\exp{(o_i)}}{\sum_{k=1}^{Q} \exp{(o_k)}} \\
&= -\text{softmax}(\boldsymbol o)_j \text{softmax}(\boldsymbol o)_i
\end{align}
$$

# Appendix B

$$
\boldsymbol{O} \in (M,Q) \\
\boldsymbol{O}_{p,:} = \begin{bmatrix}
O_{p,1} & O_{p,2} & \cdots & O_{p,Q}
\end{bmatrix} \in (1,Q)\\
\text{soft}(\boldsymbol{O}_{p,:}) = \begin{bmatrix}
\text{soft}(O_{p,:})_1 & \text{soft}(O_{p,:})_2 & \cdots & \text{soft}(O_{p,:})_Q
\end{bmatrix} \in (1,Q)
$$
then
$$
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q,:}} \in (1,Q) \times (1,Q) \Leftrightarrow (Q,Q)
$$
for all $p,q = 1, ..., M$

$$
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q,:}} = \begin{bmatrix}
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{q,1}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{q,1}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{q,1}} \\
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{q,2}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{q,2}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{q,2}} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{q,Q}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{q,Q}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{q,Q}} \\
\end{bmatrix}
$$

for case $q=p$
$$
\begin{align}
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q=p,:}} &= \begin{bmatrix}
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{p,1}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{p,1}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{p,1}} \\
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{p,2}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{p,2}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{p,2}} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{p,Q}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{p,Q}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{p,Q}} \\
\end{bmatrix} \\
&= \begin{bmatrix}
\text{soft}(O_{p,:})_1(1-\text{soft}(O_{p,:})_1) & -\text{soft}(O_{p,:})_1\text{soft}(O_{p,:})_2 & \cdots & -\text{soft}(O_{p,:})_1\text{soft}(O_{p,:})_Q \\
-\text{soft}(O_{p,:})_2\text{soft}(O_{p,:})_1 & \text{soft}(O_{p,:})_2(1-\text{soft}(O_{p,:})_2) & \cdots & -\text{soft}(O_{p,:})_2\text{soft}(O_{p,:})_Q \\
\vdots & \vdots & \ddots & \vdots \\
-\text{soft}(O_{p,:})_Q\text{soft}(O_{p,:})_1 & -\text{soft}(O_{p,:})_Q\text{soft}(O_{p,:})_2 & \cdots & \text{soft}(O_{p,:})_Q(1-\text{soft}(O_{p,:})_Q) \\
\end{bmatrix} \\
&= \begin{bmatrix}
\text{soft}(O_{p,:})_1 & 0 & \cdots & 0 \\
0 & \text{soft}(O_{p,:})_2 & \cdots & 0 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \cdots & \text{soft}(O_{p,:})_Q
\end{bmatrix} -\\
& \begin{bmatrix}
\text{soft}(O_{p,:})_1^2 & \text{soft}(O_{p,:})_1\text{soft}(O_{p,:})_2 & \cdots & \text{soft}(O_{p,:})_1\text{soft}(O_{p,:})_Q \\
\text{soft}(O_{p,:})_2\text{soft}(O_{p,:})_1 & \text{soft}(O_{p,:})_2^2 & \cdots & \text{soft}(O_{p,:})_2\text{soft}(O_{p,:})_Q \\
\vdots & \vdots & \ddots & \vdots \\
\text{soft}(O_{p,:})_Q\text{soft}(O_{p,:})_1 & \text{soft}(O_{p,:})_Q\text{soft}(O_{p,:})_2 & \cdots & \text{soft}(O_{p,:})_Q^2 \\
\end{bmatrix} \\
&= \text{diag}(\text{soft}(\boldsymbol{O}_{p,:})) -\text{soft}(\boldsymbol{O}_{p,:})^T \text{soft}(\boldsymbol{O}_{p,:})
\end{align}
$$

for case $q \neq p$
$$
\begin{align}
\frac{\partial \text{soft}(\boldsymbol{O}_{p,:})}{\partial \boldsymbol{O}_{q\neq p,:}} &= \begin{bmatrix}
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{q,1}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{q,1}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{q,1}} \\
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{q,2}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{q,2}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{q,2}} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial \text{soft}(O_{p,:})_1}{\partial O_{q,Q}} & \frac{\partial \text{soft}(O_{p,:})_2}{\partial O_{q,Q}} & \cdots & \frac{\partial \text{soft}(O_{p,:})_Q}{\partial O_{q,Q}} \\
\end{bmatrix} \\
&= \boldsymbol{0}
\end{align}
$$

example, if $p=1$ and $q=2$:
