In [1]:
from autograd import jacobian, numpy as np

from platform import python_version
python_version()

'3.14.0'

In [2]:
# This cell imports np_mape (aliased as `mape`), whether you are
# running this notebook locally or from Google Colab.

import os
import sys

# Make the directory two levels up importable so that the `tools`
# package can be found when the notebook runs inside the repository.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

try:
    from tools.numpy_metrics import np_mape as mape
    print('mape imported locally.')
except ModuleNotFoundError:
    # `tools` is not importable: fetch the single module file instead.
    import subprocess

    repo_url = 'https://raw.githubusercontent.com/PilotLeoYan/inside-deep-learning/main/content/tools/numpy_metrics.py'
    local_file = 'numpy_metrics.py'
    # Requires the `wget` executable; check=True raises if the download fails.
    subprocess.run(['wget', repo_url, '-O', local_file], check=True)
    try:
        from numpy_metrics import np_mape as mape # type: ignore
        print('mape imported from GitHub.')
    except Exception as e:
        # NOTE(review): the error is only printed, so `mape` stays undefined
        # and later cells that call it will raise NameError.
        print(e)

mape imported locally.


# Dataset

In [6]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder

# Load the Iris dataset bundled with scikit-learn.
data = load_iris()
# One-hot encoder that returns a dense array (sparse_output=False).
enc = OneHotEncoder(handle_unknown='ignore', 
    sparse_output=False)

# X: feature matrix.
X = data.data
# Y: one-hot targets; target[:, None] turns the label vector into a
# single column before it is passed to the encoder.
Y = enc.fit_transform(data.target[:, None])

print(X.shape)
print(Y.shape)

(150, 4)
(150, 3)


# Defs

In [8]:
# Fix the seed so the random parameters, and every number derived from
# them, are identical after Restart Kernel -> Run All.
np.random.seed(0)

# bias: one entry per output class, shape (Y.shape[-1],)
b: np.ndarray = np.random.randn(Y.shape[-1])
# weights: shape (X.shape[-1], Y.shape[-1])
w: np.ndarray = np.random.randn(
    X.shape[-1], Y.shape[-1])

In [9]:
def weighted_sum(b, w):
    """Affine map of the notebook-level matrix ``X``.

    Multiplies ``X`` by ``w`` and adds ``b`` to the product
    (``b`` is broadcast against the result by NumPy's usual rules).
    """
    projected = np.matmul(X, w)
    return b + projected

In [10]:
# Forward pass: pre-activations computed from the current b and w.
Z = weighted_sum(b, w)
Z.shape

(150, 3)

In [22]:
def softmax(z):
    """Softmax over the last axis of ``z``.

    The maximum along the last axis is subtracted before exponentiating.
    Softmax is invariant to such a shift, so the result is mathematically
    unchanged, but ``exp`` can no longer overflow (inf / inf = nan) for
    large logits, nor can every term underflow to zero (0 / 0 = nan) for
    very negative ones.

    Args:
        z: array of logits; normalization runs over its last axis.

    Returns:
        Array of the same shape as ``z`` whose entries along the last
        axis are positive and sum to 1.
    """
    # Largest shifted logit is exactly 0, so exp() stays within (0, 1].
    shifted = z - np.max(z, axis=-1, keepdims=True)
    e = np.exp(shifted)
    return e / (e.sum(axis=-1, keepdims=True))

In [23]:
# Softmax of the pre-activations Z (normalized over the last axis).
A = softmax(Z)
A.shape

(150, 3)

In [28]:
def ce(y_pred, y_true=None):
    """Cross-entropy between predictions and targets, averaged over rows.

    Args:
        y_pred: array of predicted probabilities.
        y_true: target array with the same shape as ``y_pred``. When
            omitted, the notebook-level ``Y`` is used, so existing
            ``ce(A)`` calls behave exactly as before.

    Returns:
        The scalar ``-sum(y_true * log(y_pred)) / len(y_true)``.
    """
    if y_true is None:
        # Backward-compatible default: the one-hot targets defined earlier.
        y_true = Y
    return - np.sum(y_true * np.log(y_pred)) / len(y_true)

# CE derivative

In [29]:
# true derivative
# Reference value: autograd differentiates ce with respect to its input A.
dce_dyp = jacobian(ce)(A)
dce_dyp.shape

(150, 3)

$$
\begin{align}
\frac{\partial \text{CE}}{\partial a_{pq}} &=
\frac{\partial}{\partial a_{pq}} \left( - \frac{1}{m} \sum_{i=1}^{m} \sum_{j=1}^{n_{o}} y_{ij} \log(a_{ij}) \right) \\
&= -\frac{1}{m} \sum_{i=1}^{m} \sum_{j=1}^{n_{o}} \frac{\partial}{\partial a_{pq}} \left(y_{ij} \log(a_{ij}) \right) \\
&= -\frac{1}{m} \sum_{i=1}^{m} \sum_{j=1}^{n_{o}} \frac{y_{ij}}{a_{ij}} \frac{\partial a_{ij}}{\partial a_{pq}} \\
&= -\frac{1}{m} \sum_{i=1}^{m} \sum_{j=1}^{n_{o}} \frac{y_{ij}}{a_{ij}} \delta_{ip} \delta_{jq} \\
&= -\frac{1}{m} \sum_{i=1}^{m} \frac{y_{iq}}{a_{iq}} \delta_{ip} \\
&= -\frac{1}{m} \left( \frac{y_{pq}}{a_{pq}} \right) \\
\end{align}
$$

for all $p = 1, \ldots, m$, and $q = 1, \ldots, n_{o}$.

Vectorized form

$$
\frac{\partial \text{CE}}{\partial \mathbf{A}} =
- \frac{1}{m} \left( \mathbf{Y \oslash A} \right)
$$

where $\oslash$ denotes *element-wise division*.

In [32]:
# our derivative
# Closed form from the derivation: -(Y / A) / m, with m = len(Y).
our_dce_dyp = - (Y / A) / len(Y)
our_dce_dyp.shape

(150, 3)

In [33]:
# Compare our closed form with autograd's result (np_mape; expected ~0).
mape(our_dce_dyp, dce_dyp)

2.4199313968930158e-17

# Softmax derivative

In [34]:
# Reference value: autograd Jacobian of softmax evaluated at Z.
# The result is indexed as [p, q, r, s] = d a_pq / d z_rs.
da_dz = jacobian(softmax)(Z)
da_dz.shape

(150, 3, 150, 3)

$$
\begin{align}
\frac{\partial a_{pq}}{\partial z_{rs}} &= 
\frac{\partial}{\partial z_{rs}} \left( \frac{\exp(z_{pq})} {\sum_{k=1}^{n_{o}} \exp(z_{pk})} \right) \\
\end{align}
$$

$$
\begin{align}
\frac{\partial}{\partial z_{rs}} \left( \exp(z_{pq}) \right) &= 
\exp(z_{pq}) \frac{\partial z_{pq}}{\partial z_{rs}} \\
&= \exp(z_{pq}) \delta_{pr} \delta_{qs} \\
\end{align}
$$

$$
\begin{align}
\frac{\partial}{\partial z_{rs}} \left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right) &=
\sum_{k=1}^{n_{o}} \frac{\partial}{\partial z_{rs}} \exp(z_{pk}) \\
&= \sum_{k=1}^{n_{o}} \exp(z_{pk}) \frac{\partial z_{pk}}{\partial z_{rs}} \\
&= \sum_{k=1}^{n_{o}} \exp(z_{pk}) \delta_{pr} \delta_{ks} \\
&= \exp(z_{ps}) \delta_{pr} \\
\end{align}
$$

$$
\begin{align}
\frac{\partial a_{pq}}{\partial z_{rs}} &= 
\frac{\partial}{\partial z_{rs}} \left( \frac{\exp(z_{pq})} {\sum_{k=1}^{n_{o}} \exp(z_{pk})} \right) \\
&= \frac{\exp(z_{pq}) \delta_{pr} \delta_{qs} \left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right) - \exp(z_{pq}) \exp(z_{ps}) \delta_{pr}}{\left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right)^{2}} \\
&= \frac{\exp(z_{pq}) \left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right)}{\left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right)^{2}}  \delta_{pr} \delta_{qs}
- \frac{\exp(z_{pq}) \exp(z_{ps})}{\left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right)^{2}} \delta_{pr}\\
&= \frac{\exp(z_{pq})}{\sum_{k=1}^{n_{o}} \exp(z_{pk})} \delta_{pr} \delta_{qs}
- \frac{\exp(z_{pq}) \exp(z_{ps})}{\left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right)^{2}} \delta_{pr} \\
&= a_{pq} \delta_{pr} \delta_{qs}
- \frac{\exp(z_{pq}) \exp(z_{ps})}{\left( \sum_{k=1}^{n_{o}} \exp(z_{pk}) \right)^{2}} \delta_{pr} \\
&= a_{pq} \delta_{pr} \delta_{qs}
- \frac{\exp(z_{pq})}{\sum_{k=1}^{n_{o}} \exp(z_{pk})} \left( \frac{\exp(z_{ps})}{\sum_{k=1}^{n_{o}} \exp(z_{pk})} \right) (\delta_{pr}) \\
&= a_{pq} \delta_{pr} \delta_{qs}
- a_{pq} a_{ps} \delta_{pr} \\
&= a_{pq} \delta_{pr} \left( \delta_{qs} - a_{ps} \right) \\
&= a_{pq} \delta_{pr} \delta_{qs} - a_{pq} a_{ps} \delta_{pr}
\end{align}
$$

for all $p, r = 1, \ldots, m$, and $q, s = 1, \ldots, n_{o}$.

In [39]:
# Kronecker deltas as identity matrices: delta_pr (samples) and delta_qs (classes).
delta_samples = np.eye(len(Y))
delta_classes = np.eye(Y.shape[-1])

# first term: a_pq * delta_pr * delta_qs
first_term = np.einsum('pq,pr,qs->pqrs', A, delta_samples, delta_classes)
# second term: a_pq * a_ps * delta_pr
second_term = np.einsum('pq,ps,pr->pqrs', A, A, delta_samples)

our_da_dz = first_term - second_term
our_da_dz.shape

(150, 3, 150, 3)

In [40]:
# Compare our closed form with autograd's Jacobian (np_mape; expected ~0).
mape(our_da_dz, da_dz)

7.726957522700263e-18

# Weighted sum derivative

## respect to bias

In [41]:
# Reference value: autograd Jacobian of weighted_sum with respect to
# its first argument (argnum 0, the bias b).
dz_db = jacobian(weighted_sum, 0)(b, w)
dz_db.shape

(150, 3, 3)

$$
\begin{align}
\frac{\partial z_{rs}}{\partial b_{t}} &= 
\frac{\partial}{\partial b_{t}} \left( b_{s} + \sum_{k=1}^{n} x_{rk} w_{ks} \right) \\
&= \frac{\partial b_{s}}{\partial b_{t}} \\
&= \delta_{st}
\end{align}
$$

for all $r = 1, \ldots, m$, and $s, t = 1, \ldots, n_{o}$.

In [43]:
# d z_rs / d b_t = delta_st: each sample r gets the same identity block.
n_samples, n_out = len(Y), Z.shape[-1]
s = np.arange(n_out)
our_dz_db = np.zeros((n_samples, n_out, n_out))
our_dz_db[:, s, s] = 1  # ones where the last two indices coincide
our_dz_db.shape

(150, 3, 3)

In [44]:
# Compare our closed form with autograd's Jacobian (np_mape; expected ~0).
mape(our_dz_db, dz_db)

0.0

## respect to weight

In [45]:
# Reference value: autograd Jacobian of weighted_sum with respect to
# its second argument (argnum 1, the weights w).
dz_dw = jacobian(weighted_sum, 1)(b, w)
dz_dw.shape

(150, 3, 4, 3)

$$
\begin{align}
\frac{\partial z_{rs}}{\partial w_{tu}} &=
\frac{\partial}{\partial w_{tu}} \left( b_{s} + \sum_{k=1}^{n} x_{rk} w_{ks} \right) \\
&= \sum_{k=1}^{n} x_{rk} \frac{\partial w_{ks}}{\partial w_{tu}} \\
&= \sum_{k=1}^{n} x_{rk} \delta_{kt} \delta_{su} \\
&= x_{rt} \delta_{su}
\end{align}
$$

for all $r = 1, \ldots, m$; $t = 1, \ldots, n$, and $s, u = 1, \ldots, n_{o}$.

In [46]:
# Closed form d z_rs / d w_tu = x_rt * delta_su, built as an outer
# product of X (indices r, t) with an identity matrix (indices s, u).
our_dz_dw = np.einsum(
    'rt,su->rstu',
    X,
    np.eye(Z.shape[-1])
)
our_dz_dw.shape

(150, 3, 4, 3)

In [47]:
# Compare our closed form with autograd's Jacobian (np_mape; expected ~0).
mape(our_dz_dw, dz_dw)

0.0

# complete derivative

## respect to z

In [48]:
# Reference value: autograd differentiates the composition
# ce(softmax(z)) with respect to z, evaluated at Z.
dce_dz = jacobian(
    lambda z: 
    ce(softmax(z))
, 0)(Z)

dce_dz.shape

(150, 3)

$$
\begin{align}
\frac{\partial \text{CE}}{\partial z_{rs}} &= \sum_{p, q}
{\color{Cyan} \frac{\partial \text{CE}}{\partial a_{pq}}}
{\color{Orange} \frac{\partial a_{pq}}{\partial z_{rs}}} \\
&= \sum_{p, q}
{\color{Cyan} -\frac{1}{m} \left( \frac{y_{pq}}{a_{pq}} \right)}
{\color{Orange} a_{pq} \delta_{pr} \left( \delta_{qs} - a_{ps} \right)} \\
&= -\frac{1}{m} \sum_{p, q} \frac{y_{pq} a_{pq} \delta_{pr} \left( \delta_{qs} - a_{ps} \right)}{a_{pq}} \\
&= -\frac{1}{m} \sum_{p, q} y_{pq} \delta_{pr} \left( \delta_{qs} - a_{ps} \right) \\
&= -\frac{1}{m} \sum_{q} y_{rq} \left( \delta_{qs} - a_{rs} \right) \\
&= -\frac{1}{m} \sum_{q} \left( y_{rq} \delta_{qs} - y_{rq} a_{rs} \right) \\
&= -\frac{1}{m} \left( \sum_{q} y_{rq} \delta_{qs} - \sum_{q} y_{rq} a_{rs} \right) \\
&= -\frac{1}{m} \left( y_{rs} - a_{rs} \sum_{q} y_{rq} \right) \\
&= -\frac{1}{m} \left( y_{rs} - a_{rs} \right) \\
&= \frac{1}{m} \left( a_{rs} - y_{rs} \right) \\
\end{align}
$$

**Note**: $\sum_{q} y_{rq} = 1$ due to **one-hot**.

Vectorized form

$$
\frac{\partial \text{CE}}{\partial \mathbf{Z}} = 
\frac{1}{m} \left( \mathbf{A - Y} \right)
$$

In [49]:
# Closed form from the derivation: (A - Y) / m, with m = len(Y).
our_dce_dz = (A - Y) / len(Y)
our_dce_dz.shape

(150, 3)

In [50]:
# Compare our closed form with autograd's result (np_mape; expected ~0).
mape(our_dce_dz, dce_dz)

1.0131507378187114e-15

***

Using einsum

In [53]:
# Chain rule done numerically: contract dCE/da_pq with da_pq/dz_rs over
# (p, q) and compare the result with autograd's dCE/dz.
mape(
    np.einsum('pq,pqrs->rs', our_dce_dyp, our_da_dz), 
    dce_dz
)

1.1503335310490922e-15

## respect to bias

In [60]:
# Reference value: autograd differentiates the full pipeline
# ce(softmax(weighted_sum(b_, w))) with respect to the bias; w is held
# fixed by the lambda's closure.
dce_db = jacobian(
    lambda b_: 
    ce(softmax(weighted_sum(b_, w)))
, 0)(b)

dce_db.shape

(3,)

$$
\begin{align}
\frac{\partial \text{CE}}{\partial b_{t}} &= \sum_{r, s}
{\color{Magenta} \frac{\partial \text{CE}}{\partial z_{rs}}}
{\color{Brown} \frac{\partial z_{rs}}{\partial b_{t}}} \\
&= \sum_{r, s}
{\color{Magenta} \frac{1}{m} \left( a_{rs} - y_{rs} \right)}
{\color{Brown} \delta_{st}} \\
&= \frac{1}{m} \sum_{r, s} (a_{rs} - y_{rs}) \delta_{st} \\
&= \frac{1}{m} \sum_{r} (a_{rt} - y_{rt}) \\
&= \frac{1}{m} \mathbf{1}^{\top} (a_{:,t} - y_{:,t})
\end{align}
$$

where $\mathbf{1} \in \mathbb{R}^{m}$.

Vectorized form

$$
\frac{\partial \text{CE}}{\partial \mathbf{b}} = 
\frac{1}{m} \mathbf{1}^{\top} \left( \mathbf{A - Y} \right)
$$

In [61]:
# Closed form from the derivation: sum (A - Y) over the first axis
# (the 1^T product in the formula), then divide by m = len(Y).
our_dce_db = np.sum(A - Y, axis=0) / len(Y)
our_dce_db.shape

(3,)

In [62]:
# Compare our closed form with autograd's result (np_mape; expected ~0).
mape(our_dce_db, dce_db)

2.8397213254216365e-16

***

Using einsum

In [65]:
# Chain rule done numerically: contract dCE/dz_rs with dz_rs/db_t over
# (r, s) and compare the result with autograd's dCE/db.
mape(
    np.einsum('rs,rst->t', our_dce_dz, our_dz_db),
    dce_db
)

0.0

## respect to weight

In [67]:
# Reference value: autograd differentiates the full pipeline
# ce(softmax(weighted_sum(b, w_))) with respect to the weights; b is
# held fixed by the lambda's closure.
dce_dw = jacobian(
    lambda w_: 
    ce(softmax(weighted_sum(b, w_)))
, 0)(w)

dce_dw.shape

(4, 3)

$$
\begin{align}
\frac{\partial \text{CE}}{\partial w_{tu}} &= \sum_{r, s}
{\color{Magenta} \frac{\partial \text{CE}}{\partial z_{rs}}}
{\color{Brown} \frac{\partial z_{rs}}{\partial w_{tu}}} \\
&= \sum_{r, s} 
{\color{Magenta} \frac{1}{m} \left( a_{rs} - y_{rs} \right)}
{\color{Brown} x_{rt} \delta_{su}} \\
&= \frac{1}{m} \sum_{r, s} x_{rt} \delta_{su} (a_{rs} - y_{rs}) \\
&= \frac{1}{m} \sum_{r} x_{rt} (a_{ru} - y_{ru}) \\
&= \frac{1}{m} (\mathbf{x}_{:,t})^{\top} (a_{:, u} - y_{:, u}) 
\end{align}
$$

Vectorized form

$$
\frac{\partial \text{CE}}{\partial \mathbf{W}} = 
\frac{1}{m} \mathbf{X}^{\top} \left( \mathbf{A - Y} \right)
$$

In [68]:
# Closed form from the derivation: X^T (A - Y) / m, with m = len(Y).
our_dce_dw = X.T @ (A - Y) / len(Y)
our_dce_dw.shape

(4, 3)

In [69]:
# Compare our closed form with autograd's result (np_mape; expected ~0).
mape(our_dce_dw, dce_dw)

2.938988740844495e-16

***

Using einsum

In [72]:
# Chain rule done numerically: contract dCE/dz_rs with dz_rs/dw_tu over
# (r, s) and compare the result with autograd's dCE/dw.
mape(
    np.einsum('rs,rstu->tu', our_dce_dz, our_dz_dw),
    dce_dw
)

1.220174995069426e-16