In [1]:
from autograd import jacobian, numpy as np

from platform import python_version
python_version()

'3.14.0'

In [2]:
# This cell imports np_mape (aliased as `mape`),
# whether you are running this notebook locally
# or from Google Colab.

import os
import sys

# Two levels above the working directory is where the `tools` package is
# expected when the notebook is run from inside the repository.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

try:
    from tools.numpy_metrics import np_mape as mape
    print('mape imported locally.')
except ModuleNotFoundError:
    # Fallback (e.g. Colab): fetch the helper module into the working directory.
    import importlib
    import urllib.request

    repo_url = 'https://raw.githubusercontent.com/PilotLeoYan/inside-deep-learning/main/content/tools/numpy_metrics.py'
    local_file = 'numpy_metrics.py'

    # Download with the standard library instead of shelling out to `wget`,
    # which is not installed on every platform.
    with urllib.request.urlopen(repo_url) as response:
        payload = response.read()
    with open(local_file, 'wb') as fh:
        fh.write(payload)

    # The file was created after the interpreter started, so refresh the
    # import system's caches before trying to import it.
    importlib.invalidate_caches()
    try:
        from numpy_metrics import np_mape as mape # type: ignore
        print('mape imported from GitHub.')
    except Exception as e:
        # Every comparison cell below needs `mape`; fail here with a clear
        # message instead of a NameError much later.
        raise ImportError(
            f'could not import np_mape from the downloaded {local_file}'
        ) from e

mape imported locally.


# Dataset

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

# Standardise every feature column; the targets are used as-is.
scaler = StandardScaler()
X = scaler.fit_transform(data.data)
Y = data.target

# Show the shapes of the design matrix and the target vector, one per line.
print(X.shape, Y.shape, sep='\n')

(569, 30)
(569,)


# Defs

In [4]:
# Randomly initialised model parameters.
# NOTE: no random seed is set, so these values (and the numbers printed in
# later cells) change from run to run.
b: float = np.random.randn()  # scalar bias
w: np.ndarray = np.random.randn(X.shape[-1])  # one weight per column of X, shape (30,)

In [5]:
def weighted_sum(b, w):
    """Return the pre-activation ``b + X @ w`` for every row of ``X``.

    ``X`` is the module-level design matrix; ``b`` is the bias and ``w``
    the weight vector with one entry per column of ``X``.
    """
    linear_term = X @ w
    return b + linear_term

In [6]:
# Pre-activations for the whole dataset: one value per row of X.
Z = weighted_sum(b, w)
Z.shape

(569,)

In [7]:
def sigmoid(z):
    """Apply the logistic function ``1 / (1 + exp(-z))`` element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [8]:
# Activations (predicted probabilities): sigmoid applied element-wise to Z.
A = sigmoid(Z)
A.shape

(569,)

In [9]:
def bce(y_pred):
    """Return the mean binary cross-entropy of ``y_pred`` against the module-level labels ``Y``."""
    positive_term = Y * np.log(y_pred)
    negative_term = (1 - Y) * np.log(1 - y_pred)
    return - np.mean(positive_term + negative_term)

# BCE derivative

In [10]:
# Reference derivative of the loss w.r.t. the activations, computed by autograd.
# This is the value the hand-derived formula is checked against.
dbce_dyp = jacobian(bce)(A)
dbce_dyp.shape

(569,)

$$
\begin{align}
\frac{\partial \text{BCE}}{\partial a_{p}} &=
\frac{\partial}{\partial a_{p}}  \left( - \frac{1}{m} \sum_{i=1}^{m} \left[ y_{i} \log(a_{i}) + (1 - y_{i}) \log(1 - a_{i}) \right] \right) \\
&= - \frac{1}{m} \sum_{i=1}^{m} \frac{\partial}{\partial a_{p}} \left( y_{i} \log(a_{i}) + (1 - y_{i}) \log(1 - a_{i}) \right) \\
&= - \frac{1}{m} \sum_{i=1}^{m} \left( \frac{y_{i}}{a_{i}} \frac{\partial a_{i}}{\partial a_{p}} - \frac{1 - y_{i}}{1 - a_{i}} \frac{\partial a_{i}}{\partial a_{p}} \right) \\
&= - \frac{1}{m} \sum_{i=1}^{m} \left( \frac{y_{i}}{a_{i}} - \frac{1 - y_{i}}{1 - a_{i}} \right) \frac{\partial a_{i}}{\partial a_{p}} \\
&= - \frac{1}{m} \sum_{i=1}^{m} \left( \frac{y_{i}}{a_{i}} - \frac{1 - y_{i}}{1 - a_{i}} \right) \delta_{ip} \\
&= - \frac{1}{m} \left( \frac{y_{p}}{a_{p}} - \frac{1 - y_{p}}{1 - a_{p}} \right) \\
\end{align}
$$

for all $p = 1, \ldots, m$.

Vectorized form

$$
\frac{\partial \text{BCE}}{\partial \mathbf{a}} = -\frac{1}{m}
\left( \mathbf{y \oslash a} - \mathbf{\left(1 - y \right) \oslash \left(1 - a \right)} \right)
$$

where $\oslash$ denotes *element-wise division*, and $\mathbf{1} \in \mathbb{R}^{m}$.

In [11]:
# Our closed-form derivative: -(1/m) * (y / a - (1 - y) / (1 - a)), with m = len(Y).
our_dbce_dyp = - ((Y / A) - (1 - Y) / (1 - A)) / len(Y)
our_dbce_dyp.shape

(569,)

In [12]:
# Sanity check: discrepancy between our formula and the autograd reference,
# as measured by the imported `mape` helper (expected to be ~0).
mape(our_dbce_dyp, dbce_dyp)

6.15254764000391e-17

# Sigmoid derivative

In [13]:
# Reference Jacobian of the element-wise sigmoid, computed by autograd.
# Z has 569 entries and so does sigmoid(Z), hence a (569, 569) matrix.
da_dz = jacobian(sigmoid)(Z)
da_dz.shape

(569, 569)

$$
\begin{align}
\frac{\partial a_{p}}{\partial z_{q}} &= 
\frac{\partial}{\partial z_{q}} \left( \frac{1}{1 + \exp(-z_{p})} \right) \\
&= \frac{\partial}{\partial z_{q}} \left( \left( 1 + \exp(-z_p)\right)^{-1} \right) \\
&= -\left( 1 + \exp(-z_p)\right)^{-2} \frac{\partial}{\partial z_{q}} \left(1 + \exp(-z_{p}) \right) \\
&= -\left( 1 + \exp(-z_p)\right)^{-2} \exp(-z_{p}) \frac{\partial (-z_{p})}{\partial z_{q}} \\
&= \left( 1 + \exp(-z_p)\right)^{-2} \exp(-z_{p}) \delta_{pq} \\
&= \frac{\exp(-z_{p})}{(1 + \exp(-z_p))^{2}} \delta_{pq} \\
&= \frac{1}{1 + \exp(-z_p)} \left( \frac{\exp(-z_{p})}{1 + \exp(-z_p)} \right) \delta_{pq} \\
&= a_{p} \left( \frac{\exp(-z_{p})}{1 + \exp(-z_p)} \right) \delta_{pq} \\
&= a_{p} \left( \frac{1 + \exp(-z_{p}) - 1}{1 + \exp(-z_p)} \right) \delta_{pq} \\
&= a_{p} \left( \frac{1 + \exp(-z_p)}{1 + \exp(-z_p)} - \frac{1}{1 + \exp(-z_p)}\right) \delta_{pq} \\
&= a_{p} \left( 1 - a_{p} \right) \delta_{pq} \\
\end{align}
$$

for all $p, q = 1, \ldots, m$.

In [14]:
# Closed form a_p (1 - a_p) * delta_pq: row p of the identity matrix is
# scaled by A[p] * (1 - A[p]), which yields a diagonal matrix.
our_da_dz = np.einsum(
    'p,pq->pq',
    A * (1 - A),
    np.eye(len(A))
)
our_da_dz.shape

(569, 569)

In [15]:
# Sanity check: our diagonal Jacobian vs. the autograd reference (expected ~0).
mape(our_da_dz, da_dz)

1.7024351400309893e-12

# Weighted sum derivative

## With respect to bias

In [16]:
# Reference derivative of z w.r.t. the bias; the second argument (0) selects
# `b`, the first positional argument of weighted_sum.
dz_db = jacobian(weighted_sum, 0)(b, w)
dz_db.shape

(569,)

$$
\begin{align}
\frac{\partial z_{q}}{\partial b} &= 
\frac{\partial}{\partial b} \left( b + x_{qk} w_{k} \right) \\
&= 1
\end{align}
$$

for all $q = 1, \ldots, m$, where the repeated index $k$ is summed over (Einstein summation convention).

Vectorized form

$$
\frac{\partial \mathbf{z}}{\partial b} = \mathbf{1}
$$

where $\mathbf{1} \in \mathbb{R}^{m}$.

In [17]:
# Closed form: dz_q/db = 1 for every sample, i.e. a vector of ones as long as Z.
our_dz_db = np.ones(len(Z))
our_dz_db.shape

(569,)

In [18]:
# Sanity check: our vector of ones vs. the autograd reference (expected 0).
mape(our_dz_db, dz_db)

0.0

## With respect to weight

In [19]:
# Reference Jacobian of z w.r.t. the weights; the second argument (1) selects
# `w`, the second positional argument of weighted_sum.
dz_dw = jacobian(weighted_sum, 1)(b, w)
dz_dw.shape

(569, 30)

$$
\begin{align}
\frac{\partial z_{q}}{\partial w_{r}} &=
\frac{\partial}{\partial w_{r}} \left( b + x_{qk} w_{k} \right) \\
&= x_{qk} \frac{\partial w_{k}}{\partial w_r} \\
&= x_{qk} \delta_{kr} \\
&= x_{qr}
\end{align}
$$

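for all $q = 1, \ldots, m$, and $r = 1, \ldots, n_{o}$, where $n_{o}$ is the number of features (columns of $\mathbf{X}$) and the repeated index $k$ is summed over (Einstein summation convention).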

Vectorized form

$$
\frac{\partial \mathbf{z}}{\partial \mathbf{w}} = \mathbf{X}
$$

In [20]:
# Closed form: the Jacobian dz/dw is the design matrix itself.
# Note this binds the same array object as X, not a copy.
our_dz_dw = X
our_dz_dw.shape

(569, 30)

In [21]:
# Sanity check: X vs. the autograd Jacobian (expected 0).
mape(our_dz_dw, dz_dw)

0.0

# Complete derivative

## With respect to z

In [22]:
# Reference: let autograd differentiate the composition BCE(sigmoid(z)) w.r.t. z.
dbce_dz = jacobian(lambda z: bce(sigmoid(z)), 0)(Z)

dbce_dz.shape

(569,)

$$
\begin{align}
\frac{\partial \text{BCE}}{\partial z_{q}} &= \sum_{p}
{\color{Cyan} \frac{\partial \text{BCE}}{\partial a_{p}}}
{\color{Orange} \frac{\partial a_{p}}{\partial z_{q}}} \\
&= \sum_{p}
{\color{Cyan} - \frac{1}{m} \left( \frac{y_{p}}{a_{p}} - \frac{1 - y_{p}}{1 - a_{p}} \right)}
{\color{Orange} a_{p} \left( 1 - a_{p} \right) \delta_{pq}} \\
&= - \frac{1}{m} \left( \frac{y_{q}}{a_{q}} - \frac{1 - y_{q}}{1 - a_{q}} \right)
a_{q} \left( 1 - a_{q} \right) \\
&= - \frac{1}{m} \left( \frac{y_{q} a_{q} \left( 1 - a_{q} \right)}{a_{q}} - \frac{a_{q} (1 - y_{q}) \left( 1 - a_{q} \right)}{1 - a_{q}} \right) \\
&= - \frac{1}{m} \left( y_{q} \left( 1 - a_{q} \right) - a_{q} (1 - y_{q}) \right) \\
&= - \frac{1}{m} \left( y_{q} - y_{q}a_{q} - a_{q} + y_{q}a_{q} \right) \\
&= - \frac{1}{m} \left( y_{q} - a_{q} \right) \\
&= \frac{1}{m} \left( a_{q} - y_{q} \right) \\
\end{align}
$$

Vectorized form

$$
\frac{\partial \text{BCE}}{\partial \mathbf{z}} = 
\frac{1}{m} \left( \mathbf{a - y} \right)
$$

In [23]:
# Closed form after simplification: (a - y) / m, with m = len(Y).
our_dbce_dz = (A - Y) / len(Y)
our_dbce_dz.shape

(569,)

In [24]:
# Sanity check: simplified closed form vs. the autograd reference (expected ~0).
mape(our_dbce_dz, dbce_dz)

9.686856123198863e-10

***

Using einsum

In [25]:
# Same check via the chain rule written as an einsum: contract index p of
# dBCE/da with the Jacobian da/dz, leaving a vector indexed by q.
mape(
    np.einsum('p,pq->q', our_dbce_dyp, our_da_dz), 
    dbce_dz
)

9.686856246269475e-10

## With respect to bias

In [26]:
# Reference: autograd derivative of the full pipeline w.r.t. the bias only
# (the weights `w` are taken from the enclosing scope and held fixed).
dbce_db = jacobian(lambda b_: bce(sigmoid(weighted_sum(b_, w))), 0)(b)

dbce_db

array(-0.18635995)

$$
\begin{align}
\frac{\partial \text{BCE}}{\partial b} &= \sum_{q=1}^{m}
{\color{Magenta} \frac{\partial \text{BCE}}{\partial z_{q}}}
{\color{Brown} \frac{\partial z_{q}}{\partial b}} \\
&= \sum_{q=1}^{m}
{\color{Magenta} \frac{1}{m} \left( a_{q} - y_{q} \right)}
{\color{Brown} 1} \\
&= \frac{1}{m} \sum_{q=1}^{m} \left( a_{q} - y_{q} \right) \\
&= \frac{1}{m} \left( \mathbf{a - y} \right)^{\top} \mathbf{1}
\end{align}
$$

where $\mathbf{1} \in \mathbb{R}^{m}$.

In [27]:
# Closed form: (1/m) * sum_q (a_q - y_q), i.e. the mean of A - Y.
our_dbce_db = np.mean(A - Y)
our_dbce_db

np.float64(-0.18635994535986664)

In [28]:
# Sanity check on the scalar derivative; .item() turns the NumPy scalar and
# the 0-d array into plain Python numbers before comparing (expected ~0).
mape(our_dbce_db.item(), dbce_db.item())

2.1849062685214934e-10

***

Using einsum

In [29]:
# Same check via the chain rule as an einsum: 'q,q->' sums the element-wise
# product of dBCE/dz and dz/db over q, giving a scalar.
mape(
    np.einsum('q,q->', our_dbce_dz, our_dz_db).item(),
    dbce_db.item()
)

2.1849062685214934e-10

## With respect to weight

In [30]:
# Reference: autograd derivative of the full pipeline w.r.t. the weights only
# (the bias `b` is taken from the enclosing scope and held fixed).
dbce_dw = jacobian(lambda w_: bce(sigmoid(weighted_sum(b, w_))), 0)(w)

dbce_dw.shape

(30,)

$$
\begin{align}
\frac{\partial \text{BCE}}{\partial w_{r}} &= \sum_{q=1}^{m}
{\color{Magenta} \frac{\partial \text{BCE}}{\partial z_{q}}}
{\color{Brown} \frac{\partial z_{q}}{\partial w_{r}}} \\
&= \sum_{q=1}^{m}
{\color{Magenta} \frac{1}{m} \left( a_{q} - y_{q} \right)}
{\color{Brown} x_{qr}} \\
&= \frac{1}{m} \sum_{q=1}^{m} \left( a_{q} - y_{q} \right) x_{qr} \\
&= \frac{1}{m} \left( \mathbf{a - y} \right)^{\top} \mathbf{x}_{:,r} \\
&= \frac{1}{m} \left(\mathbf{x}_{:,r}\right)^{\top} \left( \mathbf{a - y} \right)
\end{align}
$$

Vectorized form

$$
\frac{\partial \text{BCE}}{\partial \mathbf{w}} = 
\frac{1}{m} \mathbf{X}^{\top} \left( \mathbf{a - y} \right)
$$

In [31]:
# Closed form: (1/m) * X^T (a - y). The matrix product is evaluated first,
# then the result is divided by m = len(Y).
our_dbce_dw = X.T @ (A - Y) / len(Y)
our_dbce_dw.shape

(30,)

In [32]:
# Sanity check: closed-form weight gradient vs. the autograd reference (expected ~0).
mape(our_dbce_dw, dbce_dw)

3.8244341499936197e-10

***

Using einsum

In [33]:
# Same check via the chain rule as an einsum: contract index q of dBCE/dz
# with the Jacobian dz/dw, leaving a vector indexed by r.
mape(
    np.einsum('q,qr->r', our_dbce_dz, our_dz_dw),
    dbce_dw
)

3.824433087235348e-10