In [1]:
# NOTE: `np` is autograd's NumPy wrapper (autograd.numpy), not plain NumPy;
# `jacobian` is applied below to functions written with it.
from autograd import jacobian, numpy as np

from platform import python_version
# Display the interpreter version this notebook was executed with.
python_version()

'3.14.0'

In [2]:
# This cell imports `np_mape` (aliased to `mape`), whether you are
# running this notebook locally or from Google Colab.

import os
import sys

# Make the directory two levels up importable so that the local `tools`
# package can be found (presumably the repository's `content` directory).
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

try:
    from tools.numpy_metrics import np_mape as mape
    print('mape imported locally.')
except ModuleNotFoundError:
    # Fallback (e.g. Google Colab): download the single helper module from
    # GitHub into the working directory and import it from there.
    import importlib
    import shutil
    import urllib.request

    repo_url = 'https://raw.githubusercontent.com/PilotLeoYan/inside-deep-learning/main/content/tools/numpy_metrics.py'
    local_file = 'numpy_metrics.py'

    # Pure-Python download: does not require an external `wget` binary.
    with urllib.request.urlopen(repo_url, timeout=30) as response, \
            open(local_file, 'wb') as target:
        shutil.copyfileobj(response, target)

    # The module file was created after the interpreter started, so the
    # import system's directory caches must be refreshed before importing.
    importlib.invalidate_caches()

    # Let an import failure propagate: every later cell needs `mape`, so
    # failing here is clearer than a NameError further down the notebook.
    from numpy_metrics import np_mape as mape # type: ignore
    print('mape imported from GitHub.')

mape imported locally.


# Dataset

In [28]:
from sklearn.datasets import make_regression

# Fix the seed so that Restart Kernel -> Run All regenerates the same
# dataset and the same values from later np.random calls.
SEED = 0
np.random.seed(SEED)

# Synthetic multi-output regression problem:
# X has 100 samples x 5 features, Y has 100 samples x 4 targets.
X, Y = make_regression(
    n_samples=100, n_features=5,
    n_targets=4, n_informative=3,
    bias=np.random.randn(),
    noise=0.1, shuffle=True,
    random_state=SEED
)

print(X.shape)
print(Y.shape)

(100, 5)
(100, 4)


# Definitions

In [32]:
# Randomly initialised parameters of the linear model.
# b: one bias per output column of Y (1-D array, not a scalar).
b: np.ndarray = np.random.randn(Y.shape[-1])
# w: one weight per (input feature of X, output column of Y) pair.
w: np.ndarray = np.random.randn(
    X.shape[-1], Y.shape[-1])

In [33]:
def weighted_sum(b, w):
    """Affine map of the notebook-level design matrix ``X``.

    Args:
        b: bias term, added to ``X @ w`` (broadcast by NumPy rules).
        w: weight matrix that right-multiplies ``X``.

    Returns:
        The array ``b + X @ w``.
    """
    projection = X @ w
    return b + projection

In [34]:
# Predictions of the linear model for the current parameters b and w.
y_pred = weighted_sum(b, w)
y_pred.shape

(100, 4)

In [35]:
def mse(y_pred):
    """Mean squared error between ``y_pred`` and the notebook-level targets ``Y``.

    Args:
        y_pred: predictions, broadcast-compatible with ``Y``.

    Returns:
        The mean over all entries of ``(y_pred - Y) ** 2``.
    """
    residual = y_pred - Y
    squared_error = residual ** 2
    return np.mean(squared_error)

# MSE derivative

In [36]:
# Reference ("true") derivative of the MSE with respect to y_pred,
# computed by autograd's `jacobian`.
dmse_dyp = jacobian(mse)(y_pred)
dmse_dyp.shape

(100, 4)

$$
\begin{align}
\frac{\partial \text{MSE}}{\partial \hat{y}_{pq}} &= 
\frac{\partial}{\partial \hat{y}_{pq}} \left( \frac{1}{mn_{o}} \sum_{i=1}^{m} \sum_{j=1}^{n_o} \left( \hat{y}_{ij} - y_{ij} \right)^{2} \right) \\
&= \frac{1}{mn_{o}} \sum_{i=1}^{m} \sum_{j=1}^{n_o} \frac{\partial}{\partial \hat{y}_{pq}} \left( \left( \hat{y}_{ij} - y_{ij} \right)^{2} \right) \\
&= \frac{2}{mn_{o}} \sum_{i=1}^{m} \sum_{j=1}^{n_o} \left( \hat{y}_{ij} - y_{ij} \right) \frac{\partial \hat{y}_{ij}}{\partial \hat{y}_{pq}} \\
&= \frac{2}{mn_{o}} \sum_{i=1}^{m} \sum_{j=1}^{n_o} \left( \hat{y}_{ij} - y_{ij} \right) \delta_{ip} \delta_{jq} \\
&= \frac{2}{mn_{o}} \left( \hat{y}_{pq} - y_{pq} \right) \\
\end{align}
$$

for all $p = 1, \ldots, m$, and $q = 1, \ldots, n_{o}$. 

Vectorized form

$$
\frac{\partial \text{MSE}}{\partial \hat{\mathbf{Y}}} = 
\frac{2}{mn_{o}} \left( 
\hat{\mathbf{Y}} - \mathbf{Y} \right)
$$

In [37]:
# Our closed-form derivative: 2 (Y_hat - Y) / (m * n_o).
# Y.size is the total number of entries of Y, i.e. m * n_o.
our_dmse_dyp = 2 * (y_pred - Y) / Y.size
our_dmse_dyp.shape

(100, 4)

In [38]:
# Compare our closed-form result with autograd's reference via `mape`
# (presumably a mean absolute percentage error; ~0 means agreement).
mape(our_dmse_dyp, dmse_dyp)

2.1647946012201064e-17

# Weighted sum derivative

## With respect to the bias

In [39]:
# Reference Jacobian of weighted_sum with respect to its argument 0 (b).
dyp_db = jacobian(weighted_sum, 0)(b, w)
dyp_db.shape

(100, 4, 4)

$$
\begin{align}
\frac{\partial \hat{y}_{pq}}{\partial b_{r}} &= 
\frac{\partial}{\partial b_{r}} \left( b_{q} + x_{pk} w_{kq} \right) \\
&= \frac{\partial b_{q}}{\partial b_{r}} \\
&= \delta_{qr} 
\end{align}
$$

for all $p = 1, \ldots, m$, and $q, r = 1, \ldots, n_{o}$, where summation over the repeated index $k$ is implied (Einstein notation).

In [40]:
# Our derivative delta_{qr}, repeated for every sample p:
# start from zeros with axes (p, q, r) ...
our_dyp_db = np.zeros(
    shape=(len(Y), Y.shape[-1], Y.shape[-1]))
q = np.arange(Y.shape[-1])
# ... and set the diagonal entries [p, q, q] to 1 for all p at once.
our_dyp_db[:, q, q] = 1
our_dyp_db.shape

(100, 4, 4)

In [41]:
# Compare our Jacobian with autograd's reference via `mape`
# (presumably an error metric; 0 means exact agreement).
mape(our_dyp_db, dyp_db)

0.0

## With respect to the weights

In [42]:
# Reference Jacobian of weighted_sum with respect to its argument 1 (w).
dyp_dw = jacobian(weighted_sum, 1)(b, w)
dyp_dw.shape

(100, 4, 5, 4)

$$
\begin{align}
\frac{\partial \hat{y}_{pq}}{\partial w_{rs}} &= \frac{\partial}{\partial w_{rs}} \left( b_{q} + x_{pk} w_{kq} \right) \\
&= x_{pk} \frac{\partial w_{kq}}{\partial w_{rs}} \\
&= x_{pk} \delta_{kr} \delta_{qs} \\
&= x_{pr} \delta_{qs}
\end{align}
$$

for all $p = 1, \ldots, m$; $q, s = 1, \ldots, n_{o}$, and $r = 1, \ldots, n$, where summation over the repeated index $k$ is implied (Einstein notation).

In [43]:
# Our derivative x_{pr} * delta_{qs}: an outer product of X with the
# identity of size Y.shape[-1], laid out with axes (p, q, r, s).
our_dyp_dw = np.einsum(
    'pr,qs->pqrs',
    X,
    np.eye(Y.shape[-1])
)
our_dyp_dw.shape

(100, 4, 5, 4)

In [44]:
# Compare our Jacobian with autograd's reference via `mape`
# (presumably an error metric; 0 means exact agreement).
mape(our_dyp_dw, dyp_dw)

0.0

# Complete derivative

## With respect to the bias

In [45]:
# Reference gradient of the full composition mse(weighted_sum(b, w)) with
# respect to b; w is held fixed by closing over it in the lambda.
dmse_db = jacobian(
    lambda b_: 
    mse(weighted_sum(b_, w))
, 0)(b)

dmse_db.shape

(4,)

$$
\begin{align}
\frac{\partial \text{MSE}}{\partial b_{r}} &= 
{\color{Cyan} \frac{\partial \text{MSE}}{\partial \hat{y}_{pq}}}
{\color{Orange} \frac{\partial \hat{y}_{pq}}{\partial b_{r}}} \\
&= \sum_{p=1}^{m} \sum_{q=1}^{n_{o}} {\color{Cyan} \frac{2}{mn_{o}} \left( \hat{y}_{pq} - y_{pq} \right)} 
{\color{Orange} \delta_{qr}} \\
&= \frac{2}{mn_{o}} \sum_{p=1}^{m} \left( \hat{y}_{pr} - y_{pr} \right) \\
&= \frac{2}{mn_{o}} \left< \mathbf{1}, \hat{\mathbf{y}}_{:,r} - \mathbf{y}_{:,r} \right> \\
&= \frac{2}{mn_{o}} \mathbf{1}^{\top} \left( \hat{\mathbf{y}}_{:,r} - \mathbf{y}_{:,r} \right) \\
\end{align}
$$

for all $r = 1, \ldots, n_{o}$, where $\mathbf{1} \in \mathbb{R}^{m}$.

Vectorized form

$$
\frac{\partial \text{MSE}}{\partial \mathbf{b}} = \frac{2}{mn_{o}}
\mathbf{1}^{\top} \left( \hat{\mathbf{Y}} - \mathbf{Y} \right)
$$

In [47]:
# Our closed-form gradient: (2 / (m * n_o)) * 1^T (Y_hat - Y), i.e. the
# residuals summed over the sample axis (axis 0).
our_dmse_db = 2 / Y.size * np.sum(y_pred - Y, axis=0)
our_dmse_db.shape

(4,)

In [48]:
# Compare our closed-form gradient with autograd's reference via `mape`
# (presumably an error metric; ~0 means agreement).
mape(our_dmse_db, dmse_db)

4.8119104419358935e-16

***

Using einsum

In [49]:
# Shape of the first einsum operand: axes (p, q).
our_dmse_dyp.shape

(100, 4)

In [50]:
# Shape of the second einsum operand: axes (p, q, r).
our_dyp_db.shape

(100, 4, 4)

In [51]:
# Same gradient via the chain rule written as an explicit contraction:
# sum over p and q of dMSE/dy_pq * dy_pq/db_r, compared with autograd.
mape(
    np.einsum('pq,pqr->r', our_dmse_dyp, our_dyp_db),
    dmse_db
)

3.986178649445095e-17

## With respect to the weights

In [52]:
# Reference gradient of the full composition mse(weighted_sum(b, w)) with
# respect to w; b is held fixed by closing over it in the lambda.
dmse_dw = jacobian(
    lambda w_: 
    mse(weighted_sum(b, w_))
, 0)(w)

dmse_dw.shape

(5, 4)

$$
\begin{align}
\frac{\partial \text{MSE}}{\partial w_{rs}} &= 
{\color{Cyan} \frac{\partial \text{MSE}}{\partial \hat{y}_{pq}}}
{\color{Orange} \frac{\partial \hat{y}_{pq}}{\partial w_{rs}}} \\
&= \sum_{p=1}^{m} \sum_{q=1}^{n_{o}}
{\color{Cyan} \frac{2}{mn_{o}} \left( \hat{y}_{pq} - y_{pq} \right)} 
{\color{Orange} x_{pr} \delta_{qs}} \\
&= \frac{2}{mn_{o}} \sum_{p=1}^{m} \sum_{q=1}^{n_{o}} \left[ \hat{\mathbf{Y}} - \mathbf{Y} \right]_{pq} x_{pr} \delta_{qs} \\
&= \frac{2}{mn_{o}} \sum_{p=1}^{m} \left[ \hat{\mathbf{Y}} - \mathbf{Y} \right]_{ps} x_{pr} \\
&= \frac{2}{mn_{o}} \sum_{p=1}^{m} \left( \hat{y}_{ps} - y_{ps} \right) x_{pr} \\
&= \frac{2}{mn_{o}} \left< \mathbf{x}_{:,r}, \hat{\mathbf{y}}_{:,s} - \mathbf{y}_{:,s} \right> \\
&= \frac{2}{mn_{o}} (\mathbf{x}_{:,r})^{\top} \left( \hat{\mathbf{y}}_{:,s} - \mathbf{y}_{:,s} \right) \\
\end{align}
$$

for all $r = 1, \ldots, n$, and $s = 1, \ldots, n_{o}$.

Vectorized form

$$
\frac{\partial \text{MSE}}{\partial \mathbf{W}} = \frac{2}{mn_{o}}
\mathbf{X}^{\top} \left( \hat{\mathbf{Y}} - \mathbf{Y} \right)
$$

In [54]:
# Our closed-form gradient: (2 / (m * n_o)) * X^T (Y_hat - Y).
# `*` and `@` share precedence and associate left to right, so this is
# evaluated as ((2 / Y.size) * X.T) @ (y_pred - Y).
our_dmse_dw = 2 / Y.size * X.T @ (y_pred - Y)
our_dmse_dw.shape

(5, 4)

In [55]:
# Compare our closed-form gradient with autograd's reference via `mape`
# (presumably an error metric; ~0 means agreement).
mape(our_dmse_dw, dmse_dw)

2.956070478119696e-16

***

Using einsum

In [56]:
# Same gradient via the chain rule written as an explicit contraction:
# sum over p and q of dMSE/dy_pq * dy_pq/dw_rs, compared with autograd.
mape(
    np.einsum('pq,pqrs->rs', our_dmse_dyp, our_dyp_dw),
    dmse_dw
)

6.460491923591371e-16