In [1]:
from autograd import jacobian, numpy as np

from platform import python_version
python_version()

'3.14.0'

In [2]:
# This cell imports `np_mape` (aliased as `mape`), whether you are
# running this notebook locally or from Google Colab.

import os
import sys

# Put the directory two levels above the working directory on sys.path so
# that the `tools` package can be resolved from a local checkout.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

try:
    from tools.numpy_metrics import np_mape as mape
    print('mape imported locally.')
except ModuleNotFoundError:
    # Fallback: fetch numpy_metrics.py from GitHub into the working directory
    # and import it from there. This requires the `wget` executable.
    import subprocess

    repo_url = 'https://raw.githubusercontent.com/PilotLeoYan/inside-deep-learning/main/content/tools/numpy_metrics.py'
    local_file = 'numpy_metrics.py'
    
    # check=True raises CalledProcessError when wget exits with a non-zero status.
    subprocess.run(['wget', repo_url, '-O', local_file], check=True)
    try:
        from numpy_metrics import np_mape as mape # type: ignore
        print('mape imported from GitHub.')
    except Exception as e:
        # NOTE(review): the error is only printed, so `mape` stays undefined
        # here and the later cells that call it would fail.
        print(e)

mape imported locally.


# Dataset

In [3]:
# Load the California housing dataset from scikit-learn.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

# X is the feature matrix and Y the target vector; every later cell reads
# them as notebook-level globals.
X = housing.data
Y = housing.target

# Expected: (20640, 8) for X and (20640,) for Y.
print(X.shape)
print(Y.shape)

(20640, 8)
(20640,)


# Defs

In [4]:
# Randomly initialised parameters of the linear model (no seed is set, so
# the values differ between runs).
b: float = np.random.randn()  # scalar bias
w: np.ndarray = np.random.randn(X.shape[-1])  # one weight per column of X

In [5]:
def weighted_sum(b, w):
    """Return the linear model prediction ``b + X @ w``.

    Parameters
    ----------
    b : bias term added to every prediction.
    w : weight vector multiplied with the notebook-level matrix ``X``.

    Returns
    -------
    The sum of ``b`` and the matrix product of ``X`` with ``w``.
    """
    linear_term = np.matmul(X, w)
    return b + linear_term

In [6]:
# Predictions of the linear model for the current (random) parameters;
# one value per row of X.
y_pred = weighted_sum(b, w)
y_pred.shape

(20640,)

In [7]:
def mse(y_pred):
    """Mean squared error of ``y_pred`` against the notebook-level targets ``Y``.

    Parameters
    ----------
    y_pred : predictions, compared element-wise with ``Y``.

    Returns
    -------
    The mean of the squared differences ``(y_pred - Y) ** 2``.
    """
    residual = y_pred - Y
    squared_error = residual ** 2
    return np.mean(squared_error)

# MSE derivative

In [8]:
# Reference derivative of the MSE with respect to the predictions,
# computed by autograd; used below to check the hand-derived formula.
dmse_dyp = jacobian(mse)(y_pred)
dmse_dyp.shape

(20640,)

$$
\begin{align}
\frac{\partial \text{MSE}}{\partial \hat{y}_{p}} &= \frac{\partial}{\partial \hat{y}_{p}} \left( \frac{1}{m} \sum_{i=1}^{m} \left( \hat{y}_{i} - y_{i} \right)^{2} \right) \\
&= \frac{1}{m} \sum_{i=1}^{m} \frac{\partial}{\partial \hat{y}_{p}} \left( \left( \hat{y}_{i} - y_{i} \right)^{2} \right) \\
&= \frac{2}{m} \sum_{i=1}^{m} \left( \hat{y}_{i} - y_{i} \right) \frac{\partial \hat{y}_{i}}{\partial \hat{y}_{p}} \\
&= \frac{2}{m} \sum_{i=1}^{m} \left( \hat{y}_{i} - y_{i} \right) \delta_{ip} \\
&= \frac{2}{m} \left( \hat{y}_{p} - y_{p} \right)
\end{align}
$$

for all $p = 1, \ldots, m$.

The vectorized form is

$$
\frac{\partial \text{MSE}}{\partial \hat{\mathbf{y}}} = 
\frac{2}{m} \left( \hat{\mathbf{y}} - \mathbf{y} \right)
$$

In [9]:
# Hand-derived derivative: (2 / m) * (y_pred - Y), with m = len(Y).
our_dmse_dyp = 2 * (y_pred - Y) / len(Y)
our_dmse_dyp.shape

(20640,)

In [10]:
# Compare the hand-derived derivative with the autograd reference.
# `mape` is the imported `np_mape` helper; presumably an error metric where
# a value near 0 means the two agree -- confirm against its definition.
mape(our_dmse_dyp, dmse_dyp)

1.3953523214227685e-17

# Weighted sum derivative

## With respect to the bias

In [11]:
# Reference derivative of the predictions with respect to the bias:
# argnum 0 selects the first argument of weighted_sum, i.e. b.
dyp_db = jacobian(weighted_sum, 0)(b, w)
dyp_db.shape

(20640,)

$$
\begin{align}
\frac{\partial \hat{y}_{p}}{\partial b} &= \frac{\partial}{\partial b} \left( b + \mathbf{x}_{p}^{\top} \mathbf{w} \right) \\
&= 1
\end{align}
$$

for all $p = 1, \ldots, m$.

The vectorized form is

$$
\frac{\partial \hat{\mathbf{y}}}{\partial b} = 
\mathbf{1}
$$

where $\mathbf{1} \in \mathbb{R}^{m}$.

In [12]:
# Hand-derived derivative: a vector of ones with the same shape as Y.
our_dyp_db = np.ones_like(Y)
our_dyp_db.shape

(20640,)

In [13]:
# Compare the vector of ones with the autograd reference for d y_pred / d b.
mape(our_dyp_db, dyp_db)

0.0

## With respect to the weights

In [14]:
# Reference derivative of the predictions with respect to the weights:
# argnum 1 selects the second argument of weighted_sum, i.e. w.
dyp_dw = jacobian(weighted_sum, 1)(b, w)
dyp_dw.shape

(20640, 8)

$$
\begin{align}
\frac{\partial \hat{y}_{p}}{\partial w_{q}} &= \frac{\partial}{\partial w_{q}} \left( b + \mathbf{x}_{p}^{\top} \mathbf{w} \right) \\
&= \frac{\partial}{\partial w_{q}} \left(\mathbf{x}_{p}^{\top} \mathbf{w} \right) \\
&= \frac{\partial}{\partial w_{q}} \left( x_{p1}w_{1} + \ldots + x_{pq}w_{q} + \ldots + x_{pn}w_{n} \right) \\
&= \frac{\partial}{\partial w_{q}} \left( \sum_{k=1}^{n} x_{pk} w_{k} \right) \\
&= \sum_{k=1}^{n} x_{pk} \delta_{kq} \\
&= x_{pq}
\end{align}
$$

for all $p = 1, \ldots, m$, and $q = 1, \ldots, n$.

Vectorizing this for all $q = 1, \ldots, n$

$$
\frac{\partial \hat{y}_{p}}{\partial \mathbf{w}} = 
\mathbf{x}_{p}^{\top} \in \mathbb{R}^{1 \times n}
$$

Vectorizing for all $p = 1, \ldots, m$
$$
\frac{\partial \hat{\mathbf{y}}}{\partial \mathbf{w}} = 
\mathbf{X} \in \mathbb{R}^{m \times n}
$$

In [15]:
# Hand-derived derivative: the Jacobian of the predictions with respect to
# w is the feature matrix X itself (same object, not a copy).
our_dyp_dw = X
our_dyp_dw.shape

(20640, 8)

In [16]:
# Compare X with the autograd reference for d y_pred / d w.
mape(our_dyp_dw, dyp_dw)

0.0

# Complete derivative

## With respect to the bias

In [17]:
# Reference derivative of the MSE with respect to the bias. The lambda
# composes mse with weighted_sum and keeps w fixed, so only b_ varies.
dmse_db = jacobian(
    lambda b_: 
    mse(weighted_sum(b_, w))
, 0)(b)

dmse_db

array(-568.34171742)

$$
\begin{align}
\frac{\partial \text{MSE}}{\partial b} &= \sum_{p=1}^{m}
{\color{Cyan} \frac{\partial \text{MSE}}{\partial \hat{y}_{p}}}
{\color{Orange} \frac{\partial \hat{y}_{p}}{\partial b}} \\
&= \sum_{p=1}^{m} {\color{Cyan} \frac{2}{m} \left( \hat{y}_{p} - y_{p} \right)}
{\color{Orange} 1_{p}} \\
&= \frac{2}{m} \left< \hat{\mathbf{y}} - \mathbf{y}, \mathbf{1} \right> \\
&= \frac{2}{m} \left( \hat{\mathbf{y}} - \mathbf{y} \right)^{\top} \mathbf{1}
\end{align}
$$

where $\mathbf{1} \in \mathbb{R}^{m}$.

In [18]:
# Hand-derived derivative: (2 / m) * sum(y_pred - Y), with m = len(Y).
our_dmse_db = 2 / len(Y) * np.sum(y_pred - Y)
our_dmse_db

np.float64(-568.3417174171765)

In [19]:
# Both values are scalars (a NumPy float and a 0-d array), so .item()
# converts each to a plain Python number before the comparison.
mape(our_dmse_db.item(), dmse_db.item())

0.0

***

Using einsum

In [20]:
# Chain rule as a contraction: 'p,p->' multiplies the two vectors
# element-wise and sums over p, giving a scalar.
mape(
    np.einsum('p,p->', our_dmse_dyp, our_dyp_db).item(),
    dmse_db.item()
)

0.0

## With respect to the weights

In [21]:
# Reference derivative of the MSE with respect to the weights. The lambda
# composes mse with weighted_sum and keeps b fixed, so only w_ varies.
dmse_dw = jacobian(
    lambda w_: 
    mse(weighted_sum(b, w_))
, 0)(w)

dmse_dw.shape

(8,)

$$
\begin{align}
\frac{\partial \text{MSE}}{\partial w_{q}} &= \sum_{p=1}^{m}
{\color{Cyan} \frac{\partial \text{MSE}}{\partial \hat{y}_p}}
{\color{Orange} \frac{\partial \hat{y}_{p}}{\partial w_{q}}} \\
&= \sum_{p=1}^{m} {\color{Cyan} \frac{2}{m} \left(\hat{y}_{p} - y_{p} \right)} {\color{Orange} x_{pq}} \\
&= \frac{2}{m} \left< \hat{\mathbf{y}} - \mathbf{y}, \mathbf{x}_{:,q} \right> \\
&= \frac{2}{m} \left( \mathbf{x}_{:,q} \right)^{\top} \left( \hat{\mathbf{y}} - \mathbf{y} \right)
\end{align}
$$

for all $q = 1, \ldots, n$, where 
$\mathbf{x}_{:,q} = \begin{bmatrix} x_{1q} & \cdots & x_{mq} \end{bmatrix}^{\top} \in \mathbb{R}^{m \times 1}$. 

Vectorized form

$$
\begin{align}
\frac{\partial \text{MSE}}{\partial \mathbf{w}} &= \frac{2}{m}
\mathbf{X}^{\top} \left( \hat{\mathbf{y}} - \mathbf{y} \right)
\end{align}
$$

In [22]:
# Hand-derived derivative: (2 / m) * X^T (y_pred - Y), with m = len(Y).
# `*` and `@` share precedence and associate left to right, so this is
# evaluated as ((2 / m) * X.T) @ (y_pred - Y): the whole transposed matrix
# is scaled before the product.
our_dmse_dw = 2 / len(Y) * X.T @ (y_pred - Y)
our_dmse_dw.shape

(8,)

In [23]:
# Compare the hand-derived gradient with the autograd reference for d MSE / d w.
mape(our_dmse_dw, dmse_dw)

1.927565925720288e-17

***

Using einsum

In [24]:
# Shape check of the first einsum operand (index p) used in the next cell.
our_dmse_dyp.shape

(20640,)

In [25]:
# Chain rule as a contraction: 'p,pq->q' sums over the sample index p and
# leaves one entry per weight index q.
mape(
    np.einsum('p,pq->q', our_dmse_dyp, our_dyp_dw),
    dmse_dw
)

1.624831841201509e-15