In [1]:
import numpy as np

In [2]:
# Seeded generator: every random draw in this notebook comes from this one
# stream, so a fresh top-to-bottom run reproduces the same numbers.
rng = np.random.default_rng(seed=1000)

## Dive in math
---
From the layer-norm formula

$$ LN(x) = \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} \gamma + \beta$$

From the cross-entropy and softmax gradients we get

$$dZ = \frac {\partial \mathcal {L}} {\partial Z} = \frac {\partial \mathcal {L}}{\partial P} \frac {\partial P} {\partial Z} = \frac{P - Y} {N}$$

where 
* $Z$ is the logits
* $P$ is $softmax\left(Z\right)$
* $Y$ is the one-hot encoding of the target tokens
* $N$ is the number of tokens that remain in the loss after masking
* $\mathcal {L}$ is the cross-entropy loss function

At the LM head we have

$$Z = h\ @\ W_{vocab} + b_{vocab}$$

where 
* $h \in \mathbb{R} ^ {B \times T \times d_{model}}$
* $W \in \mathbb{R} ^ {d_{model} \times V}$
* $b \in \mathbb{R} ^ {V}$

$$\frac {\partial \mathcal {L}}{\partial W} = dW = \sum_{b, t} h_{b,t}^\intercal dZ_{b, t}$$

$$\frac {\partial \mathcal {L}}{\partial b} = db = \sum_{b, t} dZ_{b, t}$$

and $dh$

$$ \frac {\partial \mathcal {L}} {\partial h} = \frac {\partial \mathcal {L}} {\partial Z} \cdot \frac {\partial Z} {\partial h} = dh$$

$$dh_{b, t} = dZ_{b, t} W^\intercal$$

We will start from here ->

$$h = LN(y)$$
where $y$ is the output matrix of the transformer stack.



$$LN(\mathbf{X}) = (\hat{x} \odot \gamma) \oplus \beta$$

where 

$$\hat{x} = \frac {\mathbf{X} - \mu} {\sqrt{\sigma^2 + \epsilon}}$$

What does the $\odot$ operator do?

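Given
$$\mathbf{A} = \begin{bmatrix} a_{1, 1} & a_{1, 2} \\ a_{2, 1} & a_{2, 2} \end{bmatrix}$$
and a row vector with one entry per column of $\mathbf{A}$ (the role $\gamma$ plays for $\mathbf{X}$)
$$\mathbf{B} = \begin{bmatrix} b_{1, 1} & b_{1, 2} \end{bmatrix}$$
then $\mathbf{B}$ is broadcast over the rows of $\mathbf{A}$ and multiplied element-wise:
$$ \mathbf{A} \odot \mathbf{B} = \begin{bmatrix} 
    a_{1, 1} \times b_{1, 1} & a_{1, 2} \times b_{1, 2}\\
    a_{2, 1} \times b_{1, 1} & a_{2, 2} \times b_{1, 2}
    \end{bmatrix}$$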

In [3]:
# Toy sizes for the worked example. The symbols follow the notes above, where
# h has shape B x T x d_model and the LM-head weight has shape d_model x V.
B = 2        # first axis of h (presumably the batch size)
T = 4        # second axis of h; also the number of rows of the example matrix
d_model = 6  # feature width; also the length of gamma and beta
V = 20       # second axis of the LM-head weight (presumably the vocabulary size)

In [4]:
# Example input: a (T, d_model) matrix of float64 samples drawn uniformly
# from [0, 1) using the shared seeded generator.
X = rng.random(size=(T, d_model), dtype=np.float64)
print(X)

[[0.52138574 0.60384185 0.4709418  0.20324794 0.52875903 0.19103628]
 [0.2815456  0.75368155 0.55167178 0.86372208 0.80537222 0.24837266]
 [0.18985741 0.98399558 0.66999717 0.28038283 0.20391323 0.62506469]
 [0.65260432 0.89880753 0.97476378 0.15393237 0.69908928 0.44724145]]


In [None]:
# Scale (gamma) and shift (beta) vectors, each of length d_model.
# Both come from the shared generator; gamma is drawn before beta, and that
# order must be kept for the printed values to be reproducible.
gamma = rng.random(size=d_model, dtype=np.float64)
beta = rng.random(size=d_model, dtype=np.float64)
print("gamma =>", gamma)
print("beta =>", beta)

gamma => [0.23502814 0.65818858 0.51673102 0.82385723 0.18965801 0.98047955]
beta => [0.39284504 0.45345328 0.27428462 0.3227665  0.62741705 0.43136525]


บางกรณีจะมีการแปลงเวคเตอร์ $\gamma$ เป็น diagonal matrix ตามด้านล่างเพื่อให้คูณได้ตามวิธีมาตรฐานของ math

In [19]:
# Build a square matrix with gamma on the main diagonal (k=0) and zeros
# everywhere else, so the scaling can be written as an ordinary matrix product.
gamma_diag = np.diag(gamma, k=0)
print(gamma_diag)

[[0.23502814 0.         0.         0.         0.         0.        ]
 [0.         0.65818858 0.         0.         0.         0.        ]
 [0.         0.         0.51673102 0.         0.         0.        ]
 [0.         0.         0.         0.82385723 0.         0.        ]
 [0.         0.         0.         0.         0.18965801 0.        ]
 [0.         0.         0.         0.         0.         0.98047955]]


In [21]:
# Matrix product with the diagonal matrix: column j of X is multiplied by
# the j-th diagonal entry of gamma_diag.
print(np.matmul(X, gamma_diag))

[[0.12254032 0.39744181 0.24335024 0.16744729 0.10028339 0.18730717]
 [0.06617114 0.49606459 0.28506592 0.71158368 0.15274529 0.24352432]
 [0.04462183 0.64765466 0.34620832 0.23099542 0.03867378 0.61286314]
 [0.15338038 0.59158485 0.50369068 0.1268183  0.13258788 0.43851109]]


แต่ใน Python เราสามารถใช้เครื่องหมาย `*` ในการคูณตำแหน่งต่อตำแหน่งได้เลย (Broadcasting)

$\mathbf{X} \odot \gamma = $

In [15]:
# Element-wise product: the 1-D gamma is broadcast across every row of X,
# so no diagonal matrix is needed.
print(np.multiply(X, gamma))

[[0.12254032 0.39744181 0.24335024 0.16744729 0.10028339 0.18730717]
 [0.06617114 0.49606459 0.28506592 0.71158368 0.15274529 0.24352432]
 [0.04462183 0.64765466 0.34620832 0.23099542 0.03867378 0.61286314]
 [0.15338038 0.59158485 0.50369068 0.1268183  0.13258788 0.43851109]]


In [16]:
# Element-wise scale by gamma, then element-wise shift by beta; both 1-D
# vectors are broadcast across the rows of X.
print(np.add(np.multiply(X, gamma), beta))

[[0.51538536 0.85089509 0.51763486 0.49021379 0.72770043 0.61867241]
 [0.45901618 0.94951788 0.55935054 1.03435018 0.78016234 0.67488957]
 [0.43746688 1.10110794 0.62049294 0.55376192 0.66609082 1.04422839]
 [0.54622542 1.04503814 0.77797531 0.44958479 0.76000493 0.86987634]]
