In [1]:
import numpy as np

In [2]:
N = 2  # # of training examples in a mini-batch
D = 3  # data dimension
K = 2  # output dimension

# Model parameters
W1 = np.random.randn(D, 4)
b1 = np.zeros((1, 4))
W2 = np.random.randn(4, K)
b2 = np.zeros((1, K))

print(W1)
print(b1)
print(W2)
print(b2)

[[-0.62545022 -0.67073086 -2.56050732 -0.53648352]
 [ 0.09981772  0.70368708  0.1334016  -0.74292812]
 [ 0.96670241 -0.98243246  0.99848907 -0.80788105]]
[[ 0.  0.  0.  0.]]
[[-1.14306854 -1.10152559]
 [-0.60629853 -1.28553044]
 [ 0.34331156  1.47603916]
 [ 0.27811724  1.88962966]]
[[ 0.  0.]]


In [3]:
X = np.random.randn(N, D)  # data matrix
y = np.random.randn(2, K)  # target matrix

print(X)
print(y)

[[-0.15810992 -2.062161   -0.93076613]
 [-1.95015237  1.06429298 -1.15135076]]
[[-2.04671856  1.00793003]
 [ 0.1286509  -1.90473463]]


In [4]:
# Hidden layer
Y1 = X.dot(W1) + b1
H1 = np.maximum(Y1, 0)

# Output layer
y_pred = H1.dot(W2) + b2

# Loss function
L = 2 * np.mean(y_pred - y)

print(Y1)
print(H1)
print(y_pred)
print(L)

[[-1.00672418 -0.43065199 -0.79961377  2.36880907]
 [ 0.21294497  3.18808096  3.98574666  1.1856859 ]]
[[ 0.          0.          0.          2.36880907]
 [ 0.21294497  3.18808096  3.98574666  1.1856859 ]]
[[ 0.65880664  4.47617189]
 [-0.47822689  3.79068596]]
5.63115493033


In [5]:
# We add a "d" in front of each variable to denote the gradient
# We will backprop in the following order: 
#    dy_pred -> dH1 & dW2 & db2 -> dY1 -> dW1 & db1

# Output Layer
dy_pred = 2*(y_pred - y) 
dH1 = dy_pred.dot(W2.T) 
dW2 = H1.T.dot(dy_pred)
db2 = dy_pred.sum(axis=0)

# Hidden Layer
dY1 = dH1 * (H1 > 0)
dW1 = X.T.dot(dY1); 
db1 = dY1.sum(axis=0)

print(dy_pred)
print(dH1)
print(dW2)
print(db2)
print(dY1)
print(dW1)
print(db1)

[[  5.4110504    6.93648373]
 [ -1.21375558  11.39084117]]
[[-13.82591582 -12.19777289  12.0961978   14.61229182]
 [-11.15989728 -13.90737485  16.39663135  21.18690503]]
[[ -0.25846314   2.4256223 ]
 [ -3.86955105  36.31492391]
 [ -4.83772224  45.40100718]
 [ 11.3786124   29.93716534]]
[  4.19729482  18.3273249 ]
[[ -0.          -0.           0.          14.61229182]
 [-11.15989728 -13.90737485  16.39663135  21.18690503]]
[[ 21.76350012  27.1215     -31.97592946 -43.62804136]
 [-11.8774003  -14.80152138  17.45081959  -7.58382404]
 [ 12.84895626  16.01226666 -18.87827404 -37.99418556]]
[-11.15989728 -13.90737485  16.39663135  35.79919684]


In [6]:
# Parameter update
lr = 1e-6  # learning rate

W1 -= lr * dW1
b1 -= lr * db1
W2 -= lr * dW2
b2 -= lr * db2

print(W1)
print(b1)
print(W2)
print(b2)

[[-0.62547198 -0.67075798 -2.56047535 -0.53643989]
 [ 0.0998296   0.70370188  0.13338415 -0.74292053]
 [ 0.96668956 -0.98244848  0.99850794 -0.80784306]]
[[  1.11598973e-05   1.39073748e-05  -1.63966313e-05  -3.57991968e-05]]
[[-1.14306828 -1.10152802]
 [-0.60629466 -1.28556676]
 [ 0.3433164   1.47599376]
 [ 0.27810586  1.88959973]]
[[ -4.19729482e-06  -1.83273249e-05]]
