In [2]:
import numpy as np

# Network dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
# i.e. the layer widths go 1000 -> 100 -> 10.
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed NumPy's legacy global generator (the one np.random.randn draws from)
# so that a fresh "Restart Kernel -> Run All" reproduces exactly the same
# data, initial weights and, downstream, the same loss values.
SEED = 0
np.random.seed(SEED)

# Create random input and target data (standard-normal samples).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize the weights of the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

# Sanity-check the shapes before training.
print('x shape is: ', x.shape)
print('y shape is: ', y.shape)
print('w1 shape is:', w1.shape)
print('w2 shape is', w2.shape)

x shape is:  (64, 1000)
y shape is:  (64, 10)
w1 shape is: (1000, 100)
w2 shape is (100, 10)


In [7]:
# Train the two-layer network (linear -> ReLU -> linear) with plain
# full-batch gradient descent on a sum-of-squares loss, computing the
# gradients by hand.
#
# NOTE: w1 and w2 are updated IN PLACE below, so this cell is not idempotent:
# re-running it without re-running the initialization cell resumes training
# from the already-trained weights (the loss printed at step 0 will then be
# tiny). Re-run the initialization cell first to start from scratch.
learning_rate = 1e-6
num_steps = 500   # number of gradient-descent iterations
log_every = 100   # print the loss every `log_every` iterations

for t in range(num_steps):
    # Forward pass: compute predicted y.
    h = x.dot(w1)                # first linear layer: x @ w1
    h_relu = np.maximum(h, 0)    # ReLU activation: keeps h where h > 0, else 0
    y_pred = h_relu.dot(w2)      # second linear layer: h_relu @ w2

    # Compute and periodically print the loss (sum of squared errors).
    loss = np.square(y_pred - y).sum()
    if t % log_every == 0:
        print(t, loss)

    # Backprop: apply the chain rule layer by layer, from the loss backwards,
    # to get the gradient of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)        # d(loss)/d(y_pred)
    grad_w2 = h_relu.T.dot(grad_y_pred)     # d(loss)/d(w2)
    grad_h_relu = grad_y_pred.dot(w2.T)     # d(loss)/d(h_relu)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0                       # ReLU passes no gradient where h < 0
    grad_w1 = x.T.dot(grad_h)               # d(loss)/d(w1)

    # On the final iteration, report the gradient shapes. The check is derived
    # from num_steps so it stays correct if the iteration count is changed.
    if t == num_steps - 1:
        print('shape of grad_y_pred: ', grad_y_pred.shape)
        print('shape of grad_w2', grad_w2.shape)
        print('shape of grad_h_relu', grad_h_relu.shape)
        print('shape of grad_w1: ', grad_w1.shape)

    # Gradient-descent update (in place; see the note at the top of the cell).
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2


0 1.5979496282193063e-23
100 1.3188316470774099e-23
200 1.0746238008581897e-23
300 9.280000428139478e-24
400 8.137797103859909e-24
shape of grad_y_pred:  (64, 10)
shape of grad_w2 (100, 10)
shape of grad_h_relu (64, 100)
shape of grad_w1:  (1000, 100)
