In [3]:
"""
Simplistic implementation of the two-layer neural network.
Training method is stochastic (online) gradient descent with momentum.

As an example it computes XOR for given input.

Some details:
- tanh activation for hidden layer
- sigmoid activation for output layer
- cross-entropy loss

Less than 100 lines of active code.

"""

import numpy as np
import time

n_hidden = 10
n_in = 10
n_out = 10
n_samples = 300

learning_rate = 0.01
momentum = 0.9

np.random.seed(0)


In [6]:

def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))

def tanh_prime(x):
    return  1 - np.tanh(x)**2

def train(x, t, V, W, bv, bw):

    # forward
    A = np.dot(x, V) + bv
    Z = np.tanh(A)

    B = np.dot(Z, W) + bw
    Y = sigmoid(B)

    # backward
    Ew = Y - t
    Ev = tanh_prime(A) * np.dot(W, Ew)

    dW = np.outer(Z, Ew)
    dV = np.outer(x, Ev)

    loss = -np.mean ( t * np.log(Y) + (1 - t) * np.log(1 - Y) )

    # Note that we use error for each layer as a gradient
    # for biases

    return  loss, (dV, dW, Ev, Ew)

def predict(x, V, W, bv, bw):
    A = np.dot(x, V) + bv
    B = np.dot(np.tanh(A), W) + bw
    return (sigmoid(B) > 0.5).astype(int)


In [28]:
# Setup initial parameters
# Note that initialization is cruxial for first-order methods!

V = np.random.normal(scale=0.1, size=(n_in, n_hidden))
W = np.random.normal(scale=0.1, size=(n_hidden, n_out))

bv = np.zeros(n_hidden)
bw = np.zeros(n_out)

params = [V,W,bv,bw]

# Generate some data

X = np.random.binomial(1, 0.5, (n_samples, n_in))
T = X ^ 1

# Train
for epoch in range(100):
    err = []
    upd = [0]*len(params)

    t0 = time.clock()
    for i in range(X.shape[0]):
        loss, grad = train(X[i], T[i], *params)

        for j in range(len(params)):
            params[j] -= upd[j]

        for j in range(len(params)):
            upd[j] = learning_rate * grad[j] + momentum * upd[j]

        err.append( loss )

    print("Epoch: %d, Loss: %.8f, Time: %.4f" % (epoch, np.mean( err ), time.clock()-t0 ))

# Try to predict something

x = np.random.binomial(1, 0.5, n_in)




Epoch: 0, Loss: 0.45465070, Time: 0.0253
Epoch: 1, Loss: 0.13697961, Time: 0.0350
Epoch: 2, Loss: 0.06206941, Time: 0.0316
Epoch: 3, Loss: 0.04092746, Time: 0.0313
Epoch: 4, Loss: 0.03159958, Time: 0.0339
Epoch: 5, Loss: 0.02592744, Time: 0.0331
Epoch: 6, Loss: 0.02199575, Time: 0.0308
Epoch: 7, Loss: 0.01907812, Time: 0.0308
Epoch: 8, Loss: 0.01682099, Time: 0.0302
Epoch: 9, Loss: 0.01502363, Time: 0.0303
Epoch: 10, Loss: 0.01356039, Time: 0.0327
Epoch: 11, Loss: 0.01234775, Time: 0.0342
Epoch: 12, Loss: 0.01132776, Time: 0.0339
Epoch: 13, Loss: 0.01045887, Time: 0.0335
Epoch: 14, Loss: 0.00971052, Time: 0.0330
Epoch: 15, Loss: 0.00905971, Time: 0.0330
Epoch: 16, Loss: 0.00848887, Time: 0.0335
Epoch: 17, Loss: 0.00798436, Time: 0.0317
Epoch: 18, Loss: 0.00753542, Time: 0.0307
Epoch: 19, Loss: 0.00713347, Time: 0.0325
Epoch: 20, Loss: 0.00677160, Time: 0.0339
Epoch: 21, Loss: 0.00644415, Time: 0.0337
Epoch: 22, Loss: 0.00614650, Time: 0.0324
Epoch: 23, Loss: 0.00587477, Time: 0.0331
Ep

In [30]:
print("XOR prediction:")
print(x)
print(predict(x, *params))  

XOR prediction:
[1 0 1 1 1 1 0 1 0 0]
[0 1 0 0 0 0 1 0 1 1]


tuple