In [1]:
import numpy as np
import scipy as sp
from sklearn import datasets
import mxnet as mx
# Load MNIST through MXNet's test utilities; later cells index the result
# by the 'train_data' and 'train_label' keys.
mnist = mx.test_utils.get_mnist()

In [4]:
# Training split: labels as-is, images flattened so that every sample
# becomes one row of features, i.e. x has shape (n_samples, -1).
y = mnist['train_label']
x = mnist['train_data']
x = x.reshape(len(x), -1)

# Simple NN

In [14]:
# Weights must be of the format (in, out) so that `x.dot(w)` maps
# (n, in) -> (n, out). Layer sizes: 784 inputs -> 100 hidden -> 10 classes.
shapes = [784, 100, 10]

# Draw the initial weights from a dedicated, seeded generator so that a
# "Restart & Run All" reproduces the same starting point, without touching
# NumPy's global random state.
SEED = 0
_init_rng = np.random.RandomState(SEED)
W = [
    _init_rng.randn(n_in, n_out)
    for n_in, n_out in zip(shapes[:-1], shapes[1:])
]

def affine(x, w):
    """
    Bias-free linear layer: the matrix product of inputs and weights.

    x : (n, m)
    w : (m, k)
    ---
    out : (n, k)
    """
    out = x.dot(w)
    return out

def affine_(d, x, w):
    """
    Backward pass of `affine`.

    d : (n, k)  gradient flowing back from the layer output
    x : (n, m)  input that was fed to the forward pass
    w : (m, k)  weights used in the forward pass
    ---
    returns a dict with
        'x' : (n, m)  gradient w.r.t. the input,   d . w^T
        'w' : (m, k)  gradient w.r.t. the weights, x^T . d
    """
    grad_input = d.dot(w.T)
    grad_weights = x.T.dot(d)
    return {'x': grad_input, 'w': grad_weights}

def sigmoid(x):
    """
    Element-wise logistic function, delegated to SciPy's `expit`.

    x : (n, m)
    ---
    out : (n, m)
    """
    activated = sp.special.expit(x)
    return activated

def sigmoid_(d, x):
    """
    Backward pass of `sigmoid`.

    x : (n, m)  input that was fed to the forward pass
    d : (n, m)  gradient flowing back from the layer output
    ---
    dx : (n, m)  d scaled element-wise by s * (1 - s), with s = expit(x)
    """
    s = sp.special.expit(x)
    local_grad = s * (1 - s)
    return d * local_grad

def softmax_ce(x, y):
    r"""
    Mean softmax cross-entropy loss.

    x : (n, m)  unnormalised class scores (logits)
    y : (n,)    integer class index of each row
    ---
    out : () [scalar]

    Equation is 1/n * \sum_i^n [ -log(e^{x_{i,y_i}} / \sum_j e^{x_{ij}}) ]
    which is equivalently:

        1/n * \sum_i^n [ log(\sum_j e^{x_{ij}}) - x_{i,y_i} ]

    The log-sum-exp term is evaluated on logits shifted by their row maximum.
    The shift cancels in the difference above, so the value is unchanged,
    but exp() can no longer overflow for large logits.
    """
    n = x.shape[0]
    # Largest shifted logit in every row is 0, so exp() stays within [0, 1].
    shifted = x - np.max(x, axis=1, keepdims=True)
    # denominator in original expression, after log
    log_denom = np.log(np.sum(np.exp(shifted), axis=1))
    return np.sum(log_denom - shifted[np.arange(n), y]) / n

def softmax_ce_(x, y):
    r"""
    Gradient of the mean softmax cross-entropy loss w.r.t. the logits.

    x : (n, m)  unnormalised class scores (logits)
    y : (n,)    integer class index of each row
    ---
    dx : (n, m)

    Back propagation for a single data point is:

    dL_i/dx_{ik} = -1_{k == y_i} + softmax(x_i)_k

    and because the loss is the mean over the n data points, each entry of
    the returned gradient is:

    dL/dx_{ik} = 1/n * ( -1_{k == y_i} + softmax(x_i)_k )

    The softmax is computed on logits shifted by their row maximum. The
    shift cancels between numerator and denominator, so the value is
    unchanged, but exp() can no longer overflow into inf / inf = nan.
    """
    n = x.shape[0]
    # Largest shifted logit in every row is 0, so exp() stays within [0, 1].
    exp = np.exp(x - np.max(x, axis=1, keepdims=True))
    grad = exp / np.sum(exp, axis=1, keepdims=True)
    # Subtract the one-hot target; `grad` is a fresh array, x is not modified.
    grad[np.arange(n), y] -= 1
    return grad / n

  """
  """


In [25]:
iterations = 10
# Gradient-descent step size, shared by both weight updates (previously the
# literal 10 was repeated on each update line).
learning_rate = 10
for i in range(iterations):
    # Forward pass: affine -> sigmoid -> affine -> softmax cross-entropy.
    o1 = affine(x, W[0])
    o2 = sigmoid(o1)
    o3 = affine(o2, W[1])
    loss = softmax_ce(o3, y)
    # The loss (and o3) belong to the weights *before* this iteration's update.
    print("loss at {} : {}".format(i, loss))
    # Backward pass: `d` carries the upstream gradient from layer to layer;
    # each affine_ call returns a dict holding both the input gradient ('x')
    # and the weight gradient ('w').
    d = softmax_ce_(o3, y)
    d = affine_(d, o2, W[1])
    dw1 = d['w']
    d = d['x']
    d = sigmoid_(d, o1)
    d = affine_(d, x, W[0])
    dw0 = d['w']
    # In-place update: W keeps its new values after the cell finishes, so
    # re-running this cell continues training rather than starting over.
    W[0] -= dw0 * learning_rate
    W[1] -= dw1 * learning_rate

loss at 0 : 0.8513821343430952
loss at 1 : 0.8882258889421258
loss at 2 : 0.8012652944963672
loss at 3 : 0.8246505679153368
loss at 4 : 0.755178812539145
loss at 5 : 0.7695421554257007
loss at 6 : 0.7139919293885276
loss at 7 : 0.7211841141563288
loss at 8 : 0.67806170978656
loss at 9 : 0.6801962994386499


In [27]:
# Predicted class (highest logit) next to the true label for the first ten
# training samples.
print(o3[:10].argmax(axis=1), y[:10])

[5 0 4 1 9 2 1 3 1 4] [5 0 4 1 9 2 1 3 1 4]


# Simple binary NN