# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import itertools as it


# common activation functions and their derivatives
def id(x):
    """Identity activation: return the input unchanged.

    NOTE: this name shadows Python's builtin ``id`` for the rest of the
    module.  It is kept because the activation tables below refer to it
    by this name.

    Parameters
    ----------
    x : any
        Value (typically a NumPy array of node inputs) to pass through.

    Returns
    -------
    The very same object that was passed in (no copy is made).
    """
    return x

def id_deriv(x):
    """Derivative of the identity activation.

    The slope of ``f(x) = x`` is 1 everywhere, so the argument is ignored
    and a plain Python float is returned (not an array shaped like ``x``);
    callers that multiply by it rely on scalar broadcasting.
    """
    return 1.0

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated element-wise.

    Parameters
    ----------
    x : array_like
        Node inputs.  Scalars, lists and integer/boolean arrays are
        accepted; anything that is not already a floating-point array is
        promoted to float so the result is not truncated to integers.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` holding the sigmoid values.
        Floating-point input dtypes are preserved.  Entries with
        ``x <= -700`` are returned as exactly 0.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # zeros_like() inherits the dtype of its argument; with an integer
        # input every sigmoid value would be truncated to 0 on assignment.
        x = x.astype(float)

    y = np.zeros_like(x)
    # exp(-x) overflows float64 for very negative x; the sigmoid is
    # numerically 0 there, so those entries keep their initial 0.
    idx = x > -700
    y[idx] = 1 / (1 + np.exp(-x[idx]))
    return y

def sigmoid_deriv(x):
    """Derivative of the logistic sigmoid with respect to its input ``x``.

    Uses the identity ``s'(x) = s(x) * (1 - s(x))``, so the sigmoid is
    evaluated only once.  The result has the same shape as ``sigmoid(x)``.
    """
    s = sigmoid(x)
    return (1 - s) * s



## Seeded generator, so the random initial biases/weights drawn later are reproducible.
rnd = np.random.RandomState(seed=1)

## Worked example from https://mattmazur.com/2015/03/17/a-step-by-step-backpropagation-example/
## (sigmoid activations, loss = (e**2)/2, loss_deriv = e).
## One observation per row: feat holds the inputs, true holds the targets.
feat = np.array([[.05, .1]])
true = np.array([[.01, .99]])

## n observations with p features; m observations with q targets.
n, p = feat.shape
m, q = true.shape
if n != m:
    raise Exception('feat and true must have the same number of rows')


## Network / training hyper-parameters.
hidden_nodes = [2]   # width of each hidden layer, in order
max_steps = 1        # upper bound on training iterations
learn_rate = 0.5

## nodes[h] = number of nodes in layer h (input layer, hidden layers, output layer).
nodes = [p, *hidden_nodes, q]
layers = len(nodes)

## Activation per layer: the input layer passes data through unchanged,
## every other layer (including the output layer) uses the sigmoid.
## For a linear output layer, additionally set activation[-1] = id and
## activation_deriv[-1] = id_deriv.
activation = [id] + [sigmoid] * (layers - 1)
activation_deriv = [id_deriv] + [sigmoid_deriv] * (layers - 1)

## Data should normally be scaled to (0,1) during preprocessing; skipped here
## for simplicity, so the "scaled" arrays are just independent copies.
feat_sc = feat.copy()
true_sc = true.copy()


## pre-allocate
## NOTE: X, Y, DLDX and DLDY each get their OWN arrays.  A shallow list copy
## (X.copy()) duplicates only the list, leaving all four lists pointing at the
## same ndarray objects, so an in-place write through one of them would
## silently change the others.

## X[h,i,j] = input to node j of layer h for observation i; X[h] has shape n x nodes[h]
X = [np.zeros(shape=(n, width)) for width in nodes]

## Y[h,i,j] = output from node j of layer h for observation i; Y[h] has shape n x nodes[h]
Y = [np.zeros_like(x) for x in X]

## DLDX[h,i,j] = partial derivative of loss wrt X[h,i,j]; DLDX[h] has shape n x nodes[h]
DLDX = [np.zeros_like(x) for x in X]

## DLDY[h,i,j] = partial derivative of loss wrt Y[h,i,j]; DLDY[h] has shape n x nodes[h]
DLDY = [np.zeros_like(x) for x in X]

## B[h,j] = bias into node j of layer h; B[h] has shape nodes[h]
B = [rnd.rand(width) for width in nodes]
## the input layer has no bias
B[0] *= 0.0

## DLDB[h,j] = partial derivative of loss wrt B[h,j]; DLDB[h] has shape nodes[h]
DLDB = [np.zeros_like(b) for b in B]

## W_sh[h] = (nodes[h], nodes[h+1])
W_sh = list(zip(nodes[:-1], nodes[1:]))

## W[h,j,k] = weight of edge from node j of layer h to node k of layer h+1; W[h] has shape nodes[h] x nodes[h+1]
W = [rnd.rand(*ws) for ws in W_sh]

## DLDW[h,j,k] = partial derivative of loss wrt W[h,j,k]; DLDW[h] has shape nodes[h] x nodes[h+1]
DLDW = [np.zeros_like(w) for w in W]


## example from https://mattmazur.com/2015/03/17/a-step-by-step-backpropagation-example/, use sigmoid, loss=(e**2)/2 in loss, loss_deriv=e
## These fixed values REPLACE the random W and B drawn above.  Their shapes
## only fit a network with nodes == [2, 2, 2]; remove or adapt these two lines
## when changing the data or hidden_nodes.
W = [np.array([[0.15, 0.25], [0.20, 0.30]]), np.array([[0.40, 0.50], [0.45, 0.55]])]
B = [np.array([0.0, 0.0]), np.array([0.35, 0.35]), np.array([0.60, 0.60])]


## Training driver: feed the features into the input layer, then iterate
## forward pass -> error -> backward pass -> gradient step, at most max_steps times.
## NOTE(review): forward_propagate, error, backward_propagate and
## descend_gradient are not defined in the visible part of this file; they are
## presumably defined elsewhere and operate on the module-level X, Y, W, B and
## DLD* lists -- confirm before running this cell on its own.
X[0] = feat_sc  # input to layer 0 is the (scaled) feature matrix
for step in range(max_steps):
    forward_propagate()
    print(Y)  # debug output: layer outputs after this forward pass
    e, L = error()    # presumably e = error, L = loss (see example note above) -- confirm
    # stop early once the largest loss value drops below 1e-4
    if L.max() < 1e-4:
        break
    backward_propagate()
    descend_gradient()
## snapshot of the final layer's output (still in scaled units)
pred_sc = Y[-1].copy()
## Dump input, outgoing weights and output for each layer that has outgoing
## weights.  zip() stops at the shortest list, and W has one entry fewer than
## X and Y, so the last layer is not covered here; its output is printed after.
for (w,x,y) in zip(W,X,Y):
    print()
    print(x)
    print()
    print(w)
    print()
    print(y)
    print()
print(Y[-1])