In [1]:
# Written by Charles Zhang, Justin Cohen, Tiffany Gao

import numpy as np
import matplotlib.pyplot as plt
import copy

# Normalization Function
# Sigmoid also acceptable
def relu(x): 
	return x * (x > 0)
def sigmoid(x):
	return ( 1 / (1 + 2.7183 ** (-x)) )

N = 2
T = 20000
learning_rate = 0.5
init_factor = 1


def build_parameters(N, init_factor, input_size, output_size):
	W1 = ( np.random.rand(N, input_size) - 0.2 ) * init_factor # Weights of first layer (from input to hidden)
	b1 = ( np.random.rand(N) - 0.0 ) * init_factor # Biases of hidden layer
	W2 = ( np.random.rand(output_size, N) - 0.2 ) * init_factor # Weights of second layer (from hidden to output)
	b2 = ( np.random.rand(output_size) - 0.0 ) * init_factor # Biases of output layer
	return {
		'W1': W1,
		'b1': b1,
		'W2': W2,
		'b2': b2
	}


def forward_pass(parameters, x, input_size, N, output_size):
	W1 = parameters['W1']
	b1 = parameters['b1']
	W2 = parameters['W2']
	b2 = parameters['b2']
	
	o1 = np.zeros([N])
	for i in range(N):
		for j in range(input_size):
			o1[i] += x[j] * W1[i][j]
		o1[i] += b1[0]
	
	h = copy.deepcopy(o1)
	for i in range(N): # For each hidden layer neuron
		h[i] = sigmoid(o1[i]) # This should equal to activation values for our hidden layer
		
	o2 = np.zeros([output_size])
	for i in range(output_size):
		for j in range(N):
			o2[i] += h[j] * W2[i][j]
		o2[i] += b2[0]
	
	y = copy.deepcopy(o2)
	for i in range(len(o2)): # For each output layer neuron
		y[i] = relu(sigmoid(o2[i])) # This should equal the activation values for our output layer
		
		
	print("x: {}".format(x))
	print("hidden: {}".format(h))
	print("y: {}".format(y))
	return {
		'x': x,
		'o1': o1,
		'h': h,
		'o2': o2,
		'y': y
	}


def calcGrads(parameters, activations, y, gradients, input_size, N, output_size):
	error = activations['y'] - y # How much greater is what we got than what we should have gotten?
	hyperlambda = regularization_term(parameters)
	hyperlambda = 0
	o2_cutoff = activations['o2'] >= 0 # No gradient if output is negative
	# In general, these two evaluate to true (1)
	
	for i in range(0, output_size): # Only one output node for our examples; i = 0
		for j in range(0, N): # for each weight from hidden to output
			gradients['dW2'][i][j] += ( 2 * hyperlambda * parameters['W2'][i][j] ) + ( o2_cutoff * 2 * error * activations['h'][j] ) # calculus
			
	for i in range(0, output_size): # for each output level bias (there's one)
		gradients['db2'][i] += ( 2 * hyperlambda * parameters['b2'][i] ) + ( o2_cutoff * 2 * error ) # calculus
		
	for i in range(0, N):
		for j in range(0, input_size): # for each weight from input to hidden
			o1_cutoff = activations['o1'][j] >= 0
			o1_cutoff *= o2_cutoff # Gradient iff input hidden layer and output both activate
			gradients['dW1'][i][j] +=  ( 2 * hyperlambda * parameters['W1'][i][j] ) + ( o1_cutoff * 2 * error * parameters['W2'][0][i] * activations['x'][j] ) # calculus
			
	for i in range(0, N): # for each hidden layer bias
		o1_cutoff = activations['o1'][j] >= 0
		o1_cutoff *= o2_cutoff # Gradient iff input hidden layer and output both activate
		gradients['db1'] += ( 2 * hyperlambda * parameters['b1'][i] ) + ( o1_cutoff * 2 * error * parameters['W2'][0][i] ) # calculus

def regularization_term(parameters, lmbd=.01):
	term = 0
	for p in parameters:
		term += np.linalg.norm(parameters[p])
	return lmbd * term


def update_parameters(parameters, gradients, learning_rate):
	parameters['W1'] = parameters['W1'] - learning_rate * gradients['dW1']
	parameters['b1'] = parameters['b1'] - learning_rate * gradients['db1']
	parameters['W2'] = parameters['W2'] - learning_rate * gradients['dW2']
	parameters['b2'] = parameters['b2'] - learning_rate * gradients['db2']


def zero_gradients(N, input_size, output_size):
	return {
		'dW1': np.zeros([N, input_size]),
		'db1': np.zeros([N]),
		'dW2': np.zeros([output_size, N]),
		'db2': np.zeros([output_size])
	}


def run_xor_network(data):
	parameters = build_parameters(2, init_factor, 2, 1)

	for t in range(T):
		cost = 0
		gradients = zero_gradients(2, 2, 1)
		for io_pair in data:
			x, y = io_pair
			activations = forward_pass(parameters, x, 2, 2, 1)
			cost += (y - activations['y']) ** 2 # Square of diff between activation value and desired outcome
			cost += regularization_term(parameters) # Add very small regularization term
			calcGrads(parameters, activations, y, gradients, 2, 2, 1) # Calculate the gradient such that we may step in that direction
		cost /= len(data) # Average it out
		if t % 10 == 0:
			print('step {}: cost = {}'.format(t, cost))
		#for grad in gradients:
		#	gradients[grad] /= len(data) # Gradient is average of what every test case "wants"
		update_parameters(parameters, gradients, learning_rate)
		print(parameters['W2'])
		print(parameters['b2'])
		print(parameters['W1'])
		print(parameters['b1'])


xor_data = [
	(np.array([0, 0]), 0),
	(np.array([1, 0]), 1),
	(np.array([0, 1]), 1),
	(np.array([1, 1]), 0)
]

run_xor_network(xor_data)


x: [0 0]
hidden: [ 0.57882758  0.57882758]
y: [ 0.74289985]
x: [1 0]
hidden: [ 0.59307365  0.65168655]
y: [ 0.75081656]
x: [0 1]
hidden: [ 0.63132659  0.73969593]
y: [ 0.76229345]
x: [1 1]
hidden: [ 0.64488634  0.79460126]
y: [ 0.7682449]
step 0: cost = [ 0.34071813]
[[-0.06980691 -0.23654485]]
[-0.55558151]
[[-0.23079847 -0.07597539]
 [ 0.06677249  0.47935284]]
[-0.73033834 -0.61469668]
x: [0 0]
hidden: [ 0.32511942  0.32511942]
y: [ 0.34182166]
x: [1 0]
hidden: [ 0.27664936  0.33993805]
y: [ 0.34179427]
x: [0 1]
hidden: [ 0.30867542  0.43758054]
y: [ 0.33611822]
x: [1 1]
hidden: [ 0.26170624  0.45407624]
y: [ 0.33597917]
[[-0.06980691 -0.23654485]]
[-0.55558151]
[[-0.23079847 -0.07597539]
 [ 0.06677249  0.47935284]]
[-0.73033834 -0.61469668]
x: [0 0]
hidden: [ 0.32511942  0.32511942]
y: [ 0.34182166]
x: [1 0]
hidden: [ 0.27664936  0.33993805]
y: [ 0.34179427]
x: [0 1]
hidden: [ 0.30867542  0.43758054]
y: [ 0.33611822]
x: [1 1]
hidden: [ 0.26170624  0.45407624]
y: [ 0.33597917]
[[-0.0

KeyboardInterrupt: 

In [2]:
import numpy as np
import math

def relu(x): return x * (x > 0)
def sigmoid(x):
	return 1 / (1 + math.exp(-x))
def d_sigmoid(x):
	return sigmoid(x) * sigmoid(1 - x)

N = 2
T = 50000
learning_rate = .5
init_factor = .5


def build_parameters(N, init_factor=2, input_size=2, output_size=1):
	W1 = np.random.rand(N, input_size) * init_factor 
	b1 = np.random.rand(N) * init_factor
	W2 = np.random.rand(output_size, N) * init_factor
	b2 = np.random.rand(output_size) * init_factor
	return {
		'W1': W1,
		'b1': b1,
		'W2': W2,
		'b2': b2
	}


def forward_pass(parameters, x):
	W1 = parameters['W1']
	b1 = parameters['b1']
	W2 = parameters['W2']
	b2 = parameters['b2']
	o1 = np.matmul(W1, x) + b1
	h = relu(o1)
	o2 = np.matmul(W2, h) + b2
	y = sigmoid(o2)
	
	print("x: {}".format(x))
	print("hidden: {}".format(h))
	print("y: {}".format(y))
	
	return {
		'x': x,
		'o1': o1,
		'h': h,
		'o2': o2,
		'y': y
	}


def backpropagate(parameters, activations, y, gradients, lmbd):
	error = activations['y'] - y
	o2_cutoff = activations['o2'] >= 0
	o1_cutoff = activations['o1'] >= 0
	gradients['dW2'] = (2 * lmbd * parameters['W2']) + gradients['dW2'] + 2 * error * activations['h'] * d_sigmoid(activations['o2'])
	gradients['db2'] = (2 * lmbd * parameters['b2']) + gradients['db2'] + 2 * error * d_sigmoid(activations['o2'])
	gradients['dW1'] = (2 * lmbd * parameters['W1']) + gradients['dW1'] + (2 * error * (np.matmul(parameters['W2'].T, activations['x'].reshape(1,2))).T * o1_cutoff).T
	gradients['db1'] = (2 * lmbd * parameters['b1']) + gradients['db1'] + 2 * error * parameters['W2'][0] * o1_cutoff


def regularization_term(parameters, lmbd=.01):
	term = 0
	for p in parameters:
		term += np.linalg.norm(parameters[p])
	return lmbd * term


def update_parameters(parameters, gradients, learning_rate):
	parameters['W1'] = parameters['W1'] - learning_rate * gradients['dW1']
	parameters['b1'] = parameters['b1'] - learning_rate * gradients['db1']
	parameters['W2'] = parameters['W2'] - learning_rate * gradients['dW2']
	parameters['b2'] = parameters['b2'] - learning_rate * gradients['db2'] 


def zero_gradients(N, input_size=2, output_size=1):
	return {
		'dW1': np.zeros([N, input_size]),
		'db1': np.zeros([N]),
		'dW2': np.zeros([output_size, N]),
		'db2': np.zeros([output_size])
	}


def run_xor_network(data):
	parameters = build_parameters(2)
	lmbd = 0.0

	for t in range(T):
		cost = 0
		gradients = zero_gradients(2)
		for io_pair in data:
			x, y = io_pair
			activations = forward_pass(parameters, x)
			cost += (y - activations['y']) ** 2 + regularization_term(parameters, lmbd)
			backpropagate(parameters, activations, y, gradients, lmbd)
		cost /= len(data)
		if t % 10 == 0:
			print('step {}: cost = {}'.format(t, cost))
			print(activations['y'])
		for grad in gradients:
			gradients[grad] /= len(data)
		update_parameters(parameters, gradients, learning_rate)

	for io_pair in data:
		x, y = io_pair
		activations = forward_pass(parameters, x)
		print(str(x) + ':' + str(activations['y']))

xor_data = [
	(np.array([0, 0]), 0),
	(np.array([1, 0]), 1),
	(np.array([0, 1]), 1),
	(np.array([1, 1]), 0)
]



run_xor_network(xor_data)

x: [0 0]
hidden: [ 1.05580423  1.966956  ]
y: 0.9832289857326498
x: [1 0]
hidden: [ 1.233212    3.32744689]
y: 0.995586197114004
x: [0 1]
hidden: [ 2.52205664  2.61342246]
y: 0.9986755451181465
x: [1 1]
hidden: [ 2.69946441  3.97391335]
y: 0.9996554180776759
step 0: cost = 0.4915178572783897
0.9996554180776759
x: [0 0]
hidden: [ 0.37122266  1.56669305]
y: 0.939867493450209
x: [1 0]
hidden: [ 0.20403068  2.72570246]
y: 0.9687118022721181
x: [0 1]
hidden: [ 1.49180565  2.0110526 ]
y: 0.9904148397240555
x: [1 1]
hidden: [ 1.32461367  3.17006201]
y: 0.9951380682402066
x: [0 0]
hidden: [-0.          1.19362878]
y: 0.8628241757155048
x: [1 0]
hidden: [-0.          2.16280035]
y: 0.9272084726729289
x: [0 1]
hidden: [ 0.50330327  1.44387591]
y: 0.9372651838447071
x: [1 1]
hidden: [ 0.00525294  2.41304749]
y: 0.9389979270000185
x: [0 0]
hidden: [-0.          0.89033092]
y: 0.8040762550301349
x: [1 0]
hidden: [-0.          1.70183628]
y: 0.8689243263990887
x: [0 1]
hidden: [-0.          0.981081

KeyboardInterrupt: 