In [108]:
# adapted from https://iamtrask.github.io/2015/07/27/python-network-part2/
import numpy as np # linear algebra

# Simple Gradient Descent Perceptron
![](http://mblogthumb2.phinf.naver.net/MjAxNzAzMjNfMjI4/MDAxNDkwMjI3NzU2MTAw.CnHFk7Sv7sxrQ7LPj3MiIkXfLeVslL04pNQZ0aEAjcMg.54JHq2BlIqLZKjik-Q1MyKheooaPZ9yF-bBVvKSu73Ug.PNG.htk1019/image.png?type=w800)

We will use a very rudimentary version of gradient descent to train a basic classification perceptron, modeled as in the picture above, on a *very* simple dataset.

In the picture above, $\theta$ is the weighted sum of the inputs: $\theta = \sum_{i=1}^{2} x_i \cdot w_i$

In [142]:
# logistic sigmoid activation
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of x.

    np.exp is applied elementwise, so x may be a scalar or a numpy array.
    """
    return 1 / (1 + np.exp(-x))

In [143]:
# Derivative formula from https://theclevermachine.wordpress.com/2014/09/08/derivation-derivatives-for-common-neural-network-activation-functions/
# NOTE: the argument is the sigmoid's OUTPUT, not its raw input.
def sigmoid_derivative(sigmoid_output):
    """Return the sigmoid's slope given its output s, computed as s * (1 - s)."""
    slope = (1 - sigmoid_output) * sigmoid_output
    return slope

In [144]:
# Input dataset: four samples (rows) with two binary features (columns).
# Think of a classification perceptron that only ever sees 0 and 1 as inputs.
X = np.array([
    [0, 1],
    [0, 1],
    [1, 0],
    [1, 0],
])

In [145]:
# Output dataset: one target per sample, stored as a (4, 1) column.
# Each target equals the sample's first input value, so a 1 in the first
# position means a 1 output.
y = np.array([[0], [0], [1], [1]])

In [146]:
# Seed numpy's global random number generator so that the random draws
# made later in the notebook (and every result derived from them) are
# the same on each run.
np.random.seed(1)

In [147]:
# Draw the 2 starting weights (one per input feature) uniformly from [-1, 1),
# which centres them on 0.
synapse_0 = np.random.random((2, 1)) * 2 - 1
print('original synapse weights\n', synapse_0)

original synapse weights
 [[-0.16595599]
 [ 0.44064899]]


In [148]:
# Forward pass: the input layer is simply the dataset.
layer_0 = X
# Worked example with the initial weights w = [-0.166, 0.441]:
#   row [0, 1]: -0.166*(0) + 0.441*(1) =  0.441 -> sigmoid( 0.441) = 0.608
#   row [0, 1]: same as above
#   row [1, 0]: -0.166*(1) + 0.441*(0) = -0.166 -> sigmoid(-0.166) = 0.459
#   row [1, 0]: same as above
layer_1 = sigmoid(layer_0.dot(synapse_0))
# Signed error: prediction minus target (a sigmoid output is never negative,
# but the error is whenever the prediction is below the target).
layer_1_error = layer_1 - y
print(
    'our layer 1 predictions\n',
    layer_1,
    '\n our layer 1 error compared to actual outputs\n',
    layer_1_error,
)

our layer 1 predictions
 [[0.60841366]
 [0.60841366]
 [0.45860596]
 [0.45860596]] 
 our layer 1 error compared to actual outputs
 [[ 0.60841366]
 [ 0.60841366]
 [-0.54139404]
 [-0.54139404]]


In [149]:
# Weight each prediction error by the slope of the sigmoid at that prediction,
# then project the result back onto the inputs to get one adjustment per
# weight. This is what drives our "descent".
layer_1_delta = sigmoid_derivative(layer_1) * layer_1_error
synapse_derivative = layer_0.T.dot(layer_1_delta)

print('how much we should adjust our synapse values by\n', synapse_derivative)
# In-place update: re-running this cell applies the step to the weights again.
synapse_0 -= synapse_derivative
print('new synapse value\n', synapse_0)

how much we should adjust our synapse values by
 [[-0.2688417 ]
 [ 0.28990482]]
new synapse value
 [[0.10288571]
 [0.15074416]]


In [150]:
# The inputs never change, so bind the input layer once, outside the loop.
layer_0 = X

# Repeat the forward pass + weight update 10000 times. The loop variable is
# named `step` rather than `iter` so the builtin iter() is not shadowed in the
# notebook's namespace after this cell runs.
for step in range(10000):

    # forward propagation
    layer_1 = sigmoid(np.dot(layer_0, synapse_0))

    # how much did we miss? (prediction minus target)
    layer_1_error = layer_1 - y

    # multiply how much we missed by the
    # slope of the sigmoid at the values in layer_1
    layer_1_delta = layer_1_error * sigmoid_derivative(layer_1)
    synapse_0_derivative = np.dot(layer_0.T, layer_1_delta)

    # update weights (in place)
    synapse_0 -= synapse_0_derivative

print("Output After Training:")
# Remember the weighted sums are passed through the sigmoid function.
# The sigmoid only approaches its asymptotes at 0 and 1 and never reaches
# them, which is why the synapse weights keep growing in magnitude while the
# predictions never quite become perfect.
# Note: layer_1 holds the predictions of the last forward pass, i.e. the ones
# made just before the final weight update.
print('Synapse weights\n', synapse_0)
print('Synapse predictions\n', layer_1)

Output After Training:
Synapse weights
 [[ 5.28321694]
 [-5.28316892]]
Synapse predictions
 [[0.00505094]
 [0.00505094]
 [0.99494931]
 [0.99494931]]
