## Create training data

In [2]:
import numpy as np

# Four training points: two binary features plus a third column that is
# always 1.
X = np.array([
    [0, 0, 1],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 1],
])

# One target per row of X; each equals the XOR of that row's first two columns.
y = np.array([[0], [1], [1], [0]])

# Fix the seed so the random draws that follow are reproducible.
np.random.seed(1)

## Create non-linear activation function (sigmoid)

In [3]:
def non_linear(x, deriv = False):
    """Sigmoid activation, or its slope when ``deriv`` is true.

    With ``deriv`` false, returns ``1/(1+exp(-x))``.

    With ``deriv`` true, returns ``x*(1-x)``. That expression is the
    sigmoid's slope only when ``x`` is already a sigmoid *output*, not the
    raw pre-activation value; the calls visible in this notebook pass
    ``l1`` or ``l2``, which are sigmoid outputs.
    """
    if not deriv:
        return 1/(1+np.exp(-x))
    return x*(1-x)

## Randomize weights

In [4]:
# rand() draws from [0, 1); scaling by 2 and shifting by -1 gives weights in [-1, 1).
weights0 = np.random.rand(3, 4)*2 - 1 # 3 inputs -> hidden layer of 4 neurons
weights1 = np.random.rand(4, 1)*2 - 1 # 4 hidden neurons -> 1 output neuron

In [5]:
weights0 # Rows: one per input; columns: one per hidden-layer neuron

array([[-0.16595599,  0.44064899, -0.99977125, -0.39533485],
       [-0.70648822, -0.81532281, -0.62747958, -0.30887855],
       [-0.20646505,  0.07763347, -0.16161097,  0.370439  ]])

In [6]:
weights1 # Each row corresponds to 1 l1 output (one weight per hidden neuron)

array([[-0.5910955 ],
       [ 0.75623487],
       [-0.94522481],
       [ 0.34093502]])

## Make a prediction

In [7]:
# Forward pass, layer 0 -> layer 1: the inputs are layer 0 as-is, and each
# hidden neuron applies the sigmoid to its weighted sum of the inputs.
l0 = X
l1 = non_linear(l0.dot(weights0))

In [8]:
l1 # Each row is 1 training point; each column is the output of 1 hidden neuron

array([[0.44856632, 0.51939863, 0.45968497, 0.59156505],
       [0.28639589, 0.32350963, 0.31236398, 0.51538526],
       [0.40795614, 0.62674606, 0.23841622, 0.49377636],
       [0.25371248, 0.42628115, 0.14321233, 0.41732254]])

In [9]:
# Forward pass, layer 1 -> layer 2: sigmoid of the weighted sum of hidden outputs.
l2 = non_linear(l1.dot(weights1))

In [10]:
l2 # Each row is the network's output for 1 training point, read as the probability that its label is 1

array([[0.47372957],
       [0.48895696],
       [0.54384086],
       [0.54470837]])

## Backpropagation

In [11]:
error = y - l2 # How far off each prediction is; positive means the output was too low

In [12]:
error # One row per training point

array([[-0.47372957],
       [ 0.51104304],
       [ 0.45615914],
       [-0.54470837]])

In [20]:
# Scale each error by x*(1-x) evaluated at the output l2. That factor is
# largest at 0.5 and shrinks towards 0 as the output nears 0 or 1, so
# confident predictions are adjusted less.
error_delta = non_linear(l2, deriv=True) * error

In [21]:
error_delta # One row per training point

array([[-0.11810546],
       [ 0.12769844],
       [ 0.11316304],
       [-0.13508831]])

In [22]:
# Push the scaled output error back through weights1 to get the error
# attributed to each hidden-layer output.
l1_error = error_delta.dot(weights1.T)

In [23]:
l1_error # Each row corresponds to 1 training point; each column to the error attributed to 1 l1 output

array([[ 0.0698116 , -0.08931546,  0.11163621, -0.04026629],
       [-0.07548197,  0.09657001, -0.12070373,  0.04353687],
       [-0.06689016,  0.08557784, -0.10696451,  0.03858124],
       [ 0.07985009, -0.10215849,  0.12768882, -0.04605634]])

In [24]:
# Scale each hidden-layer error by x*(1-x) evaluated at l1; as with
# error_delta, the factor shrinks as a hidden output nears 0 or 1.
l1_delta = non_linear(l1, deriv=True) * l1_error

In [25]:
l1_delta # Same layout as l1_error: rows are training points, columns are l1 outputs

array([[ 0.01726822, -0.02229526,  0.02772761, -0.00972897],
       [-0.0154265 ,  0.02113446, -0.02592628,  0.01087391],
       [-0.01615584,  0.02001969, -0.01942197,  0.00964382],
       [ 0.01511901, -0.02498445,  0.01566774, -0.01119926]])

### Adjusting weights

In [26]:
# Preview of the change that will be added to weights0.
# Rows: one per input. Columns: one per hidden-layer neuron.
# Each value is the sum over training points of
# (input_value * scaled_error_contribution_using_input_value).
l0.T.dot(l1_delta)

array([[-0.00103683, -0.00496476, -0.00375422, -0.00155545],
       [-0.00030749, -0.00384999, -0.01025854, -0.00032535],
       [ 0.00080489, -0.00612555, -0.0019529 , -0.00041051]])

In [None]:
# Apply one update step. Each change is the sum over training points of
# (layer input * scaled error). The updates are in place, so re-running this
# cell applies another step on top of the previous one.
weights1 += l1.T.dot(error_delta)
weights0 += l0.T.dot(l1_delta)

## Full code

In [33]:
import numpy as np

# Training set: two binary features plus a third column that is always 1.
X = np.array([
    [0, 0, 1],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 1],
])

# Targets: XOR of the first two columns of X.
y = np.array([[0], [1], [1], [0]])

np.random.seed(1)

# Initial weights are drawn from [-1, 1).
weights0 = np.random.rand(3, 4)*2 - 1 # 3 inputs -> hidden layer of 4 neurons
weights1 = np.random.rand(4, 1)*2 - 1 # 4 hidden neurons -> 1 output neuron

# The input layer is the same on every iteration, so bind it once.
# non_linear is not defined in this cell; it must already exist in the kernel.
l0 = X
for step in range(60000):
    # Forward pass through both layers.
    l1 = non_linear(l0.dot(weights0))
    l2 = non_linear(l1.dot(weights1))

    # Backward pass: output error, then the error attributed to the hidden
    # layer, each scaled by x*(1-x) at that layer's output. l1_error must be
    # computed before weights1 is modified below.
    error = y - l2
    error_delta = non_linear(l2, deriv=True) * error
    l1_error = error_delta.dot(weights1.T)
    l1_delta = non_linear(l1, deriv=True) * l1_error

    # Update step: sum over training points of (layer input * scaled error).
    weights1 += l1.T.dot(error_delta)
    weights0 += l0.T.dot(l1_delta)

    # Report the mean absolute error every 10000 iterations, starting at 0.
    if step % 10000 == 0:
        print(np.abs(error).mean())

0.4964100319027255
0.008584525653247157
0.005789459862507806
0.004629176776769983
0.003958765280273646
0.0035101225678616736


## My performance from memory

I had to check the original for the following:
* The formula for the sigmoid function
* How to calculate l1_error
* How to adjust the weights using the delta values