In [80]:

# This code was informed by Justin Johnsons Deep Learning for Vision lectures at Michigan, available on Youtube,
# Andrej Karpathy/Justin Johnson/Fei-Fei Li's cs231n lectures at Stanford and The Independent Code's video on Youtube, Neural Network from Scratch.

import numpy as np
import tensorflow as tf

def GenerateTrainingData(min,max):
    '''GenerateTrainingData: example of use: DesignMatrix,TrainingValues= GenerateTrainingData(1,11)
    generates data points from 1 to 10 in steps of 1 and uses them to calculate dependent values using 
    the function defined in ModelFuction()'''
    DesignMatrixString=[]  # Generate as string and then convert, below, to array 
    TrainingValues=[]
    for data in range(min,max):
        DesignMatrixString.append(data)
    DesignMatrix=np.asarray(DesignMatrixString)
    TrainingValues=ModelFunction(DesignMatrix)
    return(DesignMatrix,TrainingValues)  

def ModelFunction(DesignMatrix):
    return(5.0+DesignMatrix*3.0)


class layer:  # set generic features to be inherited
  def __init__(self, features_in=1, nodes_in_layer=1):
    self.features_in = feature_in
    self.nodes_in_layer = nodes_in_layer




class dense_layer(): 
  ''' dense_layer: A layer is treated as an independent entity with its own feature vector input, being the original feature vector for 
      the first hidden layer, or the output of the previous layer for every subsequent layer. The output is a vector to be the 
      input for the next layer. The dense layer only performs the matrix multipliation. An activation function must be included as
      a subsequent layer, if desired. '''
  '''Could set up a layer class and inherit self.feature_in and self.nodes_in_layer. 
    Also Forward and reverse pass could be inherited. 
     but seems unnecessary as long as have only one layer type. '''
  def __init__(self, number_of_features=1, nodes_in_layer=1): #default to a 1 node layer, with one input.
    self.number_of_features = number_of_features
    self.nodes_in_layer = nodes_in_layer
    self.weights=np.random.randn(nodes_in_layer, number_of_features)  # weights initialised from a normal distribution.
    self.bias = np.random.randn(nodes_in_layer, 1)
   

  def forward_pass(self,feature_vector): # returns final value of the layer
    self.feature_vector = feature_vector # needed for backward_pass
    return np.dot(self.weights, feature_vector) + self.bias
   

  def backward_pass(self, upstream_gradient, learning_rate): 
    '''dense_layer backward_pass: updates the weights of the layer and the upstreamstream gradient for the next layer 
       (i.e. downstream gradient of this layer - see Justin Johnson's Deep Learning for Vision lecture number 6 for dicussion).
       Currently hardcoded to run gradient descent. Other options to follow!'''
    dL_dw = np.dot(upstream_gradient, np.transpose(self.feature_vector)) # loss function with respect to weights
    print("upstream_gradient, learning_rate", upstream_gradient, learning_rate)
    print("dL_dw",dL_dw )
    self.weights += -learning_rate * dL_dw
    self.bias += -learning_rate * upstream_gradient
    dL_dinput = np.transpose(self.weights).dot(upstream_gradient)  # sensitivity of loss function to feature vector of the layer 
    return dL_dinput

class sigmoid_layer():
  def __init__(self):
    pass

  def forward_pass(feature_vector):
    self.sigmoid = sigmoid(feature_vector)  # save for backward pass
    return self.sigmoid

  
  def backward_pass(self, upstream_gradient):
    local_gradient = (1 - self.sigmoid)/self.sigmoid
    print("self.sigmoid in backward_pass, local_gradient", self.sigmoid, local_gradient)
    dL_dinput = upstream_gradient * local_gradient  
    return dL_dinput

def sigmoid(x):
    ''' sigmoid: sigmoid function, i.e. 1/(1+ exp(-x)) acting on the input value. If 
  the input is x, then exp(-x) has a large positive x then exp(-x) becomes very small
  and 1/(1+exp(-x)) suffers from rounding errors/numerical instability. Therefore
  need to evaluate as exp(x)/(1+exp(x)) for positive x. the numpy where command is a 
  succint way to code this. '''
    return np.where(x < 0, np.exp(x)/(1 + np.exp(x)), 1/(1 + np.exp(-x)))

# loss functions aka error functions:
def mse(y, y_hat): 
  '''mse: mean squared error loss: 0.5(y_hat - y)squared. y_hat is the estimate from the network, y is the ground truth'''
  return 0.5*np.square(y_hat - y)

def mse_gradient(y, y_hat):
  '''gradient of mse: mean squared error loss: (y_hat - y). y_hat is the estimate from the network, y is the ground truth'''
  return y_hat - y

# testing modules.
def test_mse():
  print("test the mean squared error loss function (mse())\n")
  for y, y_hat in [(1,1), (10,1)]:
    print("y_hat: {}\n y: {} \n mse: {}\n".format(y_hat, y, mse(y, y_hat)))
  return

def test_mse_gradient():
  print("test the calculation of the gradient of the mean squared error loss function (mse_gradient())\n")
  for y, y_hat in [(1,1), (10,1)]:
    print("y_hat: {}\n y: {} \n mse_gradient: {}\n".format(y_hat, y, mse_gradient(y, y_hat)))
  return


def test_sigmoid():
  y = sigmoid(0)
  z = sigmoid(1000)
  omega = sigmoid(-1000)
  a = sigmoid(1)
  b = sigmoid(-1)
  print("test sigmoid inputs: 0, 1000, -1000, 1, -1: ouputs:", y, z, omega, a, b)


def test_sigmoid_layer_backprop():
  print("\n testing sigmoid backprop")
  test = sigmoid_layer()
  test.sigmoid = 2
  local_gradient = (1 - test.sigmoid)/test.sigmoid
  upstream_gradient = np.array([1,2,3])
  downstream_gradient = test.backward_pass(upstream_gradient)
  print("local gradient: {} \n value of sigmoid function: {} \n".format(local_gradient, test.sigmoid)) 
  print("upstream gradent: {} \n downstream grad. :{}\n".format(upstream_gradient,downstream_gradient ))




# Test layers. 
'''
l0 = dense_layer(2,1)
print("bias: ", l0.bias)
print("weights: ", l0.weights)
forward = l0.forward_pass([[1],[1]])
print("feature_vector: ", l0.feature_vector)
print(forward)
gradient= l0.backward_pass(1, 0.1)
print("gradient:", gradient)

l0 = dense_layer(2,1)
print("bias: ", l0.bias)
print("weights: ", l0.weights)
forward = l0.forward_pass([[1],[2]])
print("feature_vector: ", l0.feature_vector)
print(forward)
gradient= l0.backward_pass(2, 0.1)
print("gradient:", gradient)

#print(dense_layer.backward_pass.__doc__)   
#test_sigmoid()
#test_sigmoid_layer_backprop()
#test_mse()
#test_mse_gradient()
'''

network = [ dense_layer(1,1)]
DesignMatrix,TrainingValues= GenerateTrainingData(1,11)
#network = [dense_layer(), sigmoid_layer(), dense_layer()]


X=np.transpose(DesignMatrix)
Y=np.transpose(TrainingValues)
print(X,Y)
epochs = 150
learning_rate = 0.1
loss = 0
# define the error function
error_function=mse  
error_grad=mse_gradient

for epoch in range(epochs):
  for x, y in zip(X,Y): # pairing feature vector and dependent variables and then iterating over each pairing. Zip zips the two vectors together into a list of tuples.
    next_input = x
    print("epoch", epoch)
    print("next input: ", next_input)
    for layer in network:  # for each data entry, x, y, we iterate through the whole network
      next_input = layer.forward_pass(next_input)  # output of one layer is to be the input of the next
    loss += error_function(y,next_input) #next_input is at this point y_hat, the predicted value; this line sums over all the data

  
    grad=1/len(Y)*error_grad(y,next_input)  
    for layer in reversed(network):
      grad = layer.backward_pass(grad, learning_rate)  # weights updated on a per data pair basis, i.e. stochastic gradient descent.

  loss /= len(Y)
  print("epoch {} of {},  error = {}".format(epoch + 1, epochs, loss))
for layer in network:
  print("bias: {} \n weight {} \n".format(layer.bias,layer.weights))

'''
for epoch in range(epochs):
  print("epoch", epoch)
  for layer in network:  # for each data entry, x, y, we iterate through the whole network
    next_input = layer.forward_pass(X)  # output of one layer is to be the input of the next
  loss += error_function(Y,next_input) #next_input is at this point y_hat, the predicted value; this line sums over all the data

  
  grad=1/len(Y)*error_grad(Y,next_input)  
  for layer in reversed(network):
    grad = layer.backward_pass(grad, learning_rate)

  loss /= len(Y)
  print("epoch {} of {},  error = {}".format(epoch + 1, epochs, loss))'''






[ 1  2  3  4  5  6  7  8  9 10] [ 8. 11. 14. 17. 20. 23. 26. 29. 32. 35.]
epoch 0
next input:  1
upstream_gradient, learning_rate [[-0.60334413]] 0.1
dL_dw [[-0.60334413]]
epoch 0
next input:  2
upstream_gradient, learning_rate [[-0.73972456]] 0.1
dL_dw [[-1.47944911]]
epoch 0
next input:  3
upstream_gradient, learning_rate [[-0.83639115]] 0.1
dL_dw [[-2.50917344]]
epoch 0
next input:  4
upstream_gradient, learning_rate [[-0.86131311]] 0.1
dL_dw [[-3.44525245]]
epoch 0
next input:  5
upstream_gradient, learning_rate [[-0.78899844]] 0.1
dL_dw [[-3.94499221]]
epoch 0
next input:  6
upstream_gradient, learning_rate [[-0.61851748]] 0.1
dL_dw [[-3.7111049]]
epoch 0
next input:  7
upstream_gradient, learning_rate [[-0.3872136]] 0.1
dL_dw [[-2.71049522]]
epoch 0
next input:  8
upstream_gradient, learning_rate [[-0.16404944]] 0.1
dL_dw [[-1.31239549]]
epoch 0
next input:  9
upstream_gradient, learning_rate [[-0.01473598]] 0.1
dL_dw [[-0.13262385]]
epoch 0
next input:  10
upstream_gradient, lea

'\nfor epoch in range(epochs):\n  print("epoch", epoch)\n  for layer in network:  # for each data entry, x, y, we iterate through the whole network\n    next_input = layer.forward_pass(X)  # output of one layer is to be the input of the next\n  loss += error_function(Y,next_input) #next_input is at this point y_hat, the predicted value; this line sums over all the data\n\n  \n  grad=1/len(Y)*error_grad(Y,next_input)  \n  for layer in reversed(network):\n    grad = layer.backward_pass(grad, learning_rate)\n\n  loss /= len(Y)\n  print("epoch {} of {},  error = {}".format(epoch + 1, epochs, loss))'