In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import truncnorm

class Network:
    """Small fully connected feed-forward network trained one sample at a time.

    Samples are row vectors: each layer computes ``activate(np.dot(x, W) + b)``
    with ``W`` of shape (fan_in, fan_out) and ``b`` of shape (1, fan_out).
    This is contrary to most NN graphs, but it avoids rearranging the weights
    for the dot product.

    Attributes:
        W: list of weight matrices, one per layer, input side first.
        B: list of bias row vectors, aligned with ``W``.
        neuron_cache: values recorded by the last ``feedforward`` call: the
            network input followed by every activated layer output.
        output_cache: pre-activation values of every layer recorded by the
            last ``feedforward`` call.
    """

    def __init__(self, input_size, output_size=1, hidden_layers=1, neurons=1, init='normal', activation='sigmoid', learning_rate=0.1):
        """Create the weights and biases.

        Args:
            input_size: number of input features.
            output_size: number of output neurons.
            hidden_layers: number of hidden layers, each ``neurons`` wide.
            neurons: width of every hidden layer.
            init: 'ones' (unit weights, zero biases), 'truncated_normal', or
                anything else for standard-normal weights (case-insensitive).
            activation: 'relu', 'softmax', or anything else for sigmoid
                (case-insensitive); applied to every layer.
            learning_rate: step size used by ``train``.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.layers = hidden_layers
        self.neurons = neurons
        self.activation = activation
        self.learning_rate = learning_rate

        def truncated_normal(mean=0, sd=1, low=0, upp=10):
            # scipy expects the clipping bounds in standardised units
            return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

        # (fan_in, fan_out) of every weight matrix, input side first
        widths = [input_size] + [neurons] * hidden_layers + [output_size]
        shapes = list(zip(widths[:-1], widths[1:]))

        # Create initial guess for weights and biases.
        # All weights are drawn before any bias so that a seeded run consumes
        # the random stream in a fixed, documented order.
        scheme = init.lower()
        if scheme == 'ones':
            self.W = [np.ones(shape) for shape in shapes]
            self.B = [np.zeros((1, fan_out)) for fan_in, fan_out in shapes]
        else:
            if scheme == 'truncated_normal':
                self.W = []
                for fan_in, fan_out in shapes:
                    rad = 1 / np.sqrt(fan_in)
                    # NOTE(review): the distribution is centred on mean=2 but
                    # clipped to [-rad, rad], which skews the weights towards
                    # +rad; mean=0 may have been intended - confirm before changing.
                    distribution = truncated_normal(mean=2, sd=1, low=-rad, upp=rad)
                    self.W.append(distribution.rvs((fan_in, fan_out)))
            else:
                self.W = [np.random.randn(fan_in, fan_out) for fan_in, fan_out in shapes]
            self.B = [np.random.randn(1, fan_out) for fan_in, fan_out in shapes]

        # Caches filled by feedforward and consumed by backprop
        self.neuron_cache = []
        self.output_cache = []

    def activate(self, x):
        """Apply the configured activation function to ``x``."""
        name = self.activation.lower()
        if name == 'relu':
            return Activation.reLU(x)
        elif name == 'softmax':
            return Activation.softmax(x)
        else:
            return Activation.sigmoid(x)

    def deriv_activation(self, x):
        """Return the derivative of the activation evaluated at pre-activation ``x``."""
        if self.activation.lower() == 'relu':
            return Activation.deriv_reLU(x)
        else:
            # NOTE(review): 'softmax' also lands here and is differentiated as
            # if it were a sigmoid; no softmax derivative is implemented.
            return Activation.deriv_sigmoid(x)

    # Uses previous neuron values (input) to compute the values in the next layer (output)
    def feedforward(self, input):
        """Propagate ``input`` through every layer and return the final activation.

        Overwrites ``neuron_cache`` and ``output_cache`` with the values of this pass.
        """
        self.neuron_cache = [input]
        self.output_cache = []
        for weights, bias in zip(self.W, self.B):
            output = np.dot(input, weights) + bias
            self.output_cache.append(output)
            neurons = self.activate(output)
            self.neuron_cache.append(neurons)
            input = neurons
        return neurons

    def loss(self, y_true, y_pred):
        """Mean squared error; ``y_pred`` is reshaped to the shape of ``y_true``."""
        y_pred = y_pred.reshape(y_true.shape)
        return ((y_true - y_pred)**2).mean()

    # Computes the loss between prediction and true training output, then propagates the gradient backwards
    def backprop(self, y_true, y_pred, learning_rate, verbose=True):
        """Run one gradient-descent step using the caches of the last ``feedforward``.

        Args:
            y_true: expected output for the sample passed to ``feedforward``.
            y_pred: output returned by that ``feedforward`` call.
            learning_rate: step size of the update.
            verbose: when True, print the weight increment of every layer.
        """
        # derivative of the squared error w.r.t. the activated network output
        grad = -2 * (y_true - y_pred)
        for layer in range(len(self.W) - 1, -1, -1):
            weights = self.W[layer]
            h = np.atleast_2d(self.neuron_cache[layer])
            o = self.output_cache[layer]

            # error signal at this layer's pre-activation
            delta = grad * self.deriv_activation(o)

            # Gradient w.r.t. this layer's input, handed to the layer below.
            # It is taken before the in-place update so it uses the weights
            # that produced the forward pass.
            grad = np.dot(delta, weights.T)

            dw = -learning_rate * np.dot(h.T, delta)
            db = -learning_rate * delta

            # update weights and biases
            self.W[layer] += dw
            self.B[layer] += db
            if verbose:
                print("\tThe weights increment are:")
                print(dw)
                print()

    def train(self, data, all_y_true, epochs=5, verbose=True):
        """Train on every sample once per epoch and return the per-epoch losses.

        Args:
            data: 2-D array with one sample per row.
            all_y_true: expected outputs, aligned with the rows of ``data``.
            epochs: number of passes over ``data``.
            verbose: when True, print every sample, prediction, weight state
                and weight increment; the header and the epoch losses are
                printed regardless.

        Returns:
            List with the loss over the whole data set after each epoch.
        """
        learning_rate = self.learning_rate
        print('Training the network on %s data points, for %s epochs, with learning rate = %s\n' %(len(all_y_true),epochs,learning_rate))
        Losses = []
        for epoch in range(epochs):
            for sample, y_true in zip(data, all_y_true):
                if verbose:
                    print("Training data point is: %s" %sample)
                    print("Expected output is: %s" %y_true)
                sample = sample.reshape(1, len(sample))
                y_pred = self.feedforward(sample)
                if verbose:
                    print("Predicted output is: %s" %y_pred)
                    print("\tWeights are:")
                    print(self.W)
                self.backprop(y_true, y_pred, learning_rate, verbose)
            # Calculate total loss at the end of each epoch
            predictions = np.apply_along_axis(self.feedforward, 1, data)
            L = self.loss(all_y_true, predictions)
            Losses.append(L)
            print("Epoch %d loss: %.3f\n" % (epoch+1, L))
        print()
        return Losses

    
class Activation:
    """Namespace of element-wise activation functions and their derivatives.

    Every function is a static method, so it can be called either on the class
    (``Activation.sigmoid(x)``) or on an instance.
    """

    @staticmethod
    def sigmoid(x):
        """Logistic function 1 / (1 + exp(-x))."""
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def deriv_sigmoid(x):
        """Derivative of the logistic function, evaluated at ``x``."""
        fx = Activation.sigmoid(x)
        return fx * (1 - fx)

    @staticmethod
    def reLU(x):
        """Rectified linear unit: max(0, x) element-wise."""
        return np.maximum(0, x)

    @staticmethod
    def deriv_reLU(x):
        """Derivative of reLU: 1 where ``x > 0`` and 0 elsewhere (including x == 0).

        Works for arrays of any shape and returns an array of the same shape
        and dtype as ``x``.
        """
        x = np.asarray(x)
        return (x > 0).astype(x.dtype)

    @staticmethod
    def softmax(x):
        """Softmax over the last axis of ``x``.

        Normalising over the last axis makes each row of a (rows, n) array sum
        to 1, which matches the row-vector layout of the layer outputs; a 1-D
        input is normalised over all its entries. The maximum is subtracted
        first so that large inputs do not overflow ``exp``; this does not
        change the result.
        """
        x = np.asarray(x, dtype=float)
        exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exps / np.sum(exps, axis=-1, keepdims=True)

# Toy data set: two features per person and a binary label
data = np.array([
    [-2, -1],    # Alice
    [25, 6],     # Bob
    [17, 4],     # Charlie
    [-15, -6],   # Diana
])
all_y_true = np.array([
    1,  # Alice
    0,  # Bob
    0,  # Charlie
    1,  # Diana
])

# Train one network per weight-initialisation scheme. The generator is
# re-seeded before each construction so a re-run reproduces the same curves.
SEED = 0
losses_by_init = {}
for label, init in (('Ones', 'ones'), ('Normal', 'normal'), ('truncated normal', 'truncated_normal')):
    np.random.seed(SEED)
    network = Network(2, 1, 1, 2, init)
    losses_by_init[label] = network.train(data, all_y_true)

# Plot training
loss_fig, loss_ax = plt.subplots(figsize=(15, 10))
for label, losses in losses_by_init.items():
    loss_ax.plot(losses, label=label)
loss_ax.set_title('Training loss by weight initialisation')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('MSE Loss')
loss_ax.legend()
plt.show()

# Make some predictions with the last network trained above (truncated normal).
# feedforward returns a (1, 1) array; .item() extracts the scalar explicitly
# instead of relying on implicit array-to-float conversion in the % format.
emily = np.array([-7, -3]) # 128 pounds, 63 inches
frank = np.array([20, 2])  # 155 pounds, 68 inches
print("Emily: %.3f" % network.feedforward(emily).item()) # reference value noted by the author: 0.951 - F
print("Frank: %.3f" % network.feedforward(frank).item()) # reference value noted by the author: 0.039 - M

# Toy data set: two features per person and a binary label
data = np.array([
    [-2, -1],    # Alice
    [25, 6],     # Bob
    [17, 4],     # Charlie
    [-15, -6],   # Diana
])
all_y_true = np.array([
    1,  # Alice
    0,  # Bob
    0,  # Charlie
    1,  # Diana
])

# Train one network per learning rate. Defaults are selected with keyword
# arguments: `_` is IPython's "last output" variable, not a placeholder, so
# passing it positionally feeds whatever the previous cell returned into
# `init` / `activation`. Re-seeding before each construction gives every run
# the same initial weights, so the curves differ only by the learning rate.
SEED = 0
losses_by_rate = {}
for rate in (1, 0.1, 0.01):
    np.random.seed(SEED)
    network = Network(2, 1, 1, 2, learning_rate=rate)
    losses_by_rate[str(rate)] = network.train(data, all_y_true)

# Plot training
loss_fig, loss_ax = plt.subplots(figsize=(15, 10))
for label, losses in losses_by_rate.items():
    loss_ax.plot(losses, label=label)
loss_ax.set_title('Training loss by learning rate')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('MSE Loss')
loss_ax.legend()
plt.show()

# Make some predictions with the last network trained above (learning rate 0.01).
# feedforward returns a (1, 1) array; .item() extracts the scalar explicitly
# instead of relying on implicit array-to-float conversion in the % format.
emily = np.array([-7, -3]) # 128 pounds, 63 inches
frank = np.array([20, 2])  # 155 pounds, 68 inches
print("Emily: %.3f" % network.feedforward(emily).item()) # reference value noted by the author: 0.951 - F
print("Frank: %.3f" % network.feedforward(frank).item()) # reference value noted by the author: 0.039 - M

In [2]:
# Toy data set: two features per person and a binary label
data = np.array([
    [-2, -1],    # Alice
    [25, 6],     # Bob
    [17, 4],     # Charlie
    [-15, -6],   # Diana
])
all_y_true = np.array([
    1,  # Alice
    0,  # Bob
    0,  # Charlie
    1,  # Diana
])

# Train a reLU network with the default initialisation. The activation is
# passed by keyword: `_` is IPython's "last output" variable, so using it as a
# positional placeholder for `init` depends on whatever ran before this cell.
network = Network(2, 1, 1, 2, activation='reLU')
# Last expression of the cell: the per-epoch losses are shown as the cell output
network.train(data, all_y_true)


Training the network on 4 data points, for 5 epochs, with learning rate = 0.1

Training data point is: [-2 -1]
Expected output is: 1
Predicted output is: [[13.90215459]]
	Weights are:
[array([[-0.23179511, -2.3163552 ],
       [-1.24134263,  0.47639697]]), array([[-0.23173698],
       [ 2.87462014]])]
	The weights increment are:
[[ -0.20899093]
 [-13.37962626]]

	The weights increment are:
[[-1.19596253 14.83551735]
 [-0.59798127  7.41775868]]

Training data point is: [25  6]
Expected output is: 0
Predicted output is: [[0.]]
	Weights are:
[array([[-1.42775764, 12.51916215],
       [-1.8393239 ,  7.89415565]]), array([[ -0.4407279 ],
       [-10.50500613]])]
	The weights increment are:
[[0.]
 [0.]]

	The weights increment are:
[[0. 0.]
 [0. 0.]]

Training data point is: [17  4]
Expected output is: 0
Predicted output is: [[0.]]
	Weights are:
[array([[-1.42775764, 12.51916215],
       [-1.8393239 ,  7.89415565]]), array([[ -0.4407279 ],
       [-10.50500613]])]
	The weights increment are

[0.5, 0.5, 0.5, 0.5, 0.5]

# Toy data set: two features per person and a binary label
data = np.array([
    [-2, -1],    # Alice
    [25, 6],     # Bob
    [17, 4],     # Charlie
    [-15, -6],   # Diana
])
all_y_true = np.array([
    1,  # Alice
    0,  # Bob
    0,  # Charlie
    1,  # Diana
])

# Train one network per activation function with learning rate 1. Arguments
# are passed by keyword: `_` is IPython's "last output" variable (here it may
# be the list of losses returned by the previous cell), not a placeholder for
# the default `init`. Re-seeding before each construction gives both networks
# the same initial weights.
SEED = 0
losses_by_activation = {}
for activation in ('sigmoid', 'reLU'):
    np.random.seed(SEED)
    network = Network(2, 1, 1, 2, activation=activation, learning_rate=1)
    losses_by_activation[activation] = network.train(data, all_y_true)

# Plot training
loss_fig, loss_ax = plt.subplots(figsize=(15, 10))
for label, losses in losses_by_activation.items():
    loss_ax.plot(losses, label=label)
loss_ax.set_title('Training loss by activation function')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('MSE Loss')
loss_ax.legend()
plt.show()

# Plot reLU and its derivative on [-5, 5]
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
xs = np.linspace(-5, 5, 101)
ax[0].plot(xs, Activation.reLU(xs))
ax[0].set_title('reLU')
ax[0].set_xlabel('x')
ax[1].plot(xs, Activation.deriv_reLU(xs))
ax[1].set_title('Derivative of reLU')
ax[1].set_xlabel('x')
plt.show()