In [1]:
import numpy as np

# Feed-Forward Neural Network in NumPy

This notebook walks through the process of constructing a feed-forward neural network for multi-class classification solely using NumPy.

## Layers

For our neural network, we want to abstract away from individual neurons and focus on layers. Each element of the network will be defined by a certain layer.

### Base Layer

The abstract base layer ensures that every method the `Sequential` model calls exists on each layer.

In [27]:
class Layer:
    """Common parent for every layer of the network.

    The class body is intentionally empty here; shared methods are attached to
    it in later cells.
    """

For now, the only method the base class has to provide is `sgd_step`: layers without weights (e.g. activation layers) have nothing to update, so they can share a default implementation that does nothing.

In [31]:
    def sgd_step(self, lrate=None):
        """Default gradient-descent step for layers that have no weights.

        `Sequential.sgd_step` calls `m.sgd_step(lrate)` on every module, so this
        default has to accept the learning rate even though it does nothing
        with it. Without the parameter, any weight-free layer (e.g. an
        activation layer) raises a TypeError during training.

        Args:
            lrate (float, optional): Learning rate passed by the caller.
                Ignored, because there are no weights to update.

        """
        pass

# Attach the function defined above to the Layer class as its `sgd_step`
# method, so every subclass inherits it unless it defines its own.
Layer.sgd_step = sgd_step

### Linear Layer

This is the simplest layer that makes up the majority of our neural network.

In [32]:
class Linear(Layer):
    """Fully-connected linear layer.

    Declared empty here; its methods are attached one at a time in the cells
    that follow.
    """

To set up this layer, we need to know the input and output dimensions ahead of time. Using this information, we randomly initialize the weight matrix and set the biases to zero.

In [34]:
    def __init__(self, m, n):
        """Initializes the layer based on input and output dimensions. 

        Note: Kernel is initialized using normal distribution with mean 0 and 
        variance 1 / m. All biases are initialized to zero.

        Args:
            m (int): Number of inputs to the layer.
            n (int): Number of outputs from the layer.

        """
        self.m, self.n = m, n

        self.W0 = np.zeros((n, 1))
        self.W = np.random.normal(0, np.sqrt(1 / m), (m, n))

# Attach the function defined above to the Linear class as its constructor.
Linear.__init__ = __init__

The `forward` method will compute the output of the layer given a set of $m$ inputs from the previous layer for a batch of size $b$.

In [35]:
    def forward(self, A):
        """Runs a batch through the layer and caches its inputs.

        Args:
            A (ndarray): An m by b matrix holding the m activations of the
                previous layer for each of the b points in the batch.

        Returns:
            ndarray: The n by b matrix `W.T @ A + W0`, i.e. the layer outputs
                for the batch.

        """
        # Keep the inputs around: `backward` needs them for the weight gradient.
        self.A = A

        weighted_inputs = self.W.T @ A
        return weighted_inputs + self.W0

# Attach the function defined above to the Linear class as its `forward` method.
Linear.forward = forward

The `backward` method will compute the gradient of the loss with respect to the inputs to the layer for a batch of size $b$. Note: There is an implicit sum over all $b$ in the `dLdW` calculation.

In [36]:
    def backward(self, dLdZ):
        """Back-propagates a batch of output gradients through the layer.

        Stores the gradients of the loss with respect to the kernel (`dLdW`)
        and the biases (`dLdW0`), each summed over the batch, and returns the
        gradient with respect to the layer inputs.

        Args:
            dLdZ (ndarray): An n by b matrix with the gradient of the loss with
                respect to the layer outputs for a batch of size b.

        Returns:
            ndarray: An m by b matrix with the gradient of the loss with
                respect to the layer inputs for a batch of size b.

        """
        # The matrix product contracts over the batch axis, which sums the
        # per-point gradients.
        self.dLdW = self.A @ dLdZ.T
        self.dLdW0 = dLdZ.sum(axis=1, keepdims=True)

        input_grad = self.W @ dLdZ
        return input_grad

# Attach the function defined above to the Linear class as its `backward` method.
Linear.backward = backward

In [3]:
class Linear(Layer):
    """Fully-connected layer that computes `W.T @ A + W0`."""

    def __init__(self, m, n):
        """Creates the weights for a layer with m inputs and n outputs.

        The m by n kernel `W` is drawn from a normal distribution with mean 0
        and standard deviation sqrt(1 / m), i.e. variance 1 / m. The n by 1
        bias column `W0` starts at zero.

        Args:
            m (int): Number of inputs to the layer.
            n (int): Number of outputs from the layer.

        """
        self.m = m
        self.n = n

        self.W0 = np.zeros((n, 1))
        self.W = np.random.normal(loc=0, scale=np.sqrt(1 / m), size=(m, n))

    def forward(self, A):
        """Runs a batch through the layer and caches its inputs.

        Args:
            A (ndarray): An m by b matrix holding the m activations of the
                previous layer for each of the b points in the batch.

        Returns:
            ndarray: The n by b matrix of layer outputs for the batch.

        """
        # Keep the inputs around: `backward` needs them for the weight gradient.
        self.A = A

        weighted_inputs = self.W.T @ A
        return weighted_inputs + self.W0

    def backward(self, dLdZ):
        """Back-propagates a batch of output gradients through the layer.

        Stores the gradients of the loss with respect to the kernel (`dLdW`)
        and the biases (`dLdW0`), each summed over the batch, and returns the
        gradient with respect to the layer inputs.

        Args:
            dLdZ (ndarray): An n by b matrix with the gradient of the loss with
                respect to the layer outputs for a batch of size b.

        Returns:
            ndarray: An m by b matrix with the gradient of the loss with
                respect to the layer inputs for a batch of size b.

        """
        # The matrix product contracts over the batch axis, which sums the
        # per-point gradients.
        self.dLdW = self.A @ dLdZ.T
        self.dLdW0 = dLdZ.sum(axis=1, keepdims=True)

        input_grad = self.W @ dLdZ
        return input_grad

    def sgd_step(self, lrate):
        """Moves the kernel and biases one step against their stored gradients.

        Uses the `dLdW` and `dLdW0` values left behind by the most recent call
        to `backward`.

        Args:
            lrate (float): Learning rate that scales the gradients.

        """
        kernel_step = lrate * self.dLdW
        bias_step = lrate * self.dLdW0

        # Rebind rather than update in place, so arrays that alias the old
        # weights are left untouched.
        self.W = self.W - kernel_step
        self.W0 = self.W0 - bias_step

In [4]:
class Tanh(Layer):
    """Activation layer applying the hyperbolic tangent element-wise."""

    def forward(self, Z):
        """Applies tanh to a batch of pre-activations and caches the result.

        Args:
            Z (ndarray): An n by b matrix of pre-activations for a batch of
                size b.

        Returns:
            ndarray: An n by b matrix with tanh applied to every entry of Z.

        """
        activations = np.tanh(Z)
        # Cached because the derivative of tanh is expressed through its output.
        self.A = activations

        return activations

    def backward(self, dLdA):
        """Back-propagates the loss gradient through the activation.

        Args:
            dLdA (ndarray): An n by b matrix with the gradient of the loss with
                respect to the layer outputs for a batch of size b.

        Returns:
            ndarray: An n by b matrix with the gradient of the loss with
                respect to the layer inputs for a batch of size b.

        """
        # d tanh(z) / dz = 1 - tanh(z)^2, evaluated with the cached outputs.
        local_slope = 1 - self.A ** 2

        return local_slope * dLdA

In [5]:
class ReLU(Layer):
    """Activation layer applying max(0, z) element-wise."""

    def forward(self, Z):
        """Clamps negative pre-activations to zero and caches the result.

        Args:
            Z (ndarray): An n by b matrix of pre-activations for a batch of
                size b.

        Returns:
            ndarray: An n by b matrix holding max(0, z) for every entry of Z.

        """
        rectified = np.maximum(0, Z)
        # Cached so `backward` can tell which units let the signal through.
        self.A = rectified

        return rectified

    def backward(self, dLdA):
        """Back-propagates the loss gradient through the activation.

        Args:
            dLdA (ndarray): An n by b matrix with the gradient of the loss with
                respect to the layer outputs for a batch of size b.

        Returns:
            ndarray: An n by b matrix with the gradient of the loss with
                respect to the layer inputs for a batch of size b.

        """
        # The gradient passes unchanged wherever the cached output is non-zero
        # and is blocked everywhere else.
        passes_through = self.A != 0

        return dLdA * passes_through

In [29]:
class SoftMax(Layer):
    """Softmax activation layer."""

    def forward(self, Z):
        """Computes the softmax activation given the inputs from the previous
        layer for a single batch.

        The largest entry of each column is subtracted before exponentiating.
        Softmax is unchanged by a per-column shift, so the result is the same,
        but large inputs no longer overflow `exp` into inf / inf = nan.

        Args:
            Z (ndarray): An n by b matrix representing the inputs to the softmax
                layer for a batch of size b.

        Returns:
            ndarray: An n by b matrix of outputs from softmax for a batch of
                size b. Every column sums to 1.

        """
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        exp_shifted = np.exp(shifted)  # every entry is in (0, 1]

        self.A = exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

        return self.A

    def backward(self, dLdA):
        """Computes the gradient of the loss with respect to the inputs to the
        layer using the gradient of the loss with respect to the outputs of the
        layer for a single batch.

        For one column the softmax Jacobian is dA_j / dZ_i = A_j * (delta_ij -
        A_i). Contracting it with dLdA gives

            dLdZ_i = A_i * (dLdA_i - sum_j A_j * dLdA_j)

        which is evaluated directly here, so the n by n by b Jacobian is never
        built.

        Args:
            dLdA (ndarray): An n by b matrix representing the gradient of the loss
                with respect to the outputs for the layer for a batch of size b.

        Returns:
            ndarray: An n by b matrix representing the gradient of the loss with
                respect to the inputs of the layer for a batch of size b.

        """
        # One scalar per column: the A-weighted sum of the incoming gradients.
        weighted_sum = np.sum(self.A * dLdA, axis=0, keepdims=True)

        return self.A * (dLdA - weighted_sum)

    def class_fun(self, Ypred):  # Return class indices
        """Computes the index of maximum value given the softmax outputs from a
        layer for a single batch.

        Args:
            Ypred (ndarray): An n by b matrix representing the softmax outputs of a
                layer for a batch of size b.

        Returns:
            ndarray: A length-b vector holding, for each column of Ypred, the
                row index of its largest entry.

        """
        return np.argmax(Ypred, axis=0)

In [257]:
# Targets for a hand-check of the softmax backward pass: 3 rows by 2 columns,
# with a single 1 in each column.
Y = np.array([[1, 0],
              [0, 0],
              [0, 1]])

In [258]:
# Stand-in "activations" with the same 3 by 2 shape as Y. These are arbitrary
# integers, not real softmax outputs (the columns do not sum to 1) —
# presumably chosen so the results below are easy to check by hand.
A = np.array([[5, 4],
              [7, 9],
              [3, 2]])

In [259]:
# Reference Jacobian for column 0 of A, written out entry by entry:
# A_i * (1 - A_i) on the diagonal and -A_i * A_j off the diagonal.
dAdZ0 = np.array([[A[0,0]*(1-A[0,0]),    -A[0,0]*A[1,0],    -A[0,0]*A[2,0]],
                  [   -A[1,0]*A[0,0], A[1,0]*(1-A[1,0]),    -A[1,0]*A[2,0]],
                  [   -A[2,0]*A[0,0],    -A[2,0]*A[1,0], A[2,0]*(1-A[2,0])]])

In [260]:
dAdZ0

array([[-20, -35, -15],
       [-35, -42, -21],
       [-15, -21,  -6]])

In [289]:
# The same Jacobian built with einsum for every column of A at once. The
# result is indexed [column of A, row, column], so dAdZ[0] should equal dAdZ0.
# First term: diagonal entries; second term: off-diagonal entries.
dAdZ = np.einsum('ji,jk->ijk', A * (1 - A), np.eye(3)) \
       + np.einsum('ki,ji,jk->ijk', -A, A, np.ones((3, 3)) - np.eye(3))

In [290]:
dAdZ

array([[[-20., -35., -15.],
        [-35., -42., -21.],
        [-15., -21.,  -6.]],

       [[-12., -36.,  -8.],
        [-36., -72., -18.],
        [ -8., -18.,  -2.]]])

In [263]:
# Gradient with respect to A, using the same expression as NLL.backward
# (-Y / Ypred) with A standing in for the predictions.
dLdA = -Y / A

In [264]:
dLdA

array([[-0.2,  0. ],
       [ 0. ,  0. ],
       [ 0. , -0.5]])

In [284]:
# Chain rule for column 0 only: the reference value for the batched einsum
# version computed below.
dLdZ0 = dAdZ0 @ dLdA[:, 0:1]

In [285]:
dLdZ0

array([[4.],
       [7.],
       [3.]])

In [296]:
# Batched chain rule: for each column j, multiply dAdZ[j] by column j of dLdA.
# Column 0 of the result should match dLdZ0.
dLdZ = np.einsum('jik,kj->ij', dAdZ, dLdA)

In [297]:
dLdZ

array([[4., 4.],
       [7., 9.],
       [3., 1.]])

In [7]:
class NLL(Layer):
    """Negative log-likelihood loss layer."""

    def forward(self, Ypred, Y):
        """Computes the loss given the predicted and actual results.

        Entries where Y is zero contribute nothing to the loss, so their
        predictions are masked out before taking the logarithm. Otherwise a
        predicted probability that has underflowed to exactly 0 for a
        non-target class would yield 0 * log(0) = nan and poison the whole sum.

        Args:
            Ypred (ndarray): An n by b matrix representing the predicted results
                from the network for a batch of size b.
            Y (ndarray): An n by b matrix representing the actual expected results
                for a batch of size b.

        Returns:
            float: A scalar representing the total loss summed over all outputs
                in a batch of size b.

        """
        self.Ypred = Ypred
        self.Y = Y

        # log(1) = 0, so substituting 1 where Y == 0 leaves those terms at zero.
        safe_pred = np.where(np.not_equal(self.Y, 0), self.Ypred, 1.0)

        return -np.sum(self.Y * np.log(safe_pred))

    def backward(self):
        """Computes the gradient of the loss with respect to predicted targets for
        a single batch.

        The gradient is -Y / Ypred. Where Y is zero the gradient is zero
        whatever the prediction is, so those predictions are replaced by 1 to
        avoid 0 / 0 = nan when a predicted probability is exactly 0.

        Returns:
            ndarray: An n by b matrix representing the gradient of loss with
                respect to predicted targets for a batch of size b.

        """
        safe_pred = np.where(np.not_equal(self.Y, 0), self.Ypred, 1.0)

        return -self.Y / safe_pred

In [10]:
class Sequential:
    """Feed-forward model that chains its modules one after another."""

    def __init__(self, modules, loss):
        """Stores the modules and the loss that make up the network.

        Args:
            modules (list of Layer): Layers in the order data flows through
                them. `print_accuracy` expects the last one to provide a
                `class_fun` method.
            loss (Layer): Loss module exposing `forward(Ypred, Y)` and
                `backward()`.

        """
        self.modules = modules
        self.loss = loss

    def sgd(self, X, Y, iters=100, lrate=0.005):
        """Trains the network with stochastic gradient descent.

        Each iteration draws one training point uniformly at random, runs a
        forward and a backward pass on it, optionally reports progress, and
        then updates the weights.

        Args:
            X (ndarray): A d by n matrix representing n training data points
                each with d dimensions.
            Y (ndarray): A matrix with one column of targets per training
                point (presumably one-hot encoded labels, since columns are
                passed to the loss and to `class_fun`).
            iters (int): The number of iterations to run stochastic gradient
                descent.
            lrate (float): The step size for stochastic gradient descent.

        """
        _, n = X.shape

        for it in range(iters):
            t = np.random.randint(n)

            # Slice rather than index so the point keeps its column shape.
            Xt, Yt = X[:, t:t + 1], Y[:, t:t + 1]

            Ypred = self.forward(Xt)
            loss = self.loss.forward(Ypred, Yt)
            self.backward(self.loss.backward())

            # Reporting runs a forward pass over all of X, which overwrites the
            # activations cached in the layers; the gradients for this step
            # were already stored by `backward` above.
            self.print_accuracy(it, X, Y, loss)

            self.sgd_step(lrate)

    def forward(self, Xt):
        """Feeds a batch through every module in order.

        Args:
            Xt (ndarray): A d by b matrix of points to predict with dimension
                d and batch size b.

        Returns:
            ndarray: The output of the last module for the batch.

        """
        out = Xt
        for module in self.modules:
            out = module.forward(out)

        return out

    def backward(self, dLdA):
        """Back-propagates a loss gradient through the modules, last to first.

        Each module's `backward` receives the gradient returned by the module
        after it, which leaves the weight gradients stored on the layers ready
        for `sgd_step`.

        Args:
            dLdA (ndarray): An n by b matrix representing the gradient of the
                loss with respect to the outputs of the neural network for a
                batch of size b.

        """
        grad = dLdA
        for module in reversed(self.modules):
            grad = module.backward(grad)

    def sgd_step(self, lrate):
        """Asks every module to apply one gradient-descent update.

        Args:
            lrate (float): Learning rate for the update step.

        """
        for module in self.modules:
            module.sgd_step(lrate)

    def print_accuracy(self, it, X, Y, cur_loss, every=250):
        """Prints accuracy and loss whenever `it % every == 1`.

        Args:
            it (int): Current iteration.
            X (ndarray): A d by n matrix of n points to evaluate, each with
                d dimensions.
            Y (ndarray): Targets for the n points, one column per point, in
                the form accepted by the last module's `class_fun`.
            cur_loss (float): Current loss.
            every (int): Frequency to output statistics.

        """
        if it % every != 1:
            return

        cf = self.modules[-1].class_fun
        predicted = cf(self.forward(X))
        actual = cf(Y)
        acc = np.mean(predicted == actual)

        print('Iteration =', it, '\tAcc =', acc, '\tLoss =', cur_loss, flush=True)

In [26]:
# NOTE(review): `hard` is not defined in the visible part of this notebook —
# presumably a dataset helper returning (X, Y); confirm where it comes from.
X, Y = hard()
# 2 inputs -> two hidden layers of 10 units, each followed by ReLU -> 2 outputs
# passed through softmax, paired with the negative log-likelihood loss.
nn = Sequential([Linear(2, 10), ReLU(), 
                 Linear(10, 10), ReLU(), 
                 Linear(10, 2), SoftMax()], NLL())