In [0]:
%pycat run_tests.py

In [32]:
%%writefile run_tests.py


__author__ = "Xenia Ioannidou"

from k_layer_network import NeuralNetwork
from preprocessor import *
import numpy as np
from build_tests import *


def main():

    tester = Tests()

    # EXERCISE 1a: Checking gradients for 3-layers network
    # tester.exercise_1a()

    # EXERCISE 1b: Checking gradients for 9-layers network
    # tester.exercise_1b()
    # *********************************************************

    # EXERCISE 2: Evolution of the loss for the 3-layer network
    # tester.exercise_2()

    # EXERCISE 3: Evolution of the loss for the 6-layer network
    # tester.exercise_3()
    # *********************************************************

    # EXERCISE 4: Coarse search for lambda value in 3-layer network
    # tester.coarse_search()
    # *********************************************************

    # EXERCISE 5: Fine search for lambda value in 3-layer network
    # tester.fine_search()
    # *********************************************************

    # EXERCISE 6: Sensitivity to initialization in 3-layer network
    # tester.check_sensitivity(True, 1e-1, "BN_1e1")
    # tester.check_sensitivity(False, 1e-1,"noBN_1e1")
    # tester.check_sensitivity(True, 1e-3, "BN_1e3")
    # tester.check_sensitivity(False, 1e-3,"noBN_1e3")
    tester.check_sensitivity(True, 1e-5, "BN_1e5")
    tester.check_sensitivity(False, 1e-5,"noBN_1e5")


if __name__ == "__main__":
    main()

Overwriting run_tests.py


In [0]:
%pycat preprocessor.py

In [6]:
%%writefile preprocessor.py

from collections import OrderedDict

__author__ = "Xenia Ioannidou"

import numpy as np
import pickle

num_of_classes = 10
directory = "//content//drive//My Drive//Colab_Deep_Learning_Lab3/"
file_labels = "//content//drive//My Drive//Colab_Deep_Learning_Lab3//batches.meta"


class Preprocessor():

  def load_batch(self, directory, extension):
      """
          dataset:
              60.000 32x32 colour images in 10 classes, with 6.000 images per class

              5 training batches: 1 batch has 10.000 images, so 50.000 training images and
              5 training batches: 5000 images from each class, 10 classes, so 50.000 training images
              1 test batch: 10.000 test images
              1 image is 32x32x3

          labels:
              a list of 10.000 numbers in the range 0-9
              the number in position i is the label of the i_th image in the array data

          A.data:
              an array 10.000 rows and 3072 columns (32x32x3), where 1 image is one row in A.data

          return:
              the image and label data in separate files
      """

      file = directory + extension

      data = self.unpickle_file(file)

      X, Y, y = self.preprocess_data(data, num_of_classes)

      return X, Y, y


  def unpickle_file(self, filename):

      # serialise the object before writing it to file:
      # convert it into a character stream
      with open(filename, 'rb') as file_opened:
          file_data = pickle.load(file_opened, encoding='bytes')

      return file_data


  def preprocess_data(self, file_data, num_of_classes):
      """
          Args:

              param file_data: normalizing between [0,1]
              X_data: contains the image pixel data which is d x N = (3072, 10000)
              y: vector of length N = 10.000 containing the label for each image
              Y is KxN (K = 10) and contains the one-hot representation of the label for each image

          Return:

              data and labels
      """

      # normalization with respect to the mean and std
      # 1 column in X is 1 image, so mean and std is per column
      X_data = (file_data[b"data"]).T
      X_mean = np.mean(X_data, axis=1, keepdims=True)
      X_stdev = np.std(X_data, axis=1, keepdims=True)

      X_data = (X_data - X_mean) / X_stdev

      # the label for each image - vector of length N
      y = np.asarray(file_data[b"labels"])

      # one-hot representation of the label for each image
      Y = (np.eye(num_of_classes)[y]).T

      return X_data, Y, y

  def build_dataset(self):
      X_train, Y_train, y_train = self.load_batch(directory, "/data_batch_1")
      X_val, Y_val, y_val = self.load_batch(directory, "/data_batch_2")
      X_test, Y_test, y_test = self.load_batch(directory, "/test_batch")

      print("X_train shape = ", X_train.shape)
      print("Y_train shape = ", Y_train.shape)

      print("X_val shape = ", X_val.shape)
      print("Y_val shape = ", Y_val.shape)

      print("X_test shape = ", X_test.shape)
      print("Y_test shape = ", Y_test.shape)

      return X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test

  def train_one_batch(self):
      """
          Build datasets where the training data consists of 10,000 images.

          Returns:
              all the separate data sets and labels (list) with correct image labels
      """
      X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test = self.build_dataset()

      labels = self.unpickle_file(file_labels)[b'label_names']

      return X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, labels

  def find_parameters(self, X, Y):
      K = np.shape(Y)[0]
      D = np.shape(X)[0]
      N = np.shape(X)[1]

      labels = self.unpickle_file(file_labels)[b'label_names']

      C = len(labels)

      return K, D, N, C, labels

  def make_layers(self, shapes, activations):
      """Create the layers of the network
      Args:
          shapes      (list): the shapes per layer as tuples
          activations (list): the activation functions per layer as strings
      Returns:
          layers (OrderedDict): specifies the shape and activation function of
          each layer
      """
      if len(shapes) != len(activations):
          raise RuntimeError('The size of shapes should equal the size of activations.')

      layers = OrderedDict([])

      for i, (shape, activation) in enumerate(zip(shapes, activations)):
          layers["layer%s" % i] = {"shape": shape, "activation": activation}
          print("Layer ", i, " = " , {"shape": shape, "activation": activation})        

      return layers      
  

Overwriting preprocessor.py


In [0]:
%pycat build_tests.py

In [27]:
%%writefile build_tests.py

__author__ = "Xenia Ioannidou"

import statistics

from k_layer_network import NeuralNetwork
from preprocessor import Preprocessor
import numpy as np
import random

file_labels = "//content//drive//My Drive//Colab_Deep_Learning_Lab3//batches.meta"

class Tests():

    def exercise_1a(self):
      print("Checking gradients for 3-layers network")
      shapes=[(50, 30), (50, 50), (10, 50)]
      acts=["relu", "relu", "softmax"]
      self.checking_grads_with_BN(shapes, acts)
    
    def exercise_1b(self):
      print("\nChecking gradients for 9-layers network")
      shapes=[(50, 30), (30, 50), (20, 30), (20, 20), (10, 20),
                  (10, 10), (10, 10), (10, 10), (10, 10)]
      acts=["relu", "relu", "relu", "relu", "relu", "relu",
                  "relu", "relu", "softmax"]
      self.checking_grads_with_BN(shapes, acts)

    def exercise_2(self):
      shapes=[(50, 3072), (50, 50), (10, 50)]
      activations=["relu", "relu", "softmax"]

      # self.training(shapes, activations, batch_norm=False) 
      self.training(shapes, activations, batch_norm=True) 


    def exercise_3(self):
      shapes=[(50, 3072), (30, 50), (20, 30), (20, 20), (10, 20),(10, 10)]
      activations=["relu", "relu", "relu","relu", "relu", "softmax"]

      self.training(shapes, activations, batch_norm=False) 
      self.training(shapes, activations, batch_norm=True)
    
    def check_sensitivity(self, with_BN, std, plt_id):

      if with_BN:
        print("START CHECKING SENSITIVITY TO INITIALIZATION WITH BN")
      else:
        print("START CHECKING SENSITIVITY TO INITIALIZATION WITHOUT BN")

      # Build dataset with one batch
      preprocessor = Preprocessor()
      shapes=[(50, 3072), (50, 50), (10, 50)]
      activations=["relu", "relu", "softmax"]
      X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, labels = preprocessor.train_one_batch()
      layers = preprocessor.make_layers(shapes=shapes, activations=activations)
      random.shuffle(X_train)
      random.shuffle(X_val)
      random.shuffle(X_test)
      K, D, N, C, labels = preprocessor.find_parameters(X_train[:30, :5], Y_train[:30, :5])
      
      # Build network
      net = NeuralNetwork(K, D, N, C, labels, layers, std=std, batch_norm=with_BN)

      # Train network
      acc_train, acc_val, acc_test = net.mini_batch_gd(
                  X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, 
                  Y_test,y_test,labda=0.021,batch_s=100,learning_rate_min=1e-5, 
                  learning_rate_max=1e-1,stepsize=2250,n_epochs=20,
                  plot_id=plt_id, verbose=True)

    def coarse_search(self):
      print("STARTING COARSE SEARCH")

      # Build dataset with one batch
      preprocessor = Preprocessor()
      shapes=[(50, 3072), (50, 50), (10, 50)]
      activations=["relu", "relu", "softmax"]
      X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, labels = preprocessor.train_one_batch()
      layers = preprocessor.make_layers(shapes=shapes, activations=activations)
      K, D, N, C, labels = preprocessor.find_parameters(X_train[:30, :5], Y_train[:30, :5])
      
      # Build network
      net = NeuralNetwork(K, D, N, C, labels, layers, batch_norm=True)

      # Coarse random search of the lambda
      lamda_list = []
      while len(lamda_list) < 30:
          x = np.random.uniform(1e-5, 1e-1)
          lamda_list.append(x)
      
      for lamda_i in lamda_list:
            print("Lambda = ", lamda_i)

            # Train network
            acc_train, acc_val, acc_test = net.mini_batch_gd(
                  X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, 
                  Y_test,y_test,labda=lamda_i,batch_s=100,learning_rate_min=1e-5, 
                  learning_rate_max=1e-1,stepsize=2250,n_epochs=20,verbose=True)

            print("___________________________")

    def fine_search(self):
      print("STARTING FINE SEARCH")

      # Build dataset with one batch
      preprocessor = Preprocessor()
      shapes=[(50, 3072), (50, 50), (10, 50)]
      activations=["relu", "relu", "softmax"]
      X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, labels = preprocessor.train_one_batch()
      layers = preprocessor.make_layers(shapes=shapes, activations=activations)
      K, D, N, C, labels = preprocessor.find_parameters(X_train[:30, :5], Y_train[:30, :5])
      
      # Build network
      net = NeuralNetwork(K, D, N, C, labels, layers, batch_norm=True)

      # Coarse random search of the lambda
      lamda_list = []
      while len(lamda_list) < 30:
          x = np.random.uniform(0.01, 0.1)
          lamda_list.append(x)
      
      for lamda_i in lamda_list:
            print("Lambda = ", lamda_i)

            # Train network
            acc_train, acc_val, acc_test = net.mini_batch_gd(
                  X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, 
                  Y_test,y_test,labda=lamda_i,batch_s=100,learning_rate_min=1e-5, 
                  learning_rate_max=1e-1,stepsize=2250,n_epochs=10, verbose=True)

            print("___________________________")


    def checking_grads_with_BN(self, shapes, acts):
      preprocessor = Preprocessor()
      X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, labels = preprocessor.train_one_batch()
      layers = preprocessor.make_layers(shapes=shapes, activations=acts)
      K, D, N, C, labels = preprocessor.find_parameters(X_train[:30, :5], Y_train[:30, :5])
      
      net = NeuralNetwork(K, D, N, C, labels, layers, batch_norm=True)

      grads_ana = net.compute_gradients(X_train[:30, :5], Y_train[:30, :5],
                                        labda = 0)

      grads_num = net.compute_gradients_num(X_train[:30, :5], Y_train[:30, :5],
                                          labda=0)

      net.check_gradients(grads_ana, grads_num)

    def training(self, shapes, activations, batch_norm, labda=0.021):
        """Train a model"""

        preprocessor = Preprocessor()
        X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, labels = preprocessor.train_one_batch()
        layers = preprocessor.make_layers(shapes=shapes, activations=activations)
        K, D, N, C, labels = preprocessor.find_parameters(X_train[:30, :5], Y_train[:30, :5])

        acc_train_set = []
        acc_val_set = []
        acc_test_set = []
        for j in range(10):
          print("\nLoop: ", j)
          net = NeuralNetwork(K, D, N, C, labels, layers, batch_norm=batch_norm)
          acc_train, acc_val, acc_test = net.mini_batch_gd(
                  X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, 
                  Y_test, y_test, labda=labda, batch_s=100, learning_rate_min=1e-5, 
                  learning_rate_max=1e-1, stepsize=2250, n_epochs=10, verbose=True)

          acc_train_set.append(acc_train)
          acc_val_set.append(acc_val)
          acc_test_set.append(acc_test)

        print("\nTrain mean accuracy:" + str(statistics.mean(acc_train_set)))
        print("Val mean accuracy:" + str(statistics.mean(acc_val_set)))
        print("Test mean accuracy:" + str(statistics.mean(acc_test_set)))
        print("Train stdev accuracy:" + str(statistics.stdev(acc_train_set)))
        print("Val stdev accuracy:" + str(statistics.stdev(acc_val_set)))
        print("Test stdev accuracy:" + str(statistics.stdev(acc_test_set)))

Overwriting build_tests.py


In [0]:
%pycat k_layer_network.py

In [34]:
%%writefile k_layer_network.py
import pickle
import matplotlib.pyplot as plt
import numpy as np
import unittest
import statistics
import re
import csv
from collections import OrderedDict
import scipy.misc

np.seterr(all='warn')
np.seterr(divide='ignore', invalid='ignore')

class NeuralNetwork():
    
    def __init__(self,K,D,N,C,labels,layers,std=1e-1,alpha=0.8,batch_norm=True):
        self.K = K
        self.D = D
        self.N = N
        self.C = C
        self.labels     = labels
        self.layers     = layers
        self.k          = len(layers) - 1
        self.alpha      = alpha
        self.batch_norm = batch_norm
        self.mu_av, self.var_av = [], []

        if self.batch_norm:
          print("Running Model with Batch normalization")
        else:
          print("Running Model without Batch normalization")

        self.activation_funcs = {'relu': self.compute_relu, 'softmax': self.compute_softmax}

        self.W, self.b, self.gamma, self.beta = [], [], [], []
        self.activations = []

        for layer in layers.values():
            for key, val in layer.items():
                if key == "shape":
                    W      = np.random.normal(0, std, size=(val[0], val[1]))
                    b      = np.zeros(val[0]).reshape(val[0], 1)
                    gamma  = np.ones((val[0], 1))
                    beta   = np.zeros((val[0], 1))
                    mu_av  = np.zeros((val[0], 1))
                    var_av = np.zeros((val[0], 1))
                    self.W.append(W)
                    self.b.append(b)
                    self.gamma.append(gamma)
                    self.beta.append(beta)
                    self.mu_av.append(mu_av)
                    self.var_av.append(var_av)
                elif key == "activation":
                    self.activations.append((val, self.activation_funcs[val]))

        if self.batch_norm:
            self.params = {"W": self.W, "b": self.b, "gamma": self.gamma,
                    "beta": self.beta}
        else:
            self.params = {"W": self.W, "b": self.b}


    def compute_softmax(self, x):
        e = x - np.max(x)
        return np.exp(e) / np.sum(np.exp(e), axis=0)


    def compute_relu(self, x):
        result = np.maximum(x, 0)
        return result

    def batch_normalize(self, s, mu, var, epsilon):
      s_norm = np.zeros(np.shape(s))

      a = var + epsilon
      a = np.power(a, (-1 / 2))
      a = np.diag(a)

      s_norm = (s - mu) / np.sqrt(a)

      return s_norm
    
    def compute_ce_loss(self, Y, P):
      """
          compute cross entropy loss
      """

      p_one_hot = np.sum(np.prod((np.array(Y), P), axis=0), axis=0)
      loss = np.sum(0 - np.log(p_one_hot))

      return loss

    def compute_cost(self, X, Y, labda):
      """Find the cost of the neural network

            X    : array with (D, N) dimensions
            Y    : one-hot encoding labels array (C, N)
            lamda: the regularization term
      """
      n = X.shape[1] # number of images

      if self.batch_norm:
        H, P, S, S_hat, means, variances = self.evaluate_classifier(X)
      else:
        H, P = self.evaluate_classifier(X)

      # cross entropy loss function
      loss = self.compute_ce_loss(Y, P) / n

      # regularization term
      regularization = 0
      for W_i in self.W:
          regularization += (np.sum(np.square(W_i)))

      # compute cost
      cost = loss + labda * regularization

      return cost, loss


    def compute_accuracy(self, X, y, is_testing=False):
        """ Computes the accuracy of the classifier"""
        
        if self.batch_norm:
            P = self.evaluate_classifier(X, is_testing=is_testing)
            predictions = np.argmax(P[1], axis=0)
        else:
            P = self.evaluate_classifier(X)
            predictions = np.argmax(P[1], axis=0)
        
        corrects = np.where(predictions - np.asarray(y) == 0)
        num_of_corrects = len(corrects[0])
        accuracy = num_of_corrects / X.shape[1]

        return accuracy


    def evaluate_classifier(self, X, is_testing=False, is_training=True):
        N = X.shape[1]
        s = np.copy(X)

        if self.batch_norm:
            # Run Forward Pass of Batch Normalization
            H, P, S, S_hat, means, variances = self.forward_BN(X, is_training, is_testing)
            return H, P, S, S_hat, means, variances
        else:
            H = []
            P = []
            layer_counter = 1
            last_layer = len(self.layers)

            for W, b in zip(self.W, self.b):
                if layer_counter != last_layer:
                    s = self.compute_relu(np.dot(W,s) + b)
                    H.append(s)
                else:
                    P = self.compute_softmax(np.dot(W,s) + b)

                layer_counter += 1

            return H, P

    def forward_BN(self, X, is_training, is_testing):
      S, S_hat, means, variances, H = [], [], [], [], []
      s=np.copy(X)
      layer_counter = 1
      last_layer = len(self.layers)

      for i, (W, b, gamma, beta, mu_av, var_av) in enumerate(
              zip(self.W, self.b, self.gamma, self.beta, self.mu_av,
                  self.var_av)):

          H.append(s)
          s = np.dot(W,s) + b

          if layer_counter < last_layer:
              S.append(s)
              if is_testing:
                  s = (s - mu_av) / np.sqrt(var_av + \
                          np.finfo(np.float64).eps)

              else:
                  mu = np.mean(s, axis=1, keepdims=True)
                  means.append(mu)
                  var = np.var(s, axis=1, keepdims=True) * (self.N-1)/self.N
                  variances.append(var)

                  if is_training:
                      self.mu_av[i]  = self.alpha * mu_av + \
                              (1-self.alpha) * mu
                      self.var_av[i] = self.alpha * var_av + \
                              (1-self.alpha) * var

                  s = self.batch_normalize( s, mu, var, np.finfo(np.float64).eps)

              S_hat.append(s)
              s = self.compute_relu(np.multiply(gamma, s) + beta)

          else:
              P = self.compute_softmax(s)
          
          layer_counter += 1

      return H, P, S, S_hat, means, variances
    
    def backward_BN(self, Y_batch, H_batch, P_batch, S_batch, labda, S_hat_batch,
                    means_batch, vars_batch, N, grads):
      e = np.finfo(np.float64).eps
      last_layer = len(self.layers) - 1

      G_batch = - (Y_batch - P_batch)

      # For the last layer
      grads["W"][last_layer] = 1/N * np.dot(G_batch,H_batch[last_layer].T) + \
                    2 * labda * self.W[last_layer]
      grads["b"][last_layer] = np.reshape(1/N * np.dot(G_batch,np.ones(N)),
              (grads["b"][last_layer].shape[0], 1))
      
      G_batch = np.dot(self.W[last_layer].T, G_batch)
      H_batch[last_layer][H_batch[last_layer] <= 0] = 0
      G_batch = np.multiply(G_batch, H_batch[last_layer] > 0)

      # Loop hidden layers
      for l in range(self.k - 1, -1, -1):
          grads["gamma"][l] = np.reshape(1/N * np.dot(np.multiply(G_batch,
              S_hat_batch[l]),np.ones(N)), (grads["gamma"][l].shape[0], 1))
          grads["beta"][l]  = np.reshape(1/N * np.dot(G_batch, np.ones(N)),
                  (grads["beta"][l].shape[0], 1))

          G_batch = np.multiply(G_batch, self.gamma[l])

          G_batch = self.batch_norm_back_pass(G_batch, S_batch[l],
                  means_batch[l], vars_batch[l], e)

          grads["W"][l] = 1/N * np.dot(G_batch,H_batch[l].T) + 2 * labda * self.W[l]

          grads["b"][l] = np.reshape(1/N * np.dot(G_batch, np.ones(N)),
                                  (grads["b"][l].shape[0], 1))
          if l > 0:
              G_batch = np.dot(self.W[l].T, G_batch)
              H_batch[l][H_batch[l] <= 0] = 0
              G_batch = np.multiply(G_batch, H_batch[l] > 0)

      return grads

    def compute_gradients(self, X_batch, Y_batch, labda):
        """Analytically computes the gradients of the weight and bias parameters
        Args:
            X_batch (np.ndarray): data batch matrix (D, N)
            Y_batch (np.ndarray): one-hot-encoding labels batch vector (C, N)
            labda        (float): regularization term
        Returns:
            grads (dict): the updated analytical gradients
        """
        N = X_batch.shape[1]
        e = np.finfo(np.float64).eps

        if self.batch_norm:
            grads = {"W": [], "b": [], "gamma": [], "beta": []}

            # Initialize networks parameters
            for key in self.params:
                for par in self.params[key]:
                    grads[key].append(np.zeros_like(par))

            # Forward pass
            H_batch, P_batch, S_batch, S_hat_batch, means_batch, vars_batch = \
                    self.evaluate_classifier(X_batch)

            # Backward pass
            grads = self.backward_BN(Y_batch, H_batch, P_batch, S_batch, labda, \
                                S_hat_batch, means_batch, vars_batch, N, grads)

        else:
            grads = {"W": [], "b": []}
            for W, b in zip(self.W, self.b):
                grads["W"].append(np.zeros_like(W))
                grads["b"].append(np.zeros_like(b))

            # Forward pass
            H_batch, P_batch = self.evaluate_classifier(X_batch)

            # Backward pass
            G_batch = - (Y_batch - P_batch)

            # Loop layers
            for l in range(len(self.layers) - 1, 0, -1):
                grads["W"][l] = 1/N * np.dot(G_batch, H_batch[l-1].T) \
                                    + 2 * labda * self.W[l]
                grads["b"][l] = np.reshape(1/N * np.dot(G_batch, np.ones(N)),
                        (grads["b"][l].shape[0], 1))

                G_batch = np.dot(self.W[l].T, G_batch)
                H_batch[l-1][H_batch[l-1] <= 0] = 0
                G_batch = np.multiply(G_batch, H_batch[l-1] > 0)

            grads["W"][0] = ((np.dot(G_batch, np.transpose(X_batch))) / N) \
                              + labda * self.W[0]
            grads["b"][0] = np.reshape(np.asarray((np.dot(G_batch, np.ones(N))))/N,\
                                       self.b[0].shape)
        return grads


    def batch_norm_back_pass(self, G_batch, S_batch, mean_batch, var_batch, e):
        """Computation of the batch normalization back pass
        Args:
            G_batch    : gradients of the batch
            S_batch    : linear transformations of the batch
            mean_batch : mean vectors of the batch
            var_bath   : variance vectors of the batch
        Returns:
            G_batch : batch normalized gradients
        """
        N = G_batch.shape[1]
        sigma1 = np.power(var_batch + e, -0.5) 
        sigma2 = np.power(var_batch + e, -1.5) 

        G1 = np.multiply(G_batch, sigma1)
        G2 = np.multiply(G_batch, sigma2)

        D = S_batch - mean_batch

        c = np.sum(np.multiply(G2, D), axis=1, keepdims=True)

        G_batch = G1 - 1/N * np.sum(G1, axis=1, keepdims=True) - \
                1/N * np.multiply(D, c)

        return G_batch


    def compute_gradients_num(self, X_batch, Y_batch, size=2,
            labda=np.float64(0), h=np.float64(1e-7)):
        """Numerically computes the gradients of the weight and bias parameters
        Args:
            X_batch : data batch matrix (D, N)
            Y_batch : one-hot-encoding labels batch vector (C, N)
            W       : the weight matrix
            b       : the bias matrix
            labda   : penalty term
            h       : marginal offset
        Returns:
            grads  (dict): the numerically gradients
        """
        if self.batch_norm:
            grads = {"W": [], "b": [], "gamma": [], "beta": []}
        else:
            grads = {"W": [], "b": []}

        for j in range(len(self.b)):
            for key in self.params:
                grads[key].append(np.zeros(self.params[key][j].shape))
                for i in range(len(self.params[key][j].flatten())):
                    old_par = self.params[key][j].flat[i]
                    self.params[key][j].flat[i] = old_par + h
                    _, c2 = self.compute_cost(X_batch, Y_batch, labda)
                    self.params[key][j].flat[i] = old_par - h
                    _, c3 = self.compute_cost(X_batch, Y_batch, labda)
                    self.params[key][j].flat[i] = old_par
                    grads[key][j].flat[i] = (c2-c3) / (2*h)

        return grads


    def check_gradients(self, grads_analyt, grads_num):
        """Compute the relative error between analytical and numerical grads"""

        layers = len(self.layers)

        for l in range(layers):
            for key in grads_analyt:
                num = abs(grads_analyt[key][l].flat[:] - grads_num[key][l].flat[:])
                denominator = np.asarray([max(abs(a), abs(b)) + 1e-10 for a,b in
                    zip(grads_analyt[key][l].flat[:], grads_num[key][l].flat[:])])
                max_rel_err = max(num / denominator)
                print("The relative error for layer %d %s: %.6g" %
                        (l+1, key, max_rel_err))


    def visualization_per_epoch(self, n_epochs,arg1,arg2,title,y_label,min,max):
       """ Plots 
        
            n_epochs       (int): number of training epochs
            i_train (np.ndarray): input to plot per epoch on the training set
            i_val   (np.ndarray): input to plot per epoch on the validation set
            title          (str): plot title
            y_label        (str): y-axis label
       """
       epochs = np.arange(n_epochs)

       fig, ax = plt.subplots(figsize=(10, 8))
       ax.plot(epochs, arg1, label="Training set")
       ax.plot(epochs, arg2, label="Validation set")
       ax.legend()
       ax.set(xlabel='Number of epochs', ylabel = y_label)
       ax.set_ylim([min, max])
      #  ax.set_xlim([0, 8])
       ax.grid()

       plt.savefig("plots/" + title + ".png", bbox_inches="tight")


    def cyclical_learning_rate(self, learning_rate, stepsize, learning_rate_min, learning_rate_max, t):

        if t <= stepsize:
            learning_rate = learning_rate_min + t / stepsize * (learning_rate_max - learning_rate_min)

        elif t <= 2 * stepsize:
            learning_rate = learning_rate_max - (t - stepsize) / stepsize * (
                        learning_rate_max - learning_rate_min)

        return learning_rate


    def mini_batch_gd(self, X, Y, y, X_val, Y_val, y_val, X_test, Y_test, y_test, 
                      labda=0, batch_s=100, learning_rate_min=1e-5, 
                      learning_rate_max=1e-1, stepsize=800, n_epochs=40, 
                      plot_id="", verbose=True, plot=True,is_testing=True, is_training=True):
      """
            TRAIN the model using mini-batch gradient descent
      """
    
      train_cost = []
      train_loss = []
      train_accuracy = []
      val_cost = []
      val_loss = []
      val_accuracy = []
      test_cost = []
      test_loss = []
      test_accuracy = []

      n_batch = int(np.floor(X.shape[1] / batch_s))
      learning_rate_current = learning_rate_min
      N = np.shape(X)[1]
      images_per_batch = int(N / n_batch)
      t = 0

      #  Generate the set of mini - batches and do the Gradient Descent
      for epoch in range(n_epochs):
        for batch in range(n_batch):
              j_start = (batch) * images_per_batch
              j_end = (batch+1) * images_per_batch

              X_batch = X[:, j_start:j_end]
              Y_batch = Y[:, j_start:j_end]

              grads = self.compute_gradients(X_batch, Y_batch, labda)

              for key in self.params:
                  for par, grad in zip(self.params[key], grads[key]):
                      par -= learning_rate_current * grad

              # Apply cyclical learning rates
              learning_rate_current = self.cyclical_learning_rate(learning_rate_current,
                            stepsize, learning_rate_min, learning_rate_max, t)

              t = (t+1) % (2*stepsize)

        # TRAINING
        # compute accuracy per epoch and save the results in list
        epoch_accuracy = self.compute_accuracy(X, y)
        train_accuracy.append(epoch_accuracy)

        # compute cost per epoch and save the results in list
        epoch_cost, epoch_loss = self.compute_cost(X, Y, labda)
        train_cost.append(epoch_cost)
        train_loss.append(epoch_loss)

        # CROSS VALIDATION
        # compute accuracy per epoch and save the results in list
        val_epoch_accuracy = self.compute_accuracy(X_val, y_val)
        val_accuracy.append(val_epoch_accuracy)

        # compute cost per epoch and save the results in list
        val_epoch_cost, val_epoch_loss = self.compute_cost(X_val, Y_val, labda)
        val_cost.append(val_epoch_cost)
        val_loss.append(val_epoch_loss)

        # TEST
        # compute accuracy per epoch and save the results in list
        test_epoch_accuracy = self.compute_accuracy(X_test, y_test)
        test_accuracy.append(test_epoch_accuracy)

        # compute cost per epoch and save the results in list
        test_epoch_cost, test_epoch_loss = self.compute_cost(X_val, Y_val, labda)
        test_cost.append(test_epoch_cost)
        test_loss.append(test_epoch_loss)

      print("Validation cost = ", val_cost)
      print("Validation loss = ", val_loss )
      print("Training cost = ", train_cost)
      print("Training loss = ", train_loss )

      if plot:
          self.visualization_per_epoch(n_epochs, train_cost, val_cost, plot_id + "_cost_plot",
                    y_label="Cost", min = 0, max = 12)
          
          self.visualization_per_epoch(n_epochs, train_loss, val_loss, plot_id + "_loss_plot",
                    y_label="Loss", min = 0, max = 4)
          

      acc_train = self.compute_accuracy(X, y)
      acc_val   = self.compute_accuracy(X_val, y_val)
      acc_test  = self.compute_accuracy(X_test, y_test, is_testing=True)

      if verbose:
          print("The accuracy on the training set is: "   + str(acc_train))
          print("The accuracy on the validation set is: " + str(acc_val))
          print("The accuracy on the testing set is: "    + str(acc_test))

      return acc_train, acc_val, acc_test



Overwriting k_layer_network.py


In [33]:
!python run_tests.py

START CHECKING SENSITIVITY TO INITIALIZATION WITH BN
X_train shape =  (3072, 10000)
Y_train shape =  (10, 10000)
X_val shape =  (3072, 10000)
Y_val shape =  (10, 10000)
X_test shape =  (3072, 10000)
Y_test shape =  (10, 10000)
Layer  0  =  {'shape': (50, 3072), 'activation': 'relu'}
Layer  1  =  {'shape': (50, 50), 'activation': 'relu'}
Layer  2  =  {'shape': (10, 50), 'activation': 'softmax'}
Running Model with Batch normalization
Validation cost =  [2.2728257836058052, 2.28020441010398, 2.298484302052044, 2.329112465485785, 2.357267598634814, 2.3895846611500584, 2.481161554111369, 2.526067215616488, 2.594899968982956, 2.6200768199251607, 2.722719741505192, 2.709310766961196, 2.781001215149464, 2.760049104779953, 2.72060401334313, 2.6967357868854536, 2.77111106001834, 2.7673877323038703, 2.7592425781126666, 2.757058128532006]
Validation loss =  [2.272299020493205, 2.2756439739890966, 2.285645181073142, 2.3047076620179867, 2.316722872088379, 2.32726546814844, 2.392977290308822, 2.41103