<a href="https://colab.research.google.com/github/RyanSmoak/MNIST-from-scratch/blob/main/MNIST_full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [195]:
!pip install scipy



In [196]:
import numpy as np
from scipy import signal

In [197]:
class Conv2DLayer:
  '''
    This class creates the convolutional layer, the most basic and important layer in our CNN.
  '''
  def __init__(self, input_shape, filter_shape, stride, padding):
    '''
      This function initializes the convolutional layer.
      Inputs:
        - input_shape: The shape of the input structure (depth, height, width)
        - filter_shape: The shape of the filter (num_filters, depth, height, width)
        - stride: The number of steps the filter takes at each iteration
        - padding: The amount of zero padding to be added to the input structure
    '''
    self.input_shape = input_shape
    self.input_depth = input_shape[0]

    self.num_filters = filter_shape[0]
    self.filter_size = filter_shape[2]
    self.stride = stride
    self.padding = padding

    #self.output_shape = (depth, input_height - filter_size + 1, input_width - filter_size + 1)
    fan_in = input_shape[0] * filter_shape[2] * filter_shape[2]
    self.filter_shape = filter_shape
    self.filters = np.random.randn(filter_shape[0], input_shape[0], filter_shape[2], filter_shape[2])/np.sqrt(fan_in/2)
    self.biases = np.zeros(filter_shape[0])

  def zero_pad(self, input_data):
    '''
    This function pads the input with zeros to a certain degree.
    Input:
      - input_data: 2D array (H_1 x W_1)
    Output:
      - padded_input: 2D  padded array (H_1+padding X W_2+padding)
    '''

    batch_size, D_1, H_1, W_1 = input_data.shape
    #create a padded array of zeros
    padded_input = np.zeros((batch_size, D_1, H_1 + (2 * self.padding), W_1 + (2 * self.padding)))
    #copy the input structure into the centre of the padded array
    padded_input[:, :, self.padding:self.padding + H_1, self.padding:self.padding + W_1] = input_data

    return padded_input

  def stride_fun(self, input, h, w, filter_size):
    '''
    This function is meant to slide the filter along the structure volume a certain number of steps a
    at each iteration.
    Inputs:
      - h: The height of the structure
      - w: The width of the structure
      - filter_size: The width and/or the height of the filter
    Output:
      - The input slice at a given iteration
    '''
    stride = self.stride
     # Calculate the top-left corner of the current window
    h_start = h * stride
    w_start = w * stride

    F_w = filter_size
    F_h = filter_size
    # Extract and return the input slice
    return input[:, h_start:h_start + F_h, w_start:w_start + F_w]

  def forward(self, input_data):
    '''
    This function does the actual convolution process that we described ealier in this cookbook.
    '''
    self.input_data = input_data
    stride_num = self.stride
    padding = self.padding
    filter_size = self.filter_size
    filters = self.filters
    biases = self.biases

    (C_in, H_in, W_in) = self.input_shape
    batch_size = input_data.shape[0]
    (num_filters, C_in_filter, F_h, F_w) = self.filter_shape

    H_out = int(np.floor((H_in + 2*padding - F_h) / stride_num + 1))
    W_out = int(np.floor((W_in + 2*padding - F_w) / stride_num + 1))

    # Initialize the output tensor
    output = np.zeros((batch_size, num_filters, H_out, W_out))

    self.padded_input = self.zero_pad(input_data)

    # Perform convolution
    for b in range(batch_size):
      for n in range(num_filters):  # Loop over each filter
          for h in range(H_out):  # Loop over output height
              for w in range(W_out):  # Loop over output width
                  input_slice = self.stride_fun(self.padded_input[b], h, w, filter_size)

                  # Perform dot product
                  output[b, n, h, w] = np.sum(input_slice * filters[n]) + biases[n]

    return output

  def backward(self, output_gradient, optimizer):
    '''
    This function performs the backward pass to the network that will
    '''
    filters_gradient= np.zeros(self.filter_shape)
    input_gradient = np.zeros(self.padded_input.shape)
    bias_gradient = np.mean(output_gradient, axis=(0,2,3))

    #compute the gradients
    for b in range(self.input_data.shape[0]):
      for i in range (self.num_filters):
        for j in range (self. input_depth):
          filters_gradient[i, j] += signal.correlate2d(self.input_data[b,j], output_gradient[b,i], "valid")
          input_gradient[b,j] += signal.convolve2d(output_gradient[b,i], self.filters[i, j], "full")

    self.filters, self.biases = optimizer.update(
        self.filters, self.biases, filters_gradient, bias_gradient
    )

    return input_gradient

In [198]:
class MaxPool2D():
  def __init__(self, pool_size, stride, padding):
    self.pool_size = pool_size
    self.stride = stride
    self.padding = padding
    self.input_shape = None
    self.input_map = None

  def forward(self, feature_map):
    '''
    This function is meant to act as the pooling layer after a Conv2D layer
    Inputs:
      feature_map: This is the output of the Conv2D layer
      pool_size: This is the size of the pooling filter
      stride: This the steps to be taken by the filter
      padding: This is the amount of zero padding to be added
    Output:
      output: This is the input structure with reduced spatial dimanesions of the pooling layer
    '''
    #account for any padding that may be added

    self.input_shape = feature_map.shape
    self.input_map = feature_map

    if self.padding > 0:
          feature_map = np.pad(feature_map,
                        ((0,0), (0,0), (self.padding, self.padding), (self.padding, self.padding)),
                        mode='constant',
                        constant_values=0)

    #Get the shapes for the input and the pooling filter
    batch_size, channels, H_in, W_in = feature_map.shape
    pool_h, pool_w = self.pool_size

    #calculate the output size
    W_out = (W_in - pool_w) // self.stride + 1
    H_out = (H_in - pool_h) // self.stride + 1

    H_out = max(1, H_out)
    W_out = max(1, W_out)

    self.max_indices = np.zeros_like(feature_map)

    #initialize the output
    output_map = np.zeros((batch_size, channels, H_out, W_out))

    #create the window and sliding
    for b in range(batch_size):
      for c in range(channels):
        for i in range(H_out):
          for j in range(W_out):
            #slide the window
            window = feature_map[b, c, i*self.stride : i*self.stride+pool_h,
                                j*self.stride : j*self.stride+pool_w]
            #perform max pooling
            output_map[b, c, i, j] = np.max(window)
            max_idx = np.unravel_index(np.argmax(window), window.shape)
            self.max_indices[b, c, i+max_idx[0], j+max_idx[1]] = 1


    return output_map

  def backward(self, dL_dOutput):
    '''
    This function is meant to propagate the feature map with repect to the input.
    Basically give us our gradient the same shape as the input
    Input:
      dL_dOutput: Gradients passed in the backprop
    '''
    dL_dInput = np.zeros_like(self.input_map)
    batch_size, channels, out_height, out_width = dL_dOutput.shape

    for b in range(batch_size):
        for c in range(channels):
            for i in range(out_height):
                for j in range(out_width):
                    # Find the corresponding window in the input
                    window_start_i = i * self.stride
                    window_start_j = j * self.stride
                    window_end_i = window_start_i + self.pool_size[0]
                    window_end_j = window_start_j + self.pool_size[0]

                    # Only the position of the max value gets the gradient
                    max_mask = self.max_indices[b, c, window_start_i:window_end_i, window_start_j:window_end_j]
                    dL_dInput[b, c, window_start_i:window_end_i, window_start_j:window_end_j] += dL_dOutput[b, c, i, j] * max_mask

    return dL_dInput

In [199]:
class PReLU():
  def __init__(self, alpha=0.01):
    '''
      This function initializes the PReLU activation function. Seriously, initialize this stuff, makes life so much easier.
    '''
    self.alpha = alpha
    self.alpha_grad = None
    self.prelu_input = None

  def forward(self, prev_layer_input):
    '''
      This function does the forward pass of the PReLU activation function.
      Input:
        prev_layer_input: The input from the previous layer, be it the conv2D or the FC layer.
      Output:
        The output of the PReLU activation function.
    '''
    #storing the input with self to use in the backprop function
    self.prelu_input = prev_layer_input
    return np.where(prev_layer_input > 0, prev_layer_input, self.alpha * prev_layer_input)

  def backward(self, dy):
    '''
      This function does the backward pass of the PReLU activation function.
      Input:
        dy: The gradient of the loss function with respect to the output of the PReLU activation function.
      Output:
        dx: The gradient of the loss function with respect to the input of the PReLU activation function.
    '''
    if self.prelu_input is None:
      raise ValueError("Input to PReLU activation function has not been computed yet.")

    # Gradient of the activation with respect to the input
    dx = np.where(self.prelu_input > 0, dy, self.alpha * dy)

    # Gradient of alpha: sum of the gradients where input <= 0
    self.alpha_grad = np.sum(dy * self.prelu_input * (self.prelu_input <= 0))

    return dx

class Softmax():
  def __init__(self):
    self.output = None

  def softmax(self, logits):
    '''
    This function is compute the class probabilites for a certain data point
    using softmax activation function
    Input:
      x: The input array of size (batch_size, num_classes)
    Output:
      The class probabilites for the data point
    '''
    self.logits = logits
    exp_x = np.exp(self.logits - np.max(self.logits, axis=0, keepdims=True))
    return exp_x / np.sum(exp_x, axis=0, keepdims=True)


In [200]:
class SGD_NAG:
    def __init__(self, learning_rate, momentum):
        '''
        Always have an __init__ function in your class, seriously, do it.
        Inputs:
          learning_rate: The step size for the optimization.
          momentum: The momentum coefficient.
        '''

        self.learning_rate = learning_rate
        self.momentum = momentum
        self.velocity_filters = None
        self.velocity_biases = None

    def update(self, filters, biases, filters_grads, biases_grads):
        '''
        After calculating the gradients in the backpass, this function
        will update the parameters using Nesterov Accelerated Gradient.
        Inputs:
          filters: The current filters for the convLayer
          biases: The biases for the convLayer
          filters_grads: The gradients of the filters
          biases_grads: The gradients of the biases
        Output:
          Updated filters and biases.
        '''
        self.filters = filters
        self.biases = biases
        #Initalize the filter and biases velocity
        if self.velocity_filters is None:
            self.velocity_filters = np.zeros_like(filters)
        if self. velocity_biases is None:
            self.velocity_biases = np.zeros_like(biases)

        #lookahead for the filters and biases
        lookahead_filters = self.filters - self.momentum * self.velocity_filters
        lookahead_biases = self.biases - self.momentum * self.velocity_biases

        #update velocities
        self.velocity_filters = self.momentum * self.velocity_filters - self.learning_rate * filters_grads
        self.velocity_biases = self.momentum * self.velocity_biases - self.learning_rate * biases_grads

        #update parameters
        updated_filters = lookahead_filters + self.velocity_filters + self.momentum * (self.velocity_filters - self.momentum * self.velocity_filters)
        updated_biases = lookahead_biases + self.velocity_biases + self.momentum * (self.velocity_biases - self.momentum * self.velocity_biases)

        return updated_filters, updated_biases

In [201]:
class Flatten:
  def __init__(self):
    '''
    An initialization function for the flatten layer.
    I think by now you should know what I'm going to say for this
    '''
    self.input_shape = None #to store the original shape for unflatening

  def flatten(self, input_tensor):
    '''
    Reshape the input tensor into a 1D vector.
    Input:
      input_tensor: The input tensor to be flattened.
    Output:
      The flattened tensor.
    '''
    self.input_shape = input_tensor.shape
    return input_tensor.reshape(-1)

  def unflatten(self, output_gradient):
    '''
    Reshape the output gradient from the FC layer into original shape
    Input:
      output_gradient: The output gradient from the FC layer.
    Output:
      The unflattened gradient.
    '''
    return output_gradient.reshape(self.input_shape)

In [202]:
class FullyConnectedLayer:
    def __init__(self, input_neurons_num, output_neurons_num, activation_function, activation_derivative):
        '''
        Initialize the fully connected layer. No comment!
        Input:
          input_neurons_num: Number of input neurons.
          output_neurons_num: Number of output neurons.
          activation_function: The activation function to apply.
          activation_derivative: The derivative of the activation function.
        '''
        self.input_size = input_neurons_num
        self.output_size = output_neurons_num
        self.activation_function_instance = activation_function
        self.activation_derivative_instance = activation_derivative

        # Initialize weights and biases with He initialization
        self.weights = np.random.randn(output_neurons_num, input_neurons_num) * np.sqrt(2 / input_neurons_num)
        self.biases = np.zeros((output_neurons_num, 1))
        self.z = None
        self.activated_output = None

    def forward(self, input_data):
        '''
        Perform the forward pass through the FC layer.
        Input:
          input_data: Input data of shape (input_neurons_num, batch_size).
        Output:
          Output data after applying the activation function.
        '''
        input_data = input_data.reshape(self.input_size, -1)
        self.input = input_data  # Store input for use in backward pass
        self.z = np.dot(self.weights, input_data) + self.biases

        if self.activation_function_instance is not None:
          if isinstance(self.activation_function_instance, PReLU):
            self.activated_output = self.activation_function_instance.forward(self.z)
          else:
              self.activated_output = self.activation_function_instance(self.z)
          return self.activated_output
        else:
          self.activated_output = self.z

          return self.activated_output

    def backward(self, output_gradient, learning_rate):
        '''
        Perform the backward pass through the FC layer.
        Input:
          output_gradient: Gradient of the loss w.r.t. the output (a) of this layer.
          learning_rate: Learning rate for weight updates.
        Output:
          dx Gradient of the loss w.r.t. the input (x) of this layer.
        '''

        # Compute gradient w.r.t. z
        if self.activation_derivative_instance is not None:
          if isinstance(self.activation_function_instance, PReLU):
              dz = output_gradient * self.activation_derivative_instance.backward(self.activated_output)
          else:
              dz = output_gradient * self.activation_derivative_instance(self.activated_output)

          #dz = output_gradient * self.activation_derivative(self.activated_output)  # Element-wise multiplication
        else: dz = output_gradient

        # Compute gradients
        dw = np.dot(dz, self.input.T) / dz.shape[1]  # Weight gradient
        db = np.sum(dz, axis=1, keepdims=True) / dz.shape[1]  # Bias gradient

        # Update weights and biases
        self.weights -= learning_rate * dw
        self.biases -= learning_rate * db  # Broadcasting now works correctly

        # Compute input gradient
        dx = np.dot(self.weights.T, dz)
        return dx



In [203]:
class losses():
  def __init__(self, y_true, y_pred):
    '''
    Initialize the loss function.
    Input:
      y_true: The true labels.
      y_pred: The predicted labels.
    '''
    self.y_true = y_true
    self.y_pred = y_pred

  def binary_cross_entropy(self):
    '''
    Compute the binary cross-entropy loss.
    Output:
      The binary cross-entropy loss.
    '''
    return -(1/self.m) * np.sum(self.y_true * np.log(self.y_pred) + (1 - self.y_true) * np.log(1 - self.y_pred))


  def cross_entropy(self):
    '''
    Compute the cross-entropy loss for multi-class classification.
    Output:
      The cross-entropy loss.
    '''
    batch_size = self.y_pred.shape[0]

    # Ensure y_true is an integer array of class indices
    y_true = self.y_true.astype(int)

    if y_true.ndim > 1:
      y_true = np.argmax(y_true, axis=1)

    correct_class_probs = self.y_pred[np.arange(batch_size), y_true]
    log_probs = np.log(correct_class_probs + 1e-15)

    log_probs = log_probs[:, np.newaxis]

    return -np.mean(np.sum(self.y_true * log_probs, axis =1))

In [204]:
class BatchNormalization:
  def __init__(self, input_size, epsilon=1e-5, momentum=0.9):
    '''
    I really don't know what else I can say about initilizations.
    Input:
      input_size: batch size
      epsilon: a small positive number to avoid division by 0
      momentum: momentum coefficient (friction) when using SGD with momentum
    '''
    self.gamma = np.ones((input_size, 1))
    self.beta = np.zeros((input_size, 1))
    self.epsilon = epsilon
    self.momentum = momentum
    self.running_mean = np.zeros((input_size, 1))
    self.running_var = np.zeros((input_size, 1))

  def forward(self, x, training=True):
    '''
    This is the forward pass for batch normalization
    Input:
      training: a boolean telling us whether the network should learn gamma and beta or not
    Output:
      The normalized output
    '''
    if training:
        self.mean = np.mean(x, axis=1, keepdims=True)
        self.var = np.var(x, axis=1, keepdims=True)
        self.x_hat = (x - self.mean) / np.sqrt(self.var + self.epsilon)

        # Update running estimates
        self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * self.mean
        self.running_var = self.momentum * self.running_var + (1 - self.momentum) * self.var
    else:
        # Use running estimates for inference
        self.x_hat = (x - self.running_mean) / np.sqrt(self.running_var + self.epsilon)

    self.out = self.gamma * self.x_hat + self.beta
    return self.out

  def backward(self, dout):
    '''
    This function is meant to 'undo' the effect of the normalization by backprop
    so as to compute the gradient of x
    Input:
      dout: the gradient of a later layer in the network

    Output:
      dx: the gradient of x backpropagated through the batch normalization
    '''
    m = dout.shape[1]

    # Gradients w.r.t. gamma and beta
    self.dgamma = np.sum(dout * self.x_hat, axis=1, keepdims=True)
    self.dbeta = np.sum(dout, axis=1, keepdims=True)

    # Backprop through normalization
    dx_hat = dout * self.gamma
    dvar = np.sum(dx_hat * (self.input - self.mean) * -0.5 * (self.var + self.epsilon)**-1.5, axis=1, keepdims=True)
    dmean = np.sum(dx_hat * -1 / np.sqrt(self.var + self.epsilon), axis=1, keepdims=True) + dvar * np.mean(-2 * (self.input - self.mean), axis=1, keepdims=True)

    dx = dx_hat / np.sqrt(self.var + self.epsilon) + dvar * 2 * (self.input - self.mean) / m + dmean / m
    return dx


In [205]:
class Dropout:
    def __init__(self, keep_prob=0.5):
        self.keep_prob = keep_prob

    def forward(self, x, training=True):
        if training:
            self.mask = (np.random.rand(*x.shape) < self.keep_prob) / self.keep_prob
            return x * self.mask
        else:
            return x  # During inference, no dropout is applied

    def backward(self, dout):
        return dout * self.mask

In [206]:
class evaluation():
  def __init__(self, y_true, y_pred):
    '''
    Initialize the evaluation metrics.
    Input:
      y_true: The true class labels.
      y_pred: The predicted labels by our network.
    '''
    self.y_true = y_true
    self.y_pred = y_pred

  def accuracy(self):
    '''
    Compute the accuracy of the model.
    Output:
      The accuracy of the model.
    '''
    return np.mean(self.y_true == self.y_pred)

  def confusion_matrix(self, num_classes):
    '''
    Create a confusion matrix to show the relationship between classes.
    Output:
      The confusion matrix.
    '''
    conf_matrix = np.zeros((num_classes, num_classes), dtype=int)
    for true_label, pred_label in zip(self.y_true, self.y_pred):
        conf_matrix[true_label, pred_label] += 1
    return conf_matrix

In [207]:
class CNN():
  def __init__(self):
    self.conv1 = Conv2DLayer(input_shape=(1,28,28),
                             filter_shape = (32,1,2,2),
                             stride=1,
                             padding=1)

    self.pool1 = MaxPool2D(pool_size=(2,2), stride=2, padding=0)

    self.conv2 = Conv2DLayer(input_shape=(32,14,14),
                             filter_shape = (64,32,2,2),
                             stride=1,
                             padding=1)

    self.pool2 = MaxPool2D(pool_size=(2,2), stride=2, padding=0)
    self.flatten = Flatten()

    self.prelu_fc1 = PReLU()

    self.fc1 = FullyConnectedLayer(3136, 128,
                                   activation_function= self.prelu_fc1.forward,
                                   activation_derivative=self.prelu_fc1.backward)

    self.dropout = Dropout(keep_prob=0.5)

    self.fc2 = FullyConnectedLayer(128, 10,
                                   activation_function= Softmax().softmax,
                                   activation_derivative=None)

    self.softmax = Softmax()
    self.prelu1 = PReLU()
    self.prelu2 = PReLU()


  def forward(self, x):
    out = self.conv1.forward(x)
    out = self.prelu1.forward(out)
    out = self.pool1.forward(out)

    out = self.conv2.forward(out)
    out = self.prelu2.forward(out)
    out = self.pool2.forward(out)

    out = self.flatten.flatten(out)

    out = self.fc1.forward(out)
    out = self.prelu_fc1.forward(out)

    out = self.dropout.forward(out, training=True)

    out = self.fc2.forward(out)
    out = self.softmax.softmax(out)

    return out

  def backward(self, d_loss, learning_rate, optimizer):
    optimizer_conv1 = optimizer[0]
    optimizer_conv2 = optimizer[1]
    grad = d_loss

    grad = self.fc2.backward(grad.T, learning_rate)
    grad = self.dropout.backward(grad)

    grad = self.prelu_fc1.backward(grad)
    grad = self.fc1.backward(grad, learning_rate)
    grad = self.flatten.unflatten(grad)

    grad = self.pool2.backward(grad)
    grad = self.prelu2.backward(grad)
    grad = self.conv2.backward(grad, optimizer_conv2)

    grad = self.pool1.backward(grad)
    grad = self.prelu1.backward(grad)
    grad = self.conv1.backward(grad, optimizer_conv1)

    return grad

In [208]:
!pip install keras



In [209]:
import numpy as np
from tensorflow.keras.datasets import mnist

def load_mnist_data():
    """
    Load MNIST dataset and preprocess it.
    :return: Tuple of (train_data, train_labels, test_data, test_labels)
    """
    # Load MNIST data from tensorflow.keras
    (train_data, train_labels), (test_data, test_labels) = mnist.load_data()

    # Normalize the data to be in range [0, 1]
    train_data = train_data.astype(np.float32) / 255.0
    test_data = test_data.astype(np.float32) / 255.0

    # Reshape the data to have shape (batch_size, channels, height, width)
    train_data = train_data.reshape(-1, 1, 28, 28)
    test_data = test_data.reshape(-1, 1, 28, 28)

    # One-hot encode the labels
    train_labels = np.eye(10)[train_labels]  # One-hot encoding for 10 classes
    test_labels = np.eye(10)[test_labels]

    return train_data, train_labels, test_data, test_labels

In [None]:
import numpy as np

def train(model, train_data, train_labels, epochs, batch_size, learning_rate, optimizer):
    """
    Train the CNN model using the provided training data.

    :param model: The CNN model instance.
    :param train_data: Training data (images).
    :param train_labels: Training labels (one-hot encoded).
    :param epochs: Number of epochs to train.
    :param batch_size: Size of each training batch.
    :param learning_rate: Learning rate for parameter updates.
    :param optimizer: Optimizer instance (e.g., SGD or SGD with Nesterov momentum).
    """
    num_samples = train_data.shape[0]
    num_batches = num_samples // batch_size

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_accuracy = 0

        # Shuffle data at the beginning of each epoch
        perm = np.random.permutation(num_samples)
        train_data = train_data[perm]
        train_labels = train_labels[perm]

        for i in range(num_batches):
            # Get the current batch
            batch_start = i * batch_size
            batch_end = (i + 1) * batch_size
            x_batch = train_data[batch_start:batch_end]
            y_batch = train_labels[batch_start:batch_end]

            # Forward pass: compute output and loss
            y_pred = model.forward(x_batch).transpose()
            loss_instance = losses(y_batch, y_pred)
            loss = loss_instance.cross_entropy()
            epoch_loss += loss

            # Compute accuracy for this batch
            batch_accuracy = np.mean(np.argmax(y_pred, axis=1) == np.argmax(y_batch, axis=1))
            epoch_accuracy += batch_accuracy

            # Backward pass: compute gradients and update weights
            d_loss = y_pred - y_batch  # For cross-entropy + softmax
            model.backward(d_loss, learning_rate, optimizer)


        # Average loss and accuracy for the epoch
        epoch_loss /= num_batches
        epoch_accuracy /= num_batches

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

# Example of training loop setup
if __name__ == "__main__":
    # Assuming you have a function to load MNIST dataset
    train_data, train_labels, test_data, test_labels = load_mnist_data()

    # Initialize the model and optimizer
    cnn_model = CNN()
    optimizer_conv1 = SGD_NAG(learning_rate=0.01, momentum=0.9)
    optimizer_conv2 = SGD_NAG(learning_rate=0.01, momentum=0.9)

    # Train the model
    train(cnn_model, train_data, train_labels, epochs=10, batch_size=64,
          learning_rate=0.01,
          optimizer=[optimizer_conv1, optimizer_conv2])


  return x * self.mask
  return x * self.mask
  exp_x = np.exp(self.logits - np.max(self.logits, axis=0, keepdims=True))
  self.alpha_grad = np.sum(dy * self.prelu_input * (self.prelu_input <= 0))
  self.alpha_grad = np.sum(dy * self.prelu_input * (self.prelu_input <= 0))
