## Convolutional Neural Network

### index

- filter
- padding
- stride
- activation
- FC layer

### motivation

CNN의 내부동작원리의 이해를 돕기 위한 자료

### data

MNIST dataset

### reference 
- CS231n: Convolutional Neural Networks for Visual Recognition (http://cs231n.github.io/convolutional-networks/)
- https://github.com/raphey/numpy-cnn/blob/master/nn_util.py




In [1]:
# import module
import numpy as np
import warnings
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle


### mnist data 불러오기

In [2]:
# dataset download and prepare dataset
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle


def rough_print(num_arr):
    """
    Print a rough 28 x 28 text rendering of a 784-element image array.

    Each cell greater than 0 is drawn as 'X', every other cell as '.'.
    One line is printed per image row.
    """
    grid = num_arr.reshape((28, 28))
    for pixel_row in grid:
        # Build the whole line first, then emit it with a single print call.
        print("".join('X' if pixel > 0 else '.' for pixel in pixel_row))


def shuffle_data(data_obj, random_seed=0):
    """
    Shuffle the 'data' and 'target' entries of data_obj in unison.

    The targets are first reshaped into a single column. Returns whatever
    sklearn's shuffle returns for the two arrays, i.e. the shuffled data
    and the shuffled targets, using random_seed as the random state.
    """
    features = data_obj['data']
    labels = data_obj['target'].reshape(-1, 1)

    shuffled = shuffle(features, labels, random_state=random_seed)
    return shuffled


def import_and_prepare_mnist_data(valid_portion=0.1, test_portion=0.1, flat=True):
    """
    Download MNIST, shuffle it, and split it into training, validation and testing sets.

    valid_portion / test_portion are the fractions of the data set reserved for
    validation and testing; the remainder is used for training.

    If flat is False, every image is reshaped from 784 values to (1, 28, 28),
    i.e. (depth, height, width), so it can be fed to a convolution layer.

    Returns (train, valid, test). Each is a dict with three keys:
      'x': the image data, scaled by 1/255
      'y_': the one-hot encoded labels
      'y_as_int': the labels as integers, for quick accuracy checking
    """
    # NOTE(review): fetch_mldata appears to have been dropped from recent
    # scikit-learn releases -- confirm against the pinned version.
    mnist = fetch_mldata('MNIST original')
    total = len(mnist['data'])

    images, labels = shuffle_data(mnist)
    if not flat:
        images = images.reshape(-1, 1, 28, 28)
    images = images / 255.0

    labels = labels.astype(int)
    one_hot_labels = one_hot_encode(labels)

    # Cutoff indices between training/validation and validation/testing
    valid_begin = int((1.0 - valid_portion - test_portion) * total)
    test_begin = int((1.0 - test_portion) * total)

    def subset(window):
        # Slice all three aligned arrays with the same window.
        return {'x': images[window],
                'y_': one_hot_labels[window],
                'y_as_int': labels[window]}

    train = subset(slice(None, valid_begin))
    valid = subset(slice(valid_begin, test_begin))
    test = subset(slice(test_begin, None))
    return train, valid, test


def initialize_weight_array(l, w, stddev=None, relu=False, sigma_cutoff=2.0):
    """
    Initialize a weight array with l rows and w columns from a truncated normal.

    If stddev is not specified it defaults to sqrt(1 / l), where l is treated as
    the number of inputs. If the weights are going to be used with relu
    activation (relu=True) the default is sqrt(2 / l) instead, since presumably
    half the neurons won't fire.

    sigma_cutoff is the maximum number of stddevs away from 0 a value may be;
    samples outside that range are rejected and redrawn.

    Raises ValueError if sigma_cutoff * stddev is not positive, because then no
    sample could ever be accepted.
    """
    if stddev is None:
        stddev = ((2.0 if relu else 1.0) / l) ** 0.5

    needed = l * w
    limit = sigma_cutoff * stddev
    if needed > 0 and not limit > 0:
        # The scalar rejection loop this replaces would spin forever here.
        raise ValueError("sigma_cutoff * stddev must be positive, got {}".format(limit))

    # Vectorized rejection sampling: draw the outstanding amount in one call,
    # keep what lies inside the cutoff, repeat until enough values are held.
    accepted_chunks = []
    have = 0
    while have < needed:
        draws = np.random.randn(needed - have) * stddev
        inside = draws[np.abs(draws) < limit]
        accepted_chunks.append(inside)
        have += inside.size

    samples = np.concatenate(accepted_chunks) if accepted_chunks else np.empty(0)
    return samples[:needed].reshape(l, w)


def one_hot_encode(targets):
    """
    One hot encode integer class labels into rows of length 10.

    [4] --> [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

    targets may be a flat sequence of labels or an (N, 1) column of labels;
    it is flattened before encoding. Returns a float array of shape (N, 10).
    An empty input yields an array of shape (0, 10).
    """
    # Flatten first so that one-element rows never have to be converted to a
    # Python scalar individually.
    labels = np.asarray(targets).reshape(-1).astype(int)
    encoded = np.zeros((labels.size, 10))
    encoded[np.arange(labels.size), labels] = 1.0
    return encoded


def prediction_mse(y_actual, y_pred):
    """
    Squared-error loss for regression.

    Returns half the sum, over the leading index, of the squared difference
    between y_actual and y_pred (no division by the number of samples).
    """
    total = 0
    for idx in range(0, len(y_actual)):
        difference = y_actual[idx] - y_pred[idx]
        total = total + difference ** 2
    return 0.5 * total


def prediction_cel(y_actual, y_pred):
    """
    Cross-entropy loss for classification.

    y_actual and y_pred are either a single 1-D sample or a 2-D batch laid out
    as (batch, classes). The element-wise binary cross-entropy is summed and
    divided by batch * classes. Predictions are not clipped, so values of
    exactly 0.0 or 1.0 produce infinities.
    """
    # Promote to 2-D arrays; wrapping the inputs in Python lists instead breaks
    # the `1.0 - y_pred` arithmetic below for a single 1-D sample.
    y_actual = np.atleast_2d(np.asarray(y_actual, dtype=float))
    y_pred = np.atleast_2d(np.asarray(y_pred, dtype=float))

    size = y_actual.shape[0] * y_actual.shape[1]  # batch size * number of classes
    log_likelihood = y_actual * np.log(y_pred) + np.log(1.0 - y_pred) * (1.0 - y_actual)
    return -1.0 / size * np.sum(log_likelihood)


def sigmoid(x):
    """
    Element-wise logistic function 1 / (1 + exp(-x)) of the array x.
    """
    denominator = np.exp(-x) + 1.0
    numerator = np.ones(shape=x.shape)
    return numerator / denominator


def soft_max(z):
    """
    Row-wise softmax.

    z is either a single 1-D vector of logits or a 2-D batch laid out as
    (batch, classes). The result is always at least 2-D; each row sums to 1.
    """
    z = np.atleast_2d(np.asarray(z, dtype=float))
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing to inf (and inf / inf = nan).
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(shifted)
    sums = np.sum(exp_z, axis=1, keepdims=True)
    return exp_z / sums


def pad_image(img_array, top_pad, bottom_pad, left_pad, right_pad):
    """
    Return a zero-padded copy of an image array or batch of image arrays.

    Only the last two axes (height, width) are padded. img_array may be a flat
    image (height, width), an image with depth (depth, height, width), or a
    batch of images with depth (batch size, depth, height, width).
    """
    height, width = img_array.shape[-2], img_array.shape[-1]

    # Shape of the padded result: identical except for the last two axes.
    target_shape = list(img_array.shape)
    target_shape[-2] += top_pad + bottom_pad
    target_shape[-1] += left_pad + right_pad
    canvas = np.zeros(target_shape)

    rows = slice(top_pad, top_pad + height)
    cols = slice(left_pad, left_pad + width)

    # Number of untouched leading axes in front of (height, width).
    rank = len(img_array.shape)
    if rank == 2:
        leading = ()
    elif rank == 3:
        leading = (slice(None),)
    else:
        leading = (slice(None), slice(None))

    canvas[leading + (rows, cols)] = img_array
    return canvas


def flat_img_to_conv_stack(img, window_size, stride):
    """
    Unroll a flat (height, width) image into a convolutional stack.

    A square window_size x window_size window is moved across the image in
    steps of stride (left to right along the top, then the next row down, etc).
    Each window becomes one 1-D row, so the stack has dimensions
    number_of_windows x window_size^2.
    """
    rows, cols = img.shape
    flat_len = window_size ** 2
    tops = range(0, rows - window_size + 1, stride)
    lefts = range(0, cols - window_size + 1, stride)

    windows = [img[top: top + window_size, left: left + window_size].reshape(flat_len)
               for top in tops
               for left in lefts]
    return np.array(windows)


def deep_img_to_conv_stack(img, window_size, stride):
    """
    Unroll a (depth, height, width) image into a convolutional stack.

    A square prism window spanning the full depth is moved across the image in
    steps of stride (left to right along the top, then the next row down, etc).
    Each window prism becomes one 1-D row, so the stack has dimensions
    (number_of_windows) by (window_size^2 * depth).
    """
    depth, rows, cols = img.shape
    flat_len = window_size ** 2 * depth
    tops = range(0, rows - window_size + 1, stride)
    lefts = range(0, cols - window_size + 1, stride)

    prisms = [img[:, top: top + window_size, left: left + window_size].reshape(flat_len)
              for top in tops
              for left in lefts]
    return np.array(prisms)

### padding

- 이미지 데이터의 rank를 잘 파악해야합니다.
- 이미지의 rank가 상황마다 어떤식으로 표현되는지 파악해야합니다.

In [3]:
# padding function definition
def pad_image(img_array, top_pad, bottom_pad, left_pad, right_pad):
    """
    Pads the height and width dimensions of an image array or batch of image arrays
    with zeros according to the padding parameters, and returns a new padded array.

    Only the last two axes are padded, so img_array can be a single flat image with
    dimensions (height, width), an image with depth (channels) with dimensions
    (depth, height, width), a batch of images with dimensions
    (batch size, depth, height, width), or any higher-rank array whose last two
    axes are (height, width).

    ---------------------------------------------------------------------------------------
    img_array : image data
    top_pad : number of zero rows added above the image
    bottom_pad : number of zero rows added below the image
    left_pad : number of zero columns added to the left of the image
    right_pad : number of zero columns added to the right of the image
    """
    img_height = img_array.shape[-2]
    img_width = img_array.shape[-1]

    # Same shape as the input except for the two trailing (height, width) axes.
    padded_shape = list(img_array.shape)
    padded_shape[-2] += top_pad + bottom_pad  # padded height
    padded_shape[-1] += left_pad + right_pad  # padded width

    padded_img = np.zeros(padded_shape)  # start from all zeros

    # The ellipsis keeps every leading axis untouched, so one assignment covers
    # 2-D, 3-D, 4-D and higher-rank input alike.
    padded_img[..., top_pad: top_pad + img_height, left_pad: left_pad + img_width] = img_array

    return padded_img

In [4]:
def initialize_weight_array(l, w, stddev=None, relu=False, sigma_cutoff=2.0):
    """
    Initializes a weight array with l rows and w columns from a truncated normal.

    If stddev is not specified it defaults to sqrt(1 / l), where l is treated as
    the number of inputs. If the weight array is going to be used with relu
    activation (relu=True) the default is sqrt(2 / l) instead, since presumably
    half the neurons won't fire.

    sigma_cutoff determines the max number of stddevs away from 0 an initialized
    value can be; samples outside that range are rejected and redrawn.

    Raises ValueError if sigma_cutoff * stddev is not positive, because then no
    sample could ever be accepted.
    """
    if stddev is None:
        stddev = ((2.0 if relu else 1.0) / l) ** 0.5

    needed = l * w
    limit = sigma_cutoff * stddev
    if needed > 0 and not limit > 0:
        # The scalar rejection loop this replaces would spin forever here.
        raise ValueError("sigma_cutoff * stddev must be positive, got {}".format(limit))

    # Vectorized rejection sampling: draw the outstanding amount in one call,
    # keep what lies inside the cutoff, repeat until enough values are held.
    accepted_chunks = []
    have = 0
    while have < needed:
        draws = np.random.randn(needed - have) * stddev
        inside = draws[np.abs(draws) < limit]
        accepted_chunks.append(inside)
        have += inside.size

    samples = np.concatenate(accepted_chunks) if accepted_chunks else np.empty(0)
    return samples[:needed].reshape(l, w)

In [5]:
class Network(object):
    """
    Base class for a neural net.

    self.layers is a list of layers ordered from input to output; activation
    functions count as separate layers.

    feed_forward pushes an input through every layer's forward_pass, storing
    each layer's input along the way, and returns the last layer's output.

    feed_backward pushes output deltas backwards through every layer's
    backward_pass, which may modify the layers.
    """
    def __init__(self, layers):
        # Ordered collection of layer objects.
        self.layers = layers

    def feed_forward(self, x_in):
        signal = x_in
        for layer in self.layers:
            layer.input = signal
            signal = layer.forward_pass()
        # The result is read back from the final layer's output attribute.
        return self.layers[-1].output

    def feed_backward(self, delta_y_out, backprop_params):
        # Seed the last layer with the deltas coming from the loss.
        self.layers[-1].output_side_deltas = delta_y_out
        for idx in reversed(range(len(self.layers))):
            upstream = self.layers[idx].backward_pass(backprop_params)
            if idx > 0:
                self.layers[idx - 1].output_side_deltas = upstream

    @staticmethod
    def mse_cost(y_predicted, y_actual):
        """
        Returns the mean of the squared differences between actual and predicted values.
        """
        squared_error = (y_actual - y_predicted) ** 2
        return squared_error.mean()

In [6]:
class Classifier(Network):
    """
    Classifier network, with an accuracy method and a static cross-entropy loss method
    """

    def accuracy(self, x_input, labels_as_values):
        """
        Returns the fraction of samples in x_input whose highest-scoring output
        index equals the corresponding entry of labels_as_values.
        """
        all_output = self.feed_forward(x_input)
        correct = sum(1 for scores, label in zip(all_output, labels_as_values)
                      if np.argmax(scores) == label)
        return float(correct) / len(x_input)

    @staticmethod
    def cross_entropy_cost(y_predicted, y_actual):
        """
        Returns the mean cross-entropy loss for predicted values and actual values.

        Predicted values very close to 0.0 or 1.0 are clipped to avoid an
        overflowing cost. The clipping is done on a copy, so the caller's
        y_predicted array (typically a layer's output) is left untouched.
        """
        epsilon = 1e-12
        clipped = np.clip(y_predicted, epsilon, 1 - epsilon)

        size = len(y_actual) * len(y_actual[0])

        ce_cost = -1.0 / size * np.sum(y_actual * np.log(clipped) + np.log(1.0 - clipped) * (1.0 - y_actual))

        return ce_cost

### convolution layer class

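- Given an input volume of size W1×H1×D1 and K filters of spatial extent F, applied with stride S and zero padding P, the layer produces a volume of size W2×H2×D2 where:

- $W_2 = (W_1 - F + 2P)/S + 1$

- $H_2 = (H_1 - F + 2P)/S + 1$ (i.e. width and height are computed equally by symmetry)
- $D_2 = K$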

In [7]:
class Layer(object):
    """
    Common base for all layers: matrices, activation functions and
    convolution layers.

    Holds the most recent input/output and the deltas on either side of the
    layer. Subclasses must implement forward_pass and backward_pass.
    """
    def __init__(self):
        # Forward-pass state.
        self.input = self.output = None
        # Backward-pass state.
        self.output_side_deltas = self.input_side_deltas = None

    def forward_pass(self):
        raise NotImplementedError()

    def backward_pass(self, backprop_params):
        raise NotImplementedError()


In [8]:
class FullyConnectedLayer(Layer):
    """
    Dense layer: the input is multiplied by a trainable (rows x cols) weight
    matrix and a trainable bias row is added.
    """

    def __init__(self, rows, cols, relu=False):
        super().__init__()
        self.shape = (rows, cols)
        # Weights come from the shared initializer; the bias starts at zero.
        self.w = initialize_weight_array(rows, cols, relu=relu)
        self.b = np.zeros(shape=(1, cols))

    def forward_pass(self):
        weighted = np.dot(self.input, self.w)
        self.output = weighted + self.b
        return self.output

    def backward_pass(self, backprop_params):
        step, decay = backprop_params
        deltas = self.output_side_deltas  # set by the network before this call

        # Deltas for the previous layer use the weights as they were during
        # the forward pass, so compute them before touching self.w.
        self.input_side_deltas = np.dot(deltas, self.w.T)

        if decay:
            self.w *= (1.0 - decay * step)
        self.w += step * np.dot(self.input.T, deltas)
        self.b += step * deltas.sum(axis=0)
        return self.input_side_deltas

In [9]:
class SigmoidLayer(Layer):
    """
    Sigmoid activation layer. Output has the same shape as the input, and the
    input-side deltas have the same shape as the output-side deltas.
    """
    def forward_pass(self):
        activated = 1.0 / (np.exp(-self.input) + 1.0)
        self.output = activated
        return activated

    def backward_pass(self, backprop_params):
        # Backprop parameters are not used: there is nothing to train.
        # The deltas are scaled by the sigmoid derivative output * (1 - output).
        scaled = self.output_side_deltas * self.output
        self.input_side_deltas = scaled * (1.0 - self.output)
        return self.input_side_deltas


class SoftmaxLayer(Layer):
    """
    Softmax activation layer, to be used right before output. Backprop is skipped entirely,
    under the assumption that this will be used with cross-entropy loss.
    """
    def forward_pass(self):
        # Subtracting each row's maximum does not change the softmax value but
        # keeps exp() from overflowing to inf, which would produce nan outputs.
        shifted = self.input - np.max(self.input, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        sums = np.sum(exp_z, axis=1, keepdims=True)
        self.output = exp_z / sums
        return self.output

    def backward_pass(self, backprop_params):
        # Backprop parameters are not used; the deltas pass through unchanged.
        self.input_side_deltas = self.output_side_deltas
        return self.input_side_deltas

In [10]:
class LReLULayer(Layer):
    """
    Leaky ReLU activation layer with slope `a` on the negative side. Output has
    the same shape as the input, and the input-side deltas have the same shape
    as the output-side deltas.
    """
    def __init__(self, a=0.01):
        super().__init__()  # set up the shared Layer state
        self.a = a

    def forward_pass(self):
        leaked = self.a * self.input
        self.output = np.maximum(self.input, leaked)
        return self.output

    def backward_pass(self, backprop_params):
        # Backprop parameters are not used.
        # Deltas pass through unchanged where the input was non-negative and
        # are scaled by `a` everywhere else.
        passes_through = self.input >= 0
        self.input_side_deltas = np.where(passes_through,
                                          self.output_side_deltas,
                                          self.a * self.output_side_deltas)
        return self.input_side_deltas

In [11]:
class FullyConnectedLayerWithDropout(Layer):
    """
    Dense layer with a trainable weight matrix and bias, plus a random keep
    mask over the weights that can be switched on or off through dropout_on.
    """

    def __init__(self, rows, cols, keep_prob, relu=False):
        super().__init__()
        self.shape = (rows, cols)
        self.w = initialize_weight_array(rows, cols, relu=relu)
        self.b = np.zeros(shape=(1, cols))
        self.keep_prob = keep_prob
        self.dropout_on = False  # toggled from outside around training steps
        self.keep_mask = None    # mask drawn by the latest dropout forward pass

    def forward_pass(self):
        effective_w = self.w.copy()

        if self.dropout_on:
            # Draw a 0/1 mask with the shape of the weights and rescale the
            # kept entries by 1 / keep_prob.
            draws = np.random.binomial([np.ones(self.w.shape)], self.keep_prob)[0]
            self.keep_mask = draws * (1.0 / self.keep_prob)
            effective_w *= self.keep_mask

        self.output = np.dot(self.input, effective_w) + self.b
        return self.output

    def backward_pass(self, backprop_params):
        masked = self.dropout_on
        if not masked:
            warnings.warn("Warning: Backprop is being run without dropout, which is probably an error.")
        step, _ = backprop_params   # Not using L2 regularization

        # Reuse the mask from the forward pass so both passes see the same weights.
        effective_w = self.w.copy()
        if masked:
            effective_w *= self.keep_mask
        self.input_side_deltas = np.dot(self.output_side_deltas, effective_w.T)

        update = step * np.dot(self.input.T, self.output_side_deltas)
        if masked:
            update *= self.keep_mask

        self.w += update
        self.b += step * self.output_side_deltas.sum(axis=0)

        return self.input_side_deltas


In [12]:
class ConvolutionLayer(Layer):
    """
    Convolution layer implemented as a matrix product over unrolled input windows.

    Input and output are 4-D arrays laid out as (batch, channels, height, width).
    Every window of the (optionally zero-padded) input is flattened into one row of
    length channels_in * window_size**2; multiplying those rows by filter_2d, whose
    shape is (channels_in * window_size**2, channels_out), gives all output pixels.

    channels_out: depth of the output (number of filters)
    channels_in: depth of the input
    window_size: side length of the square filter window
    stride: step between neighbouring windows
    pad: if True the input is zero-padded in forward_pass, otherwise it is used as is
    relu: forwarded to initialize_weight_array to pick the default weight stddev
    """

    def __init__(self, channels_out, channels_in, window_size, stride, pad=False, relu=True):
        super().__init__()
        self.channels_out = channels_out  # depth of the output
        self.channels_in = channels_in    # depth of the input
        self.window_size = window_size
        self.stride = stride
        self.pad = pad

        self.shape_4d = (channels_out, channels_in, window_size, window_size)
        # Filter definition: one column per output channel, one row per element
        # of an unrolled input window.
        self.filter_2d = initialize_weight_array(channels_in * window_size ** 2, channels_out, relu=relu)
        # 4-D view of the same filters. The transpose puts the output-channel
        # axis first, matching how backward_pass rebuilds filter_4d.
        self.filter_4d = self.filter_2d.T.reshape(self.shape_4d)
        self.b = np.zeros(shape=(1, channels_out))

        # State filled in by forward_pass and reused by backward_pass.
        self.batch_size = None
        self.padded_input = None
        self.top_pad = None
        self.bottom_pad = None
        self.left_pad = None
        self.right_pad = None
        self.reshaped_input = None
        self.output_height = None
        self.output_width = None

    def forward_pass(self):
        """
        Computes and returns the layer output of shape
        (batch, channels_out, output_height, output_width).
        """
        if self.pad:
            # Padding amounts are derived from the window size and stride.
            # See the cs231n notes: http://cs231n.github.io/convolutional-networks/
            _, _, input_h, input_w = self.input.shape
            self.top_pad = (self.window_size - 1) // 2
            self.bottom_pad = self.window_size // 2 - (input_h - 1) % self.stride
            self.left_pad = (self.window_size - 1) // 2
            self.right_pad = self.window_size // 2 - (input_w - 1) % self.stride
            self.padded_input = pad_image(self.input, self.top_pad, self.bottom_pad,
                                          self.left_pad, self.right_pad)
        else:
            # Without padding the input is convolved exactly as given.
            self.padded_input = self.input

        self.batch_size = self.input.shape[0]
        # One row per window; also sets output_height / output_width.
        self.reshaped_input = self.img_batch_to_conv_stacks()

        # (batch * windows, unrolled window) x (unrolled window, channels_out)
        reshaped_output = np.dot(self.reshaped_input, self.filter_2d) + self.b
        # Rows are ordered batch-major, then window row, then window column, so
        # after the transpose the flat axis splits into (batch, height, width).
        self.output = reshaped_output.T.reshape(
            self.channels_out, self.batch_size, self.output_height, self.output_width
        ).transpose(1, 0, 2, 3)

        return self.output

    def backward_pass(self, backprop_params):
        """
        Updates filters and bias from self.output_side_deltas (set by the network
        before this call) and returns the deltas with respect to the layer input.

        backprop_params is (alpha_adj, lam): the step size and the L2 decay factor.
        """
        alpha_adj, lam = backprop_params

        # Bring the deltas into the same (batch * windows, channels_out) layout
        # that the forward matrix product produced.
        reshaped_output_side_deltas = self.output_side_deltas.transpose(1, 0, 2, 3).reshape(self.channels_out, -1).T

        # Deltas per unrolled window, computed with the pre-update filters.
        reshaped_input_side_deltas = np.dot(reshaped_output_side_deltas, self.filter_2d.T)

        self.input_side_deltas = self.conv_stack_deltas_to_input_deltas(reshaped_input_side_deltas)

        if self.pad:
            # Cut the padding off again so the deltas match the unpadded input.
            new_bottom_index = self.input_side_deltas.shape[2] - self.bottom_pad
            new_right_index = self.input_side_deltas.shape[3] - self.right_pad
            self.input_side_deltas = self.input_side_deltas[:, :, self.top_pad:new_bottom_index,
                                                            self.left_pad:new_right_index]

        self.filter_2d += alpha_adj * np.dot(self.reshaped_input.T, reshaped_output_side_deltas)
        self.filter_4d = self.filter_2d.T.reshape(self.shape_4d)

        self.b += alpha_adj * self.output_side_deltas.sum(axis=(0, 2, 3))

        if lam:
            # L2 weight decay, applied to both representations of the filters.
            self.filter_2d *= (1.0 - lam * alpha_adj)
            self.filter_4d *= (1.0 - lam * alpha_adj)

        return self.input_side_deltas

    def img_batch_to_conv_stacks(self):
        """
        Unrolls self.padded_input, a batch of images with depth, into a series of convolutional
        stacks obtained by passing a square prism window with matching depth across each image
        (left to right along the top, then next row down, etc, then the next image).
        Each window prism is unrolled into a single 1-D row, and the returned array has dimensions
        (batch size * number_of_windows) by (window_size^2 * depth).

        Also sets self.output_height and self.output_width.
        """
        batch_size, img_depth, img_height, img_width = self.padded_input.shape
        unrolled_window_size = self.window_size ** 2 * img_depth

        # Spatial size of the output; its depth is the number of filters.
        self.output_height = (img_height - self.window_size) // self.stride + 1
        self.output_width = (img_width - self.window_size) // self.stride + 1

        conv_stack = []

        for k in range(0, batch_size):
            # Slide down the height, then across the width, in steps of stride.
            for i in range(0, img_height - self.window_size + 1, self.stride):
                for j in range(0, img_width - self.window_size + 1, self.stride):
                    window = self.padded_input[k, :, i:i + self.window_size, j:j + self.window_size]
                    # Flatten so the window can be multiplied with filter_2d.
                    conv_stack.append(window.reshape(unrolled_window_size))

        return np.array(conv_stack)

    def conv_stack_deltas_to_input_deltas(self, reshaped_input_side_deltas):
        """
        Folds per-window deltas back onto the padded input: the inverse of the
        unrolling done by img_batch_to_conv_stacks. Overlapping windows add up.
        Returns an array with the shape of self.padded_input.
        """
        reshaped_input_side_deltas = reshaped_input_side_deltas.reshape(
            self.batch_size, self.output_height, self.output_width, -1)
        deconvolved_input_side_deltas = np.zeros(self.padded_input.shape)

        for k in range(self.batch_size):
            for i in range(self.output_height):
                for j in range(self.output_width):
                    patch_to_add = reshaped_input_side_deltas[k][i][j].reshape(
                        self.channels_in, self.window_size, self.window_size)

                    in_side_i = self.stride * i
                    in_side_j = self.stride * j
                    # Each sample's patches go to that sample's own slot k;
                    # writing them all to index 0 would mix the whole batch
                    # into the first image and leave the rest at zero.
                    deconvolved_input_side_deltas[k, 0:self.channels_in,
                                                  in_side_i:in_side_i + self.window_size,
                                                  in_side_j:in_side_j + self.window_size] += patch_to_add

        return deconvolved_input_side_deltas
    

In [13]:
class ConvolutionFullyConnectedBridge(Layer):
    """
    Reshaping layer between the convolutional and the fully connected part.

    Forward: flattens a 4-D input (batch_size, conv_output_channels,
    conv_output_height, conv_output_width) into a 2-D output
    (batch_size, conv_output_channels * conv_output_height * conv_output_width).
    Backward: restores the 4-D shape for the deltas.
    """

    def __init__(self, conv_output_channels, conv_output_height, conv_output_width):
        super().__init__()
        self.conv_output_channels = conv_output_channels
        self.conv_output_height = conv_output_height
        self.conv_output_width = conv_output_width

        # Learned from the input on every forward pass.
        self.batch_size = None

    def forward_pass(self):
        self.batch_size = self.input.shape[0]
        flattened = self.input.reshape(self.batch_size, -1)
        self.output = flattened
        return flattened

    def backward_pass(self, backprop_params):
        _, _ = backprop_params   # Ignoring backprop params, since there's nothing to train

        conv_shape = (self.batch_size, self.conv_output_channels,
                      self.conv_output_height, self.conv_output_width)
        self.input_side_deltas = self.output_side_deltas.reshape(conv_shape)
        return self.input_side_deltas


In [14]:
def train_regression_model(regression_net, train, test, alpha, epochs, lam=0.0, verbose=False):
    """
    Train a regression network on the full training set for `epochs` passes.

    Simpler than the classification trainer: no validation set and no batches.
    train and test are dicts with 'x' (inputs) and 'y_' (targets). Each pass
    feeds the whole training set forward and backpropagates the residual with a
    step of alpha / training size and L2 factor lam. With verbose=True the
    training loss is printed every 100 epochs; the testing loss is always
    printed at the end.
    """
    print("Training network with alpha={}, lambda={} for {} epochs...".format(alpha, lam, epochs))
    inputs_train, targets_train = train['x'], train['y_']
    inputs_test, targets_test = test['x'], test['y_']

    sample_count = inputs_train.shape[0]

    for epoch in range(1, epochs + 1):
        residual = targets_train - regression_net.feed_forward(inputs_train)
        regression_net.feed_backward(residual, [alpha / sample_count, lam])

        if not verbose or epoch % 100 != 0:
            continue
        # The loss is taken from the output stored by the forward pass above.
        train_loss = regression_net.mse_cost(y_predicted=regression_net.layers[-1].output,
                                             y_actual=targets_train)
        print("Epoch {:>3}\t Training loss: {:>5.3f}".format(epoch, train_loss))

    test_loss = regression_net.mse_cost(y_predicted=regression_net.feed_forward(inputs_test),
                                        y_actual=targets_test)
    print("Training complete. Testing loss: {:>5.3f}".format(test_loss))


def train_classifier_model(classifier, train, valid, test, alpha, batch_size, epochs,
                           lam=0.0, dropout_model=False, verbose=False):
    """
    Train a classifier with mini-batch gradient descent.

    train, valid and test are dicts with 'x' (inputs), 'y_' (one-hot labels) and
    'y_as_int' (integer labels). The training data is processed in consecutive
    batches of batch_size; a trailing partial batch is skipped. Each batch is
    backpropagated with a step of alpha / number of batches and L2 factor lam.

    If dropout_model is True, dropout is switched on for the forward/backward
    pass of every batch and switched off again afterwards.

    With verbose=True the mean training loss and the validation accuracy are
    printed after every epoch; testing loss and accuracy are always printed at
    the end.

    Raises ValueError if batch_size is larger than the training set, since no
    batch could be formed and nothing would be trained.
    """
    print("Training network with alpha={}, lambda={}, batch size={} for {} epochs...".format(
          alpha, lam, batch_size, epochs))

    x_training, y_training_one_hot = train['x'], train['y_']
    x_validation, y_validation_int = valid['x'], valid['y_as_int']
    x_testing, y_testing_int, y_testing_one_hot = test['x'], test['y_as_int'], test['y_']

    num_batches = len(x_training) // batch_size
    if num_batches == 0:
        raise ValueError("batch_size ({}) exceeds the number of training samples ({})".format(
            batch_size, len(x_training)))

    for e in range(1, epochs + 1):
        training_loss = 0.0
        for j in range(num_batches):
            start_index = j * batch_size
            end_index = start_index + batch_size
            x = x_training[start_index: end_index]
            y_ = y_training_one_hot[start_index: end_index]

            if dropout_model:
                set_dropout_boolean(classifier, True)

            # delta_y becomes the output-side deltas of the last layer.
            delta_y = y_ - classifier.feed_forward(x)
            classifier.feed_backward(delta_y, (alpha / num_batches, lam))

            if dropout_model:
                set_dropout_boolean(classifier, False)

            # Uses the output stored by the forward pass of this batch.
            training_loss += classifier.cross_entropy_cost(y_predicted=classifier.layers[-1].output, y_actual=y_)

        if verbose:
            print("Epoch {:>3}\t Training loss: {:>5.3f}\t Validation acc: {:>5.3f}".format
                  (e, training_loss / num_batches, classifier.accuracy(x_validation, y_validation_int)))

    print("Training complete. Testing loss: {:>5.3f} \t Testing accuracy: {:>5.3f}".format
          (classifier.cross_entropy_cost(y_predicted=classifier.feed_forward(x_testing), y_actual=y_testing_one_hot),
           classifier.accuracy(x_testing, y_testing_int)))


In [15]:
def make_classifier(layer_sizes):
    """
    Build a Classifier from a list of fully connected layer sizes.

    Every fully connected layer except the last is followed by a sigmoid
    activation layer; the last one is followed by a softmax layer.
    For an MNIST network layer sizes might be something like [784, 150, 25, 10].
    """
    stack = []
    # Consecutive (fan-in, fan-out) pairs for all hidden layers.
    for fan_in, fan_out in zip(layer_sizes[:-2], layer_sizes[1:]):
        stack.append(FullyConnectedLayer(fan_in, fan_out))
        stack.append(SigmoidLayer())
    stack.append(FullyConnectedLayer(layer_sizes[-2], layer_sizes[-1]))
    stack.append(SoftmaxLayer())
    return Classifier(stack)


def make_lrelu_classifier(layer_sizes):
    """
    Build a Classifier from a list of fully connected layer sizes.

    Every fully connected layer except the last is initialized for relu and
    followed by a leaky ReLU activation layer; the last one is followed by a
    softmax layer.
    For an MNIST network layer sizes might be something like [784, 150, 25, 10].
    """
    stack = []
    # Consecutive (fan-in, fan-out) pairs for all hidden layers.
    for fan_in, fan_out in zip(layer_sizes[:-2], layer_sizes[1:]):
        stack.append(FullyConnectedLayer(fan_in, fan_out, relu=True))
        stack.append(LReLULayer())
    stack.append(FullyConnectedLayer(layer_sizes[-2], layer_sizes[-1]))
    stack.append(SoftmaxLayer())
    return Classifier(stack)


def make_lrelu_classifier_with_dropout(layer_sizes, keep_prob):
    """
    Build a Classifier from a list of layer sizes, using dropout layers.

    Every layer is a FullyConnectedLayerWithDropout with the given keep_prob.
    All but the last are initialized for relu and followed by a leaky ReLU
    activation layer; the last one is followed by a softmax layer.
    For an MNIST network layer sizes might be something like [784, 150, 25, 10].
    """
    stack = []
    # Consecutive (fan-in, fan-out) pairs for all hidden layers.
    for fan_in, fan_out in zip(layer_sizes[:-2], layer_sizes[1:]):
        stack.append(FullyConnectedLayerWithDropout(fan_in, fan_out, keep_prob, relu=True))
        stack.append(LReLULayer())
    stack.append(FullyConnectedLayerWithDropout(layer_sizes[-2], layer_sizes[-1], keep_prob))
    stack.append(SoftmaxLayer())
    return Classifier(stack)


def set_dropout_boolean(network, dropout_boolean):
    """
    Set dropout_on to dropout_boolean on every layer of the network whose
    class is named 'FullyConnectedLayerWithDropout'. Other layers are untouched.
    """
    dropout_layers = (candidate for candidate in network.layers
                      if type(candidate).__name__ == 'FullyConnectedLayerWithDropout')
    for dropout_layer in dropout_layers:
        dropout_layer.dropout_on = dropout_boolean


def old_make_cnn_classifier():
    """
    Draft of a CNN classifier with a fixed architecture: two padded stride-2
    5x5 convolutions (32 and 64 channels), a bridge to a 3136-wide vector, and
    two dropout fully connected layers (3136 -> 180 -> 10) ending in softmax.
    The layer list is also printed verbatim.
    """
    conv_part = [ConvolutionLayer(channels_out=32, channels_in=1, window_size=5, stride=2, pad=True),
                 LReLULayer(),
                 ConvolutionLayer(channels_out=64, channels_in=32, window_size=5, stride=2, pad=True),
                 LReLULayer()]
    dense_part = [ConvolutionFullyConnectedBridge(64, 7, 7),
                  FullyConnectedLayerWithDropout(3136, 180, keep_prob=0.5),
                  LReLULayer(),
                  FullyConnectedLayerWithDropout(180, 10, keep_prob=0.5),
                  SoftmaxLayer()]

    # Printed description of the architecture; kept exactly as a literal.
    description = """[ConvolutionLayer(channels_out=32, channels_in=1, window_size=5, stride=2, pad=True),
              LReLULayer(),
              ConvolutionLayer(channels_out=64, channels_in=32, window_size=5, stride=2, pad=True),
              LReLULayer(),
              ConvolutionFullyConnectedBridge(64, 7, 7),
              FullyConnectedLayerWithDropout(3136, 180, keep_prob=0.5),
              LReLULayer(),
              FullyConnectedLayerWithDropout(180, 10, keep_prob=0.5),
              SoftmaxLayer()]"""
    print(description)
    return Classifier(conv_part + dense_part)

In [16]:
def make_cnn_classifier(input_shape, conv_layer_list, fully_connected_layer_list=None, keep_prob=0.5):
    """
    Create a CNN classifier and print a summary of its layers.

    input_shape: tuple of (depth, height, width) which is (1, 28, 28) for MNIST
    conv_layer_list: list of layer parameters in the form (window_size, stride, out_channels, padded_boolean);
      every convolution layer is followed by a leaky ReLU layer
    fully_connected_layer_list: list of optional additional fully connected layer sizes, where a list [1000, 200]
      would result in two additional layers after the convolution layers, for a total of three layers. These would be
      ? to 1000, 1000 to 200, and 200 to 10, where the ? is determined by the output of the final convolutional layer.
    keep_prob: the keep probability for the dropout in the fully connected layers.

    The final fully connected layer always has 10 outputs and is followed by softmax.
    """
    hidden_sizes = [] if fully_connected_layer_list is None else fully_connected_layer_list
    conv_message = "\tConvolution layer. Window size: ({}, {})\tStride: ({}, {})\tOutput channels: {:3}\tPadding: {}"
    dense_message = "\tFully connected layer. {} to {} with keep probability {}"

    def conv_output_size(size, window, step, padded):
        # Mirrors the size bookkeeping of the convolution layer: optional
        # padding first, then the number of window positions along the axis.
        if padded:
            size += window - 1 - (size - 1) % step
        return (size - window) // step + 1

    depth, height, width = input_shape
    stack = []

    print("Creating CNN with the following layers:")

    for win_size, stride, out_chan, pad_bool in conv_layer_list:
        stack.append(ConvolutionLayer(channels_out=out_chan, channels_in=depth, window_size=win_size,
                                      stride=stride, pad=pad_bool))
        print(conv_message.format(win_size, win_size, stride, stride, out_chan, pad_bool))
        stack.append(LReLULayer())

        depth = out_chan
        height = conv_output_size(height, win_size, stride, pad_bool)
        width = conv_output_size(width, win_size, stride, pad_bool)

    stack.append(ConvolutionFullyConnectedBridge(depth, height, width))

    # Width of the flattened convolution output feeding the first dense layer.
    fan_in = depth * height * width

    for fan_out in hidden_sizes:
        stack.append(FullyConnectedLayerWithDropout(fan_in, fan_out, keep_prob=keep_prob))
        print(dense_message.format(fan_in, fan_out, keep_prob))
        fan_in = fan_out
        stack.append(LReLULayer())

    stack.append(FullyConnectedLayerWithDropout(fan_in, 10, keep_prob=keep_prob))
    print(dense_message.format(fan_in, 10, keep_prob))
    stack.append(SoftmaxLayer())

    return Classifier(stack)

In [17]:
# Load MNIST and split it into training / validation / testing parts
# (10% validation, 10% testing). flat=False reshapes every image to
# (1, 28, 28) so it can be fed to the convolution layers.
training, validation, testing = import_and_prepare_mnist_data(0.1, 0.1, flat=False)

In [None]:
# Each convolution entry is (window_size, stride, out_channels, padded).
conv_layer_parameters = [(5, 2, 64, True), (5, 2, 128, True)]
# One extra fully connected layer of width 400 before the 10-way output layer.
fully_connected_parameters = [400]

cnn_classifier = make_cnn_classifier(input_shape=(1, 28, 28),
                                     conv_layer_list=conv_layer_parameters,
                                     fully_connected_layer_list=fully_connected_parameters)

print("Classifier created")

train_classifier_model(cnn_classifier, training, validation, testing,
                       alpha=1.0,
                       batch_size=64,
                       epochs=50,
                       lam=0.01,
                       dropout_model=True,
                       verbose=True)

Creating CNN with the following layers:
	Convolution layer. Window size: (5, 5)	Stride: (2, 2)	Output channels:  64	Padding: True
	Convolution layer. Window size: (5, 5)	Stride: (2, 2)	Output channels: 128	Padding: True
	Fully connected layer. 6272 to 400 with keep probability 0.5
	Fully connected layer. 400 to 10 with keep probability 0.5
Classifier created
Training network with alpha=1.0, lambda=0.01, batch size=64 for 50 epochs...
