<div class="alert alert-block alert-success">
<b>Goal:</b> Train MNIST model by gradient-backpropagation without deep learning framework
</div>

<div class="alert alert-block alert-warning">
<b>Notice:</b> Only uses numpy for array calculation. Tensorflow is just for loading mnist data</div>

In [1]:
import numpy
from tensorflow import keras
# without ONE-HOT encoding or normalization
(x_train, y_train), (x_test,y_test) = keras.datasets.mnist.load_data()

<div class="alert alert-block alert-info">
<b>Global Hyperparameters:</b> Some variables that will remain constant throughout the code</div>

In [2]:
BATCH_SIZE = 128
LEARNING_RATE = 0.05
LEARNING_RATE_DECAY=0.5
NUMBER_OF_EPOACHES = 10

<div class="alert alert-block alert-info">
<b>Layer Classes:</b> Define classes for each layers, so we can edit model freely</div>

In [3]:
class Input_layer:
    def __init__(self, train_x):

        if len(train_x.shape) == 3:
            train_x = train_x.reshape(len(train_x), -1)

        self.VALUE = train_x
        self.SHAPE = train_x.shape
        self.TYPE = 'INPUT'

class Dense_layer:
    def __init__(self, node_num):
        self.NODE_SIZE = node_num

        self.WEIGHT = None
        self.BIAS = None

        self.SHAPE = ((node_num, None), (node_num))
        self.TYPE = 'DENSE'

    def __call__(self, input):
        self.WEIGHT = 0.1 * numpy.random.randn(input.shape[1], self.NODE_SIZE)
        self.BIAS = 0.1 * numpy.random.randn(1,self.NODE_SIZE)

class Activation_layer:
    def __init__(self, function):
        self.function = function
        self.TYPE = 'ACTIVATION'

<div class="alert alert-block alert-info">
<b>Model:</b> The forepropagation and backpropagation signal values for the layers are saved as dict items under corresponding key(layer name)</div>

<div class="alert alert-block alert-danger">
<b>Careful:</b> to keep it simple, there is one important assumption: <b>ONLY ONE SOFTMAX LAYER EXISTS IN ENTIRE LAYER AND IT ALWAYS COMES LAST.</b> The reason is because softmax combined with cross-entropy error results very simple and intuitive backpropagation result.
</div>

In [4]:
class Sequentaial_model:
    def __init__(self, *list_of_layers):

        self.model_layer = {} # layer weights
        self.signal_forward = {} # forepropagation
        self.signal_backward = {} # backpropagation

        self.dense_layer_cnt = 0
        self.activation_cnt = 0
        self.batch_size = BATCH_SIZE
        self.input_layer = None # just a imaginary
        
        for layer in list_of_layers:
            if layer.TYPE == 'INPUT':
                self.model_layer['input_layer'] = layer.VALUE
                self.input_layer = numpy.zeros((self.batch_size, layer.SHAPE[1]))

            elif layer.TYPE == 'DENSE':
                self.dense_layer_cnt += 1
                layer(self.input_layer)
                self.model_layer[f'dense_layer_{self.dense_layer_cnt}'] = [layer.WEIGHT, layer.BIAS]
                self.input_layer = numpy.zeros((self.batch_size, layer.NODE_SIZE))

            elif layer.TYPE == 'ACTIVATION':
                self.activation_cnt += 1
                self.model_layer[f'activation_layer_{layer.function}'] = layer.function

    def predict(self, input):

        self.signal_in = input
        self.signal_depth = 0
        for key in self.model_layer.keys():
            if 'input' in key:
                self.signal_forward[f'{key}'] = input
                self.signal_in = self.signal_forward[f'{key}']
            elif 'dense' in key:
                self.signal_depth += 1
                self.signal_forward[f'{key}'] = numpy.matmul(self.signal_in, self.model_layer[key][0]) + self.model_layer[key][1]
                self.signal_in = self.signal_forward[f'{key}']

            elif 'activation' in key:
                self.signal_depth += 1
                if 'Relu' in key:
                    self.signal_forward[f'{key}'] = numpy.maximum(0, self.signal_in)
                    self.signal_in = self.signal_forward[f'{key}']
                elif 'Softmax' in key:
                    
                    self.signal_in = self.signal_in.T
                    self.signal_in = self.signal_in - numpy.max(self.signal_in, axis=0)
                    y = numpy.exp(self.signal_in) / numpy.sum(numpy.exp(self.signal_in), axis=0)
                    self.signal_forward[f'{key}'] = y.T

                    self.signal_in = self.signal_forward[f'{key}']

        return self.signal_in

    def calc_cross_entropy_error(self,prediction,label):
        assert BATCH_SIZE == len(label)
        loss  = 0
        
        for i in range(BATCH_SIZE):
            loss += 1*numpy.log10(prediction[i][label[i]]+1e-6)

        loss = -1*loss/BATCH_SIZE
        
        return loss

    def back_propagation(self, prediction, label):

        # LETS ASSUME THE ERROR IS CROSS_ENTROPY_ERROR
        # AND THE LAST ACTIVATION IS ALWAYS SOFTMAX LAYER

            # ONE-HOT
        t = numpy.zeros_like(prediction)
        for row in range(BATCH_SIZE):
            t[row][label[row]] = 1

        y = numpy.array(prediction - t)

        # we will neglect the last soft-max layer by calling layers except the last layer
        for layer_lv in range(len(self.model_layer.keys())-2,-1,-1):
            current_layer = list(self.model_layer.keys())[layer_lv]

            if 'input' in current_layer:
                # INPUT IS NOT UPDATED BY GRADIENT
                pass

            elif 'dense' in current_layer:
                self.signal_backward[current_layer] = [[],[]] # one for weight, the other for bias

                upper_layer = list(self.model_layer.keys())[layer_lv-1]
                
                x = self.signal_forward[upper_layer]
                
                dy = numpy.matmul(numpy.transpose(x), y)

                self.signal_backward[current_layer][0] = dy
                self.signal_backward[current_layer][1] = numpy.sum(y,0)
                self.signal_backward[current_layer][1] = numpy.expand_dims(self.signal_backward[current_layer][1], axis=0)

                y  = numpy.matmul(y,numpy.transpose(self.model_layer[current_layer][0]))
                
            elif 'activation' in current_layer:
                if 'Relu' in current_layer:
                    self.signal_backward[current_layer] = []

                    upper_layer = list(self.model_layer.keys())[layer_lv-1]

                    x = self.signal_forward[upper_layer]
                    _x = numpy.zeros_like(x)

                    dy = y*numpy.not_equal(x,_x)
                   
                    self.signal_backward[current_layer] = dy

                    y = dy

    def update_gradient(self, lr=LEARNING_RATE):
        
        for key in self.model_layer.keys():
            if 'dense' in key:
                self.model_layer[key][0] = self.model_layer[key][0] - lr*self.signal_backward[key][0] # weight update
                self.model_layer[key][1] = self.model_layer[key][1] - lr*self.signal_backward[key][1] # bias update
            
    def fit(self, train_x, train_y, test_x, test_y, batch_size, n_epoach):

        # FLATTEN
        train_x = train_x.reshape(len(train_x),-1)
        test_x = test_x.reshape(len(test_x),-1)

        iteration = len(train_x)//batch_size
        test_iteration = len(test_x)//batch_size
        
        test_set_sample_index = numpy.random.randint(len(test_x))
        
        for nth_epoach in range(n_epoach):
            loss_list = list()
            test_acc = list()
            print(f"------------------------ epoch : {nth_epoach+1} --------------------------------",end="\n")
            for nth_iteration in range(iteration):
                print(f"iteration : {nth_iteration}/{iteration}",end='\r')
                #normalization and batch split
                train_x_sub = train_x[nth_iteration*batch_size:(nth_iteration+1)*batch_size]/255.0
                train_y_sub = train_y[nth_iteration*batch_size:(nth_iteration+1)*batch_size]
                loss_list.append([self.calc_cross_entropy_error(self.predict(train_x_sub), train_y_sub)])
                self.back_propagation(self.predict(train_x_sub),train_y_sub)
                self.update_gradient(lr=LEARNING_RATE*LEARNING_RATE_DECAY**nth_epoach)
            
            for nth_t_iteration in range(test_iteration):
                test_x_sub = test_x[nth_t_iteration*batch_size:(nth_t_iteration+1)*batch_size]/255.0
                test_y_sub = test_y[nth_t_iteration*batch_size:(nth_t_iteration+1)*batch_size]
                test_acc.append([self.calc_acc(test_x_sub, test_y_sub)])
                
            print(f"training_loss : {round(numpy.mean(loss_list),4)}")
            print(f"validation_acc : {round(numpy.mean(test_acc),4)}",end='\n')

    def calc_acc(self,valid_x, valid_y):
        y= self.predict(valid_x)
        
        prediction = numpy.argmax(y,1)
        mask = numpy.ones_like(prediction)
        truth = numpy.equal(prediction,valid_y)
        
        acc = numpy.sum((mask*truth),0)/BATCH_SIZE
        # print("y : ", y)
        # print("prediction : ", prediction)
        # print("answer : ", valid_y)
        # print("check: ", mask*truth)
        # assert False
        return acc

<div class="alert alert-block alert-info">
<b>Build:</b> The depth and the size of dense layers and relu layers can be modified. Just make sure Input_layer comes first, and the ONLY softmax layer comes last</div>

In [5]:
model = Sequentaial_model(
    Input_layer(x_train),
    Dense_layer(128),
    Activation_layer('Relu'),
    Dense_layer(10),
    Activation_layer('Softmax')
)

<div class="alert alert-block alert-info">
<b>Tip:</b> There is no appropriate weight initialization, batch normalization nor gradient optimization in this model. Try different variables and layer combinations to necessity of these important features that are mentioned.(ex: large learning rate, small node size)</div>

In [6]:
model.fit(x_train, y_train, x_test,y_test, batch_size=BATCH_SIZE, n_epoach=NUMBER_OF_EPOACHES)

------------------------ epoch : 1 --------------------------------
training_loss : 3.9805
validation_acc : 0.1032
------------------------ epoch : 2 --------------------------------
training_loss : 2.3923
validation_acc : 0.4111
------------------------ epoch : 3 --------------------------------
training_loss : 1.5182
validation_acc : 0.6773
------------------------ epoch : 4 --------------------------------
training_loss : 1.8048
validation_acc : 0.6461
------------------------ epoch : 5 --------------------------------
training_loss : 1.476
validation_acc : 0.7744
------------------------ epoch : 6 --------------------------------
training_loss : 1.4574
validation_acc : 0.7778
------------------------ epoch : 7 --------------------------------
training_loss : 1.4142
validation_acc : 0.7778
------------------------ epoch : 8 --------------------------------
training_loss : 1.4062
validation_acc : 0.7911
------------------------ epoch : 9 --------------------------------
training_loss