# Backpropagation

## MulLayer

In [1]:
class MulLayer:
    """Multiplication node of a computational graph.

    The forward pass caches both operands, because the gradient with respect
    to each input is the upstream gradient times the *other* input.
    """

    def __init__(self):
        # Operands of the most recent forward pass (None until forward runs).
        self.x = None
        self.y = None

    def forward(self, x, y):
        """Store ``x`` and ``y`` for the backward pass and return ``x * y``."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        ``dx`` is ``dout`` times the cached ``y``; ``dy`` is ``dout`` times
        the cached ``x``.
        """
        return dout * self.y, dout * self.x

## Buy Apple

In [3]:
# Inputs: unit price, quantity, and the tax multiplier.
apple, apple_num, tax = 100, 2, 1.1

# Layers: one multiplication node for price * quantity, one for applying tax.
mul_apple_layer, mul_tax_layer = MulLayer(), MulLayer()

# Forward: (apple * apple_num) first, then multiply the subtotal by tax.
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

# Show the subtotal and the taxed total, one per line.
print(apple_price, price, sep='\n')

200
220.00000000000003


In [5]:
# Backward: seed the pass with d(price)/d(price) = 1 and visit the layers in
# the reverse of the forward order (tax layer first, then the apple layer).
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
# The apple layer was fed (apple, apple_num), so its gradients come back in
# that same order.
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(f'{dapple} {dapple_num} {dtax}')

2.2 110.00000000000001 200


## AddLayer

In [9]:
class AddLayer:
    """Addition node of a computational graph.

    Nothing is cached in the forward pass: the backward pass only scales the
    upstream gradient by 1 for each input.
    """

    def __init__(self):
        """Create the layer; it holds no state."""

    def forward(self, x, y):
        """Return ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``, each equal to ``dout * 1``."""
        # Two separate products, so the two results are independent objects.
        return dout * 1, dout * 1

## Buy Apple Orange

In [12]:
# Inputs: quantities, unit prices, and the tax multiplier.
apple_num = 2
apple = 100
orange = 150
orange_num = 3
tax = 1.1

# Layers
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_fruit_layer = AddLayer()
mul_tax_layer = MulLayer()

# Forward: price each fruit, add the two subtotals, then apply tax.
apple_price = mul_apple_layer.forward(apple_num, apple)
orange_price = mul_orange_layer.forward(orange_num, orange)
# The subtotals are summed by the addition layer (the original called an
# undefined `mul_fruit_layer` here).
fruit_price = add_fruit_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(fruit_price, tax)

# Backward: seed with d(price)/d(price) = 1 and visit the layers in the
# reverse of the forward order.
dprice = 1
dfruit_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_fruit_layer.backward(dfruit_price)
# Each MulLayer returns gradients in the order its inputs were given to
# forward(): (quantity, unit price).
dapple_num, dapple = mul_apple_layer.backward(dapple_price)
dorange_num, dorange = mul_orange_layer.backward(dorange_price)

print(price)
print(dapple_num, dapple, dorange, dorange_num, dtax)

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


## Activation Function Layer Implementation

### ReLU

In [13]:
class Relu:
    """ReLU activation layer for array inputs.

    The forward pass records which entries were non-positive; the backward
    pass blocks the gradient at exactly those entries.
    """

    def __init__(self):
        # Boolean mask, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every entry ``<= 0`` replaced by 0.

        ``x`` must support ``.copy()`` and boolean-mask assignment (e.g. a
        NumPy array); it is not modified.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return ``dout`` with the masked entries set to 0.

        The result is a new array: ``dout`` itself is left untouched so the
        caller's gradient buffer is not silently overwritten.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

### Sigmoid

In [14]:
import numpy as np

class Sigmoid:
    """Sigmoid activation layer.

    The forward output is cached because the backward pass is expressed
    entirely in terms of it.
    """

    def __init__(self):
        # Output of the most recent forward pass (None until forward runs).
        self.out = None

    def forward(self, x):
        """Compute ``1 / (1 + exp(-x))``, cache it, and return it."""
        activated = 1 / (1 + np.exp(-x))
        self.out = activated
        return activated

    def backward(self, dout):
        """Return ``dout * out * (1 - out)`` using the cached output."""
        y = self.out
        return dout * y * (1 - y)

## Affine

In [15]:
class Affine:
    """Affine (fully connected) layer computing ``x . W + b``.

    Holds references to the weight ``W`` and bias ``b`` it is given, and
    stores their gradients in ``dW`` and ``db`` after each backward pass.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        # Input of the most recent forward pass; needed to compute dW.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        out = np.dot(x, self.W) + self.b

        return out

    def backward(self, dout):
        """Store ``dW`` and ``db`` and return the gradient w.r.t. the input.

        ``dout`` is the upstream gradient with the same shape as the forward
        output.
        """
        dx = np.dot(dout, self.W.T)
        # Gradient of the weights uses the cached input (self.x), transposed.
        self.dW = np.dot(self.x.T, dout)
        # The bias was added to every row, so its gradient sums over axis 0.
        self.db = np.sum(dout, axis=0)

        return dx

## Softmax-with-Loss

In [16]:
class SoftmaxWithLoss:
    """Output layer that chains ``softmax`` with ``cross_entropy_error``.

    Both helpers are expected to be in scope where this class is defined;
    they are not defined in this block.
    """

    def __init__(self):
        # Values from the most recent forward pass (None until forward runs).
        self.loss = None  # result of cross_entropy_error(y, t)
        self.y = None     # result of softmax(x)
        self.t = None     # target passed to forward()

    def forward(self, x, t):
        """Cache ``t`` and ``softmax(x)``, then return the loss between them."""
        self.t = t
        probs = softmax(x)
        self.y = probs
        self.loss = cross_entropy_error(probs, t)
        return self.loss

    def backward(self, dout=1):
        """Return ``(y - t) / batch_size`` from the cached forward values.

        ``batch_size`` is the first dimension of ``t``. ``dout`` is accepted
        for interface symmetry with the other layers but is not used.
        NOTE(review): the subtraction assumes ``t`` has the same shape as
        ``y`` (e.g. one-hot labels) — confirm against callers.
        """
        n_samples = self.t.shape[0]
        diff = self.y - self.t
        return diff / n_samples

## TwoLayerNet Implementation

In [2]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

In [5]:
class TwoLayerNet:
    """Two-layer network: Affine -> Relu -> Affine, with a SoftmaxWithLoss head.

    Parameters live in ``self.params`` under the keys ``'W1'``, ``'b1'``,
    ``'W2'`` and ``'b2'``; the same arrays are handed to the Affine layers.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and the layer stack.

        Weights are drawn from a standard normal distribution scaled by
        ``weight_init_std``; biases start at zero.
        """
        # initialize weight
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        # One bias per output unit of the layer. np.zeros takes a single shape
        # argument; a second positional argument would be read as the dtype.
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # initialize layers (OrderedDict keeps the forward order explicit)
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        # NOTE: the key is spelled 'Reulu1' in the original; kept unchanged so
        # any existing lookup by that key keeps working.
        self.layers['Reulu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in ``self.layers`` and return the result.

        The final SoftmaxWithLoss layer is not applied here.
        """
        for layer in self.layers.values():
            x = layer.forward(x)

        return x

    def loss(self, x, y_hat):
        """Return the loss of the network on inputs ``x`` with labels ``y_hat``."""
        y = self.predict(x)
        return self.lastLayer.forward(y, y_hat)

    def accuracy(self, x, y_hat):
        """Return the fraction of rows of ``x`` whose predicted class matches.

        ``y_hat`` may be a 1-D array of class indices or a 2-D array with one
        row per sample, which is reduced with ``argmax`` over axis 1.
        """
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if y_hat.ndim != 1:
            y_hat = np.argmax(y_hat, axis=1)

        accuracy = np.sum(y == y_hat) / len(y)

        return accuracy

    def numerical_gradient(self, x, y_hat):
        """Return parameter gradients computed by the module-level
        ``numerical_gradient`` helper.

        The result is a dict with the same keys as ``self.params``.
        """
        # The argument W is ignored: the loss is re-evaluated from the network
        # itself. Presumably the helper perturbs the parameter array it is
        # given in place — TODO confirm against common.gradient.
        loss_W = lambda W: self.loss(x, y_hat)

        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])

        return grads

    def gradient(self, x, y_hat):
        """Return parameter gradients computed by backpropagation.

        The result is a dict with the same keys as ``self.params``.
        """
        # forward (populates the caches each layer needs for backward)
        self.loss(x, y_hat)

        # backward: start at the loss layer, then walk the stack in reverse
        dout = 1
        dout = self.lastLayer.backward(dout)

        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Each Affine layer stored its own dW/db during the backward pass.
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db

        return grads

## Gradient Check

In [9]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load the data with the same options as the training run below:
# normalize=True and one_hot_label=True.
(x_train, y_train), (x_test, y_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first three samples are used for the check.
x_batch, y_batch = x_train[:3], y_train[:3]

# Compute the same gradients two ways: numerically and by backpropagation.
grad_numerical = network.numerical_gradient(x_batch, y_batch)
grad_backprop = network.gradient(x_batch, y_batch)

# Report the mean absolute difference per parameter.
for layer in grad_numerical:
    diff = np.average(np.abs(grad_backprop[layer] - grad_numerical[layer]))
    print(layer + ': ' + str(diff))

W1: 4.705272852647447e-10
b1: 3.1105161162287097e-09
W2: 5.765430520393014e-09
b2: 1.3952254441923495e-07


## Training Implementation

In [12]:
# Training procedure:
#
# 1. Read the data.
#
# 2. Select a random mini-batch from the dataset using a batch mask
#    (np.random.choice(train_size, batch_size)) and train on that mini-batch.
#
# 3. Forward pass: run the input through the layers, then use softmax to turn
#    the scores into normalized values for each class (the final prediction),
#    and compute the loss between that prediction and the y_train labels.
#
# 4. Backward pass: differentiate the loss at the last layer (this uses the
#    prediction and the label), then pass that gradient backward through the
#    layers, multiplying it by each layer's local derivative (the chain rule).
#
# 5. Update the parameters (weights and biases) of each layer by subtracting
#    the learning rate times the gradient.

import sys, os
sys.path.append(os.pardir)
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

(x_train, y_train), (x_test, y_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# History: loss after every iteration, accuracies once per epoch.
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Number of iterations per epoch, as a whole number. Integer division keeps
# the `i % iter_per_epoch == 0` check below firing at regular integer
# intervals even when train_size is not a multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    y_batch = y_train[batch_mask]

    # Gradients via backpropagation.
    grad = network.gradient(x_batch, y_batch)

    # Gradient-descent step. The in-place `-=` updates the existing arrays
    # rather than rebinding the dict entries.
    for param in ('W1', 'b1', 'W2', 'b2'):
        network.params[param] -= learning_rate * grad[param]

    # Loss on the same mini-batch, measured after the update.
    loss = network.loss(x_batch, y_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, y_train)
        test_acc = network.accuracy(x_test, y_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.12288333333333333 0.1245
0.9034833333333333 0.9065
0.9237666666666666 0.9289
0.9362333333333334 0.9376
0.9453166666666667 0.9432
0.95185 0.9502
0.9569333333333333 0.9533
0.9600166666666666 0.9568
0.96505 0.9596
0.9666833333333333 0.9612
0.9692833333333334 0.9646
0.97195 0.9658
0.9740666666666666 0.9672
0.97575 0.9687
0.9744833333333334 0.9676
0.9775333333333334 0.9684
0.9786 0.9705
