## Multilayer perceptron. Stochastic gradient descent


In [2]:
import random

import numpy as np
from mnist import MNIST

# Directory with the MNIST sample files (machine-specific absolute path).
_SAMPLES_DIR = 'C:\\Users\\ivan_\\PycharmProjects\\optimization\\samples'

# Loader object bound to that directory.
mndata = MNIST(_SAMPLES_DIR)

# `load_testing()` yields a pair that is unpacked into the pictures and
# their labels; the recorded cell output shows 10000 pictures.
images, labels = mndata.load_testing()
print(len(images))

10000


### Класс перцептрона
Вход            -   столбец $ W_{in} \;[ 784 \times 1] $, $ 784 = 28 \times 28 $ -- размер исходного изображения  
Первый слой     -   матрица $ W_{1} \; [16 \times 784] $  
Второй слой     -   матрица $ W_{2} \; [10 \times 16] $  
Выходной слой   -   столбец $ W_{out} \; [10 \times 1] $  
В выходном слое в элементе $W_{out}[i], i \in [0,9]$ содержится оценка вероятности того, что на вход подано изображение цифры $i$  
В качестве функции активации используется сигмоида $\sigma (x) = \frac{1}{1 + e^{-x}}$  
Для нормализации выходных значений используется softmax-преобразование: $p[i] = \frac{e^{y[i]}}{T}, \; T = \sum\limits_{k = 0}^{9}{e^{y[k]}}, \; i \in [0,9]$, где $y$ — вектор значений выходного слоя до нормализации

In [None]:
class Perceptron:
    """Perceptron with one hidden layer, trained one picture at a time.

    Forward pass (all vectors are 1-D numpy arrays)::

        M     = w_1 @ w_in + b_1      # hidden pre-activations
        A     = act_func(M)           # hidden activations
        N     = w_2 @ A + b_2         # output scores
        w_out = softmax(N)            # estimated class probabilities

    The loss minimised by the gradient step is the squared error between
    the softmax output and the one-hot encoding of the correct answer::

        E = 1/2 * sum_k (w_out[k] - [k == ans]) ** 2

    Matrix sizes:

        w_in:  [w_in_dim]
        w_1:   [w_1_dim   x w_in_dim]
        b_1:   [w_1_dim]
        w_2:   [w_out_dim x w_1_dim]
        b_2:   [w_out_dim]
        w_out: [w_out_dim]

    Attributes:
        w_in: input vector (pixels of the current picture).
        w_out: output vector; ``w_out[i]`` estimates the probability that
            the picture shows class ``i``.
        ans: correct answer for the current picture (``-1`` before the
            first training call).
        layers: ``[w_1, b_1, w_2, b_2]`` -- the same array objects as the
            attributes of those names, so in-place updates are shared.
        derivatives: ``[w_1_derivative, b_1_derivative, w_2_derivative,
            b_2_derivative]`` -- gradient of the loss for the last picture
            passed to ``teach_with_picture``.
        eta: learning rate used by ``update_weights``.
        test_num: number of pictures processed by ``teach_with_picture``.
        epoch_num: number of completed passes over the data set; this
            class never changes it, the training loop is expected to.
        error: loss value of every processed picture, in order.
    """

    @staticmethod
    def act_func(x):
        """Default activation function: the sigmoid ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

    def __init__(self, w_in_dim, w_1_dim, w_out_dim, eta=0.1, seed=None):
        """Create a network with randomly initialised weights.

        Args:
            w_in_dim: size of the input vector.
            w_1_dim: number of neurons in the hidden layer.
            w_out_dim: size of the output vector (number of classes).
            eta: learning rate for ``update_weights``.
            seed: optional seed for the weight initialisation, for
                reproducible runs.
        """
        self.w_in_dim = w_in_dim
        self.w_1_dim = w_1_dim
        self.w_out_dim = w_out_dim
        self.eta = eta

        self.w_in = np.ones(w_in_dim)
        self.w_out = np.zeros(w_out_dim)
        self.ans = -1

        # Identical initial weights would give every hidden neuron the same
        # gradient forever, so the weights are drawn at random.  The range
        # shrinks with the fan-in to keep the sigmoid out of saturation.
        rng = np.random.default_rng(seed)
        limit_1 = 1 / np.sqrt(w_in_dim)
        limit_2 = 1 / np.sqrt(w_1_dim)
        self.w_1 = rng.uniform(-limit_1, limit_1, size=(w_1_dim, w_in_dim))
        self.b_1 = np.zeros(w_1_dim)
        self.w_2 = rng.uniform(-limit_2, limit_2, size=(w_out_dim, w_1_dim))
        self.b_2 = np.zeros(w_out_dim)
        self.layers = [self.w_1, self.b_1, self.w_2, self.b_2]

        self.w_1_derivative = np.zeros((w_1_dim, w_in_dim))
        self.b_1_derivative = np.zeros(w_1_dim)
        self.w_2_derivative = np.zeros((w_out_dim, w_1_dim))
        self.b_2_derivative = np.zeros(w_out_dim)
        self.derivatives = [self.w_1_derivative, self.b_1_derivative,
                            self.w_2_derivative, self.b_2_derivative]

        self.test_num = 0
        self.epoch_num = 0
        # Per-instance list: a class-level list would be shared by all networks.
        self.error = []

    def get_activation_func(self):
        """Return the activation function currently in use."""
        return self.act_func

    def set_activation_func(self, func):
        """Replace the activation function used in the forward pass.

        Note that ``teach_with_picture`` differentiates the hidden layer as
        a sigmoid, so gradients are only correct for the default activation.
        """
        self.act_func = func

    def softmax(self):
        """Normalise ``self.w_out`` in place into a probability vector."""
        # Subtracting the maximum leaves the result unchanged mathematically
        # but prevents overflow in exp() for large scores.
        exps = np.exp(self.w_out - np.max(self.w_out))
        self.w_out = exps / np.sum(exps)

    def _forward(self):
        """Run the forward pass on ``self.w_in``.

        Stores the class probabilities in ``self.w_out`` and returns the
        hidden-layer activations.
        """
        hidden = self.act_func(self.w_1 @ self.w_in + self.b_1)
        self.w_out = self.w_2 @ hidden + self.b_2
        self.softmax()
        return hidden

    def calc_result(self):
        """Compute ``self.w_out`` for the picture stored in ``self.w_in``."""
        self._forward()

    def teach_with_picture(self, w_in, ans):
        """Process one labelled picture and compute the loss gradient.

        Runs the forward pass, appends the loss to ``self.error``,
        increments ``self.test_num`` and stores the gradient in the
        ``*_derivative`` attributes and in ``self.derivatives``.  The
        weights themselves are not changed; call ``update_weights`` for that.

        Args:
            w_in: picture as a sequence of ``w_in_dim`` numbers.
            ans: index of the correct class, ``0 <= ans < w_out_dim``.

        Raises:
            ValueError: if ``w_in`` does not hold ``w_in_dim`` values.
            IndexError: if ``ans`` is outside ``[0, w_out_dim)``.
        """
        picture = np.asarray(w_in, dtype=float).reshape(-1)
        if picture.size != self.w_in_dim:
            raise ValueError(
                f"expected {self.w_in_dim} input values, got {picture.size}")
        if not 0 <= ans < self.w_out_dim:
            raise IndexError(
                f"answer {ans} is outside [0, {self.w_out_dim})")

        self.w_in = picture
        self.ans = ans

        hidden = self._forward()
        probs = self.w_out

        indicators = np.zeros(self.w_out_dim)
        indicators[ans] = 1
        diff = probs - indicators
        self.error.append(0.5 * float(diff @ diff))
        self.test_num += 1

        # dE/dN[j] = sum_k diff[k] * dp[k]/dN[j], where the softmax Jacobian
        # is dp[k]/dN[j] = p[k] * ([k == j] - p[j]).
        weighted = diff * probs
        delta_out = weighted - probs * np.sum(weighted)

        # Back-propagate through w_2 and the sigmoid, whose derivative
        # expressed through its own output is A * (1 - A).
        delta_hidden = (self.w_2.T @ delta_out) * hidden * (1 - hidden)

        self.b_2_derivative = delta_out
        self.w_2_derivative = np.outer(delta_out, hidden)
        self.b_1_derivative = delta_hidden
        self.w_1_derivative = np.outer(delta_hidden, picture)
        self.derivatives = [self.w_1_derivative, self.b_1_derivative,
                            self.w_2_derivative, self.b_2_derivative]

    def get_derivative(self, matrix_num, i, j=0):
        """Return one partial derivative of the loss.

        Args:
            matrix_num: index into ``self.derivatives``
                (0 = w_1, 1 = b_1, 2 = w_2, 3 = b_2).
            i: row index.
            j: column index; ignored for the 1-D bias gradients.
        """
        grad = self.derivatives[matrix_num]
        if grad.ndim == 1:
            return grad[i]
        return grad[i][j]

    def update_weights(self):
        """Take one gradient-descent step of size ``self.eta``.

        The parameters are updated in place so that ``self.layers`` and the
        ``w_*`` / ``b_*`` attributes keep referring to the same arrays.
        """
        for layer, grad in zip(self.layers, self.derivatives):
            # Descent moves against the gradient, hence the subtraction.
            layer -= self.eta * grad