## Реализация собственного нейросетевого пакета для запуска и обучения нейронных сетей

Содержание:
1. Реализация прямого вывода нейронной сети
2. Реализация градиентов по входу и распространения градиента по сети
3. Реализация градиентов по параметрам и метода обратного распространения ошибки с обновлением параметров сети

В дальнейшем планируется реализация обучения сети со свёрточными слоями и с транспонированной свёрткой, а также дополнительного оптимизатора.

###  1. Реализация вывода собственной нейронной сети

1.1 Любой слой содержит как минимум три метода:
- конструктор
- прямой вывод
- обратный вывод, производные по входу и по параметрам

In [None]:
class Layer(object):
    """Base interface shared by every layer of the network.

    The base implementations are no-ops: ``forward``, ``grad_x`` and
    ``update_param`` return ``None`` and ``grad_param`` returns an empty
    list. Concrete layers override the methods they support.
    """

    def __init__(self):
        self.name = 'Layer'

    def forward(self, input_data):
        """Compute the layer output (not implemented in the base class)."""
        return None

    def backward(self, input_data):
        """Return ``[grad_x(input_data), grad_param(input_data)]``."""
        wrt_input = self.grad_x(input_data)
        wrt_params = self.grad_param(input_data)
        return [wrt_input, wrt_params]

    def grad_x(self, input_data):
        """Derivative of the output w.r.t. the input (not implemented here)."""
        return None

    def grad_param(self, input_data):
        """Derivatives w.r.t. the parameters; the base layer has none."""
        return list()

    def update_param(self, grads, learning_rate):
        """Apply a parameter update; nothing to update in the base layer."""
        return None


1.2 Ниже представлен интерфейс класса Network.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class Network(object):
    """Sequential container that chains layers and trains them with plain SGD.

    Args:
        layers: Ordered list of layer objects. Every layer must provide
            ``forward(x)`` and ``grad_x(x)``. Layers whose ``name`` is
            ``'Dense'`` must additionally provide ``grad_param(x)`` and
            ``update_param(grads, learning_rate)``.
        loss: Object exposing ``forward(pred, labels)`` and
            ``grad_x(pred, labels)``. It is needed by ``grad_x``,
            ``calculate_loss``, ``train_step`` and ``fit``.

    The gradient code treats every per-layer Jacobian returned by
    ``layer.grad_x`` as a 3-D array indexed (sample, output, input) and the
    loss gradient as a 2-D array indexed (sample, output).
    """

    def __init__(self, layers, loss=None):
        self.name = 'Network'
        self.layers = layers
        self.loss = loss

    @staticmethod
    def _pull_back(jacobian, upstream):
        """Multiply per-sample gradients by per-sample Jacobians.

        ``jacobian`` is indexed (sample, output, input) and ``upstream`` is
        indexed (sample, output); the result is indexed (sample, input).
        """
        upstream = np.asarray(upstream)
        # One batched row-vector @ matrix product replaces the per-sample loop.
        return np.matmul(upstream[:, np.newaxis, :], np.asarray(jacobian))[:, 0, :]

    def forward(self, input_data):
        """Alias of :meth:`predict`."""
        return self.predict(input_data)

    def grad_x(self, input_data, labels):
        """Return the derivative of the loss w.r.t. the network input.

        The loss gradient is pulled back through the layer Jacobians from the
        last layer to the first, so only (sample, features) arrays are kept
        instead of full input-to-layer Jacobians. With an empty layer list the
        loss gradient itself is returned.
        """
        jacobians, activations = self.grad_param(input_data, labels)
        grad = self.loss.grad_x(activations[-1], labels)
        for jacobian in reversed(jacobians):
            grad = self._pull_back(jacobian, grad)
        return grad

    def grad_param(self, input_data, labels):
        """Run a forward pass and collect what backpropagation needs.

        Despite its name, this returns the Jacobians w.r.t. each layer's
        *input* (``layer.grad_x``), not parameter gradients. ``labels`` is
        accepted for interface compatibility and is not used.

        Returns:
            ``(grads, outs)`` where ``grads[i]`` is ``layers[i].grad_x``
            evaluated at that layer's input and ``outs`` holds the network
            input followed by the output of every layer
            (``len(outs) == len(layers) + 1``).
        """
        grads = []
        curr_data = input_data
        outs = [curr_data]
        for layer in self.layers:
            # The Jacobian is evaluated at the layer input, before forwarding.
            grads.append(layer.grad_x(curr_data))
            curr_data = layer.forward(curr_data)
            outs.append(curr_data)
        return grads, outs

    def update(self, grad_list, learning_rate):
        """Call ``update_param`` on every layer with the matching gradients."""
        for idx, layer in enumerate(self.layers):
            layer.update_param(grad_list[idx], learning_rate)

    def predict(self, input_data):
        """Feed ``input_data`` through all layers and return the final output."""
        current_input = input_data
        for layer in self.layers:
            current_input = layer.forward(current_input)
        return current_input

    def calculate_loss(self, input_data, labels):
        """Return the loss of the network prediction against ``labels``."""
        return self.loss.forward(self.predict(input_data), labels)

    def train_step(self, input_data, labels, learning_rate=0.001):
        """Perform one backpropagation step on a single batch.

        Only layers named ``'Dense'`` are updated. Their weight gradient is
        the upstream gradient multiplied by the weight Jacobian from
        ``grad_param``; the bias gradient is the upstream gradient itself.
        All Jacobians come from the forward pass, so propagation uses the
        pre-update weights.
        """
        jacobians, activations = self.grad_param(input_data, labels)
        upstream = self.loss.grad_x(activations[-1], labels)

        for idx in reversed(range(len(self.layers))):
            layer = self.layers[idx]
            if layer.name == 'Dense':
                weight_jacobian = layer.grad_param(activations[idx])[0]
                weight_grad = self._pull_back(weight_jacobian, upstream)
                layer.update_param([weight_grad, upstream], learning_rate)
            if idx > 0:
                # Nothing consumes the gradient w.r.t. the network input,
                # so the pull-back through the first layer is skipped.
                upstream = self._pull_back(jacobians[idx], upstream)

    def fit(self, trainX, trainY, validation_split=0.25,
            batch_size=1, nb_epoch=1, learning_rate=0.01):
        """Train with mini-batch SGD and print validation accuracy per epoch.

        The data is split once with ``train_test_split`` (fixed
        ``random_state=42``). Only full batches are used: a trailing partial
        batch of the training split is skipped in every epoch.
        """
        train_x, val_x, train_y, val_y = train_test_split(
            trainX, trainY, test_size=validation_split, random_state=42)
        batches_per_epoch = len(train_x) // batch_size
        for epoch in range(nb_epoch):
            # Train one epoch.
            for step in tqdm(range(batches_per_epoch)):
                start = step * batch_size
                stop = start + batch_size
                self.train_step(train_x[start:stop], train_y[start:stop],
                                learning_rate)
            # Validate.
            val_accuracy = self.evaluate(val_x, val_y)
            print('%d epoch: val %.2f' % (epoch + 1, val_accuracy))

    def evaluate(self, testX, testY):
        """Return the share of samples whose arg-max class matches ``testY``."""
        y_pred = np.argmax(self.predict(testX), axis=1)
        y_true = np.argmax(testY, axis=1)
        return np.sum(y_pred == y_true) / len(y_true)

#### 1.1 Реализация метода forward для вычисления следующих слоёв:

- DenseLayer
- ReLU
- Softmax
- FlattenLayer
- MaxPooling

In [None]:
import numpy as np

In [None]:
class DenseLayer(Layer):
    """Fully connected layer computing ``input_data @ W + b``.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output features.
        W_init: Optional initial weights of shape ``(input_dim, output_dim)``.
            When omitted, weights are drawn from ``np.random.random``.
        b_init: Optional initial bias of shape ``(output_dim,)``. When
            omitted, the bias starts at zero.

    Each initial value is honoured independently of the other, and both are
    copied so that in-place updates never modify the caller's arrays.
    """

    def __init__(self, input_dim, output_dim, W_init=None, b_init=None):
        self.name = 'Dense'
        self.input_dim = input_dim
        self.output_dim = output_dim
        if W_init is None:
            self.W = np.random.random((input_dim, output_dim))
        else:
            self.W = self._owned_float_copy(W_init)
        if b_init is None:
            self.b = np.zeros(output_dim, 'float32')
        else:
            self.b = self._owned_float_copy(b_init)

    @staticmethod
    def _owned_float_copy(values):
        """Return a private floating-point copy of ``values``."""
        copied = np.array(values)
        if not np.issubdtype(copied.dtype, np.floating):
            # In-place ``-=`` with float gradients fails on integer arrays.
            copied = copied.astype(float)
        return copied

    def forward(self, input_data):
        """Return ``input_data @ W + b``."""
        return np.dot(input_data, self.W) + self.b

    def grad_x(self, input_data):
        """Return d(output)/d(input), shape ``(batch, output_dim, input_dim)``.

        The Jacobian equals ``W.T`` for every sample.
        """
        out = np.empty((input_data.shape[0], self.output_dim, self.input_dim))
        out[:] = np.transpose(self.W)
        return out

    def grad_b(self, input_data):
        """Return d(output)/d(b): an identity matrix repeated for each sample."""
        size = self.b.shape[0]
        out = np.empty((input_data.shape[0], size, size))
        out[:] = np.eye(size)
        return out

    def grad_W(self, input_data):
        """Return d(output)/d(W) with ``W`` flattened in row-major order.

        The result has shape ``(batch, output_dim, input_dim * output_dim)``.
        Output ``j`` depends on ``W[k, j]`` only, so entry
        ``[n, j, k * output_dim + j]`` equals ``input_data[n, k]`` and every
        other entry is zero.
        """
        samples = np.asarray(input_data, dtype=float)
        selector = np.eye(self.output_dim)
        # Axes: (sample n, output j, input k, output l); non-zero only for j == l.
        jac = (samples[:, np.newaxis, :, np.newaxis]
               * selector[np.newaxis, :, np.newaxis, :])
        return jac.reshape(samples.shape[0], self.output_dim,
                           samples.shape[1] * self.output_dim)

    def update_W(self, grad, learning_rate):
        """Gradient-descent step on ``W`` using the batch-mean gradient."""
        self.W -= learning_rate * np.mean(grad, axis=0).reshape(self.W.shape)

    def update_b(self, grad, learning_rate):
        """Gradient-descent step on ``b`` using the batch-mean gradient."""
        self.b -= learning_rate * np.mean(grad, axis=0)

    def update_param(self, params_grad, learning_rate):
        """Update both parameters; ``params_grad`` is ``[grad_W, grad_b]``."""
        self.update_W(params_grad[0], learning_rate)
        self.update_b(params_grad[1], learning_rate)

    def grad_param(self, input_data):
        """Return ``[grad_W(input_data), grad_b(input_data)]``."""
        return [self.grad_W(input_data), self.grad_b(input_data)]

class ReLU(Layer):
    """Element-wise rectified linear unit, ``max(0, x)``."""

    def __init__(self):
        self.name = 'ReLU'

    def forward(self, input_data):
        """Return ``input_data`` with negative entries replaced by zero."""
        return np.maximum(0, input_data)

    def grad_x(self, input_data):
        """Return the per-sample Jacobian, shape ``(batch, n, n)``.

        Each Jacobian is diagonal: 1 where the input is strictly positive,
        0 elsewhere. ``input_data`` is indexed as (batch, n).
        """
        active = (input_data > 0).astype('float32')
        n_samples, width = active.shape[0], active.shape[1]
        jacobian = np.zeros((n_samples, width, width))
        diagonal = np.arange(width)
        # Write the 0/1 mask onto the main diagonal of every sample at once.
        jacobian[:, diagonal, diagonal] = active
        return jacobian


class Softmax(Layer):
    """Row-wise softmax over axis 1."""

    def __init__(self):
        self.name = 'Softmax'

    def forward(self, input_data):
        """Return softmax probabilities for every row of ``input_data``.

        The row maximum is subtracted before exponentiation to avoid
        overflow; this does not change the result.
        """
        exps = np.exp(input_data - np.max(input_data, axis=1, keepdims=True))
        return exps / np.sum(exps, axis=1, keepdims=True)

    def grad_x(self, input_data):
        """Return the per-sample Jacobian ``diag(y) - y y^T``.

        The result has shape ``(batch, n, n)`` where ``y`` is the softmax
        output of the corresponding row.
        """
        probs = self.forward(input_data)
        n_samples, n_classes = probs.shape[0], probs.shape[1]
        identity = np.eye(n_classes, dtype=probs.dtype)
        diag_part = probs[:, :, np.newaxis] * identity
        outer_part = probs[:, :, np.newaxis] * probs[:, np.newaxis, :]
        jacobian = np.zeros((n_samples, n_classes, n_classes))
        jacobian[...] = diag_part - outer_part
        return jacobian



class FlattenLayer(Layer):
    """Flatten every sample of a batch into a single feature vector."""

    def __init__(self):
        self.name = 'Flatten'

    def forward(self, input_data):
        """Return ``input_data`` reshaped to ``(batch, features)``.

        All trailing axes are flattened, so every channel of a
        ``(batch, channels, height, width)`` tensor is kept. For
        single-channel input the result is the same as taking channel 0
        only. The result is a copy, not a view of ``input_data``.
        """
        batch = np.array(input_data)
        features = int(np.prod(batch.shape[1:]))
        return batch.reshape(batch.shape[0], features)

    def grad_x(self, input_data):
        """Not implemented yet; returns ``None``."""
        return None

class MaxPooling(Layer):
    """2x2 max pooling with stride 2 over the last two axes."""

    def __init__(self):
        self.name = 'MaxPooling'

    def forward(self, input_data):
        """Pool a ``(batch, channels, height, width)`` tensor.

        Returns a float64 array of shape
        ``(batch, channels, height // 2, width // 2)``. When the height or
        width is odd, the trailing row or column is ignored. The maximum is
        taken over the actual window values, with no sentinel lower bound.
        """
        data = np.asarray(input_data)
        batch, channels, height, width = data.shape
        out_h, out_w = height // 2, width // 2
        # Drop an unpaired trailing row/column, then expose each 2x2 window
        # on its own pair of axes so one reduction handles every window.
        cropped = data[:, :, :out_h * 2, :out_w * 2]
        windows = cropped.reshape(batch, channels, out_h, 2, out_w, 2)
        return windows.max(axis=(3, 5)).astype(np.float64)

    def grad_x(self, input_data):
        """Not implemented yet; returns ``None``."""
        return None

#### 1.2 Реализация свёрточного слоя и транспонированной свёртки (планируется)

In [None]:
class Conv2DLayer(Layer):
    """2-D convolution (cross-correlation, no kernel flip) over NCHW input.

    Args:
        kernel_size: Side of the square kernel.
        input_channels: Number of channels expected in the input.
        output_channels: Number of filters / output channels.
        padding: ``'valid'`` for no padding; any other value is treated as
            ``'same'`` (zero padding so that the output spatial size is
            ``ceil(input_size / stride)``).
        stride: Step of the sliding window, the same along both axes.
        K_init: Optional kernel of shape
            ``[kernel_size, kernel_size, input_channels, output_channels]``.
            A random kernel is drawn when omitted.
        b_init: Optional bias of shape ``(output_channels,)``; zeros when
            omitted.
    """

    def __init__(self, kernel_size=3, input_channels=2, output_channels=3,
                 padding='same', stride=1, K_init=None, b_init=None):
        self.name = 'Conv2D'
        self.kernel_size = kernel_size
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.padding = padding
        self.stride = stride

        kernel_shape = (kernel_size, kernel_size, input_channels, output_channels)
        if K_init is None:
            self.kernel = np.random.random(kernel_shape)
        else:
            # A caller-supplied kernel must not be replaced by random values.
            self.kernel = np.array(K_init, dtype=float)
            if self.kernel.shape != kernel_shape:
                raise ValueError('K_init must have shape %s, got %s'
                                 % (kernel_shape, self.kernel.shape))
        if b_init is None:
            self.bias = np.zeros(output_channels)
        else:
            self.bias = np.array(b_init, dtype=float)

    def forward(self, input_data):
        """Convolve a ``[batch, input_channels, height, width]`` tensor.

        Returns:
            Array of shape ``[batch, output_channels, out_h, out_w]``.

        Raises:
            ValueError: If the input is not 4-D, its channel count differs
                from ``input_channels``, or (for ``'valid'`` padding) it is
                smaller than the kernel.
        """
        data = np.asarray(input_data, dtype=float)
        if data.ndim != 4 or data.shape[1] != self.input_channels:
            raise ValueError('expected input of shape [batch, %d, height, width], got %s'
                             % (self.input_channels, data.shape))

        batch, _, height, width = data.shape
        size = self.kernel_size
        step = self.stride

        if self.padding == 'valid':
            if height < size or width < size:
                raise ValueError('input is smaller than the kernel')
            out_h = (height - size) // step + 1
            out_w = (width - size) // step + 1
            padded = data
        else:
            out_h = -(-height // step)  # ceil division
            out_w = -(-width // step)
            pad_h = max((out_h - 1) * step + size - height, 0)
            pad_w = max((out_w - 1) * step + size - width, 0)
            # Any odd padding remainder goes to the bottom/right side.
            padded = np.pad(
                data,
                ((0, 0), (0, 0),
                 (pad_h // 2, pad_h - pad_h // 2),
                 (pad_w // 2, pad_w - pad_w // 2)),
                mode='constant')

        out = np.empty((batch, self.output_channels, out_h, out_w))
        for row in range(out_h):
            top = row * step
            for col in range(out_w):
                left = col * step
                window = padded[:, :, top:top + size, left:left + size]
                # Contract channels and both kernel axes against the
                # [kh, kw, in, out] filter for the whole batch at once.
                out[:, :, row, col] = np.einsum('bchw,hwco->bo', window, self.kernel)
        return out + self.bias.reshape(1, -1, 1, 1)

    def grad_x(self, input_data=None):
        """Not implemented yet; returns ``None``."""
        return None

    def grad_kernel(self, input_data=None):
        """Not implemented yet; returns ``None``."""
        return None

In [None]:
class Conv2DTrLayer(Layer):
    """Placeholder for a transposed 2-D convolution layer.

    Only the configuration is stored; ``forward``, ``grad_x`` and
    ``grad_kernel`` are not implemented yet and return ``None``.

    Args:
        kernel_size: Side of the square kernel.
        input_channels: Number of input channels.
        output_channels: Number of output channels.
        padding: Number (how much to cut from the modified input map).
        stride: Single number, the expansion factor.
        K_init: Kernel, intended shape
            ``[kernel_size, kernel_size, input_channels, output_channels]``.
        b_init: Bias.
    """

    def __init__(self, kernel_size=3, input_channels=2, output_channels=3,
                 padding=0, stride=1, K_init=None, b_init=None):
        self.name = 'Conv2DTr'
        self.kernel_size = kernel_size
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.padding = padding
        self.stride = stride
        self.kernel = K_init
        self.bias = b_init

    def forward(self, input_data):
        """Not implemented yet; returns ``None``.

        TODO: the input is a 4-D tensor ``[batch, input_channels, height,
        width]``; its dimensions must be checked against the kernel before
        the output tensor is filled.
        """
        return None

    def grad_x(self):
        """Not implemented yet; returns ``None``."""
        return None

    def grad_kernel(self):
        """Not implemented yet; returns ``None``."""
        return None

#### 1.4 Теперь настало время теста.

#### Чтение данных

In [None]:
# Uncomment if an old version of the library is installed
!pip install np_utils

Collecting np_utils
  Downloading np_utils-0.6.0.tar.gz (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: np_utils
  Building wheel for np_utils (setup.py) ... [?25l[?25hdone
  Created wheel for np_utils: filename=np_utils-0.6.0-py3-none-any.whl size=56437 sha256=9b95b78eb29f8a02980cf9f13e54062e013d56b1d55034cbc0f6ef109ed7ce5b
  Stored in directory: /root/.cache/pip/wheels/19/0d/33/eaa4dcda5799bcbb51733c0744970d10edb4b9add4f41beb43
Successfully built np_utils
Installing collected packages: np_utils
Successfully installed np_utils-0.6.0


In [None]:
import numpy as np
np.random.seed(123)  # for reproducibility
import np_utils
from tensorflow.keras.utils import to_categorical
from keras.datasets import mnist

# Load the MNIST train/test split shipped with Keras.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Add an explicit channel axis: images become (N, 1, 28, 28).
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28)
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28)
# Convert to float32 and divide the pixel values by 255.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255


# One-hot encode the labels into 10 classes.
Y_train = to_categorical(y_train, 10)
Y_test = to_categorical(y_test, 10)
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
(60000, 1, 28, 28) (60000, 10) (10000, 1, 28, 28) (10000, 10)


#### Подготовка моделей

In [None]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.layers import Convolution2D, Conv2D, MaxPooling2D

print(keras.__version__)

def get_keras_model():
    """Build, train and return the reference Keras classifier.

    The model is MaxPooling2D (2x2, channels first) -> Flatten ->
    Dense(10, softmax) on ``(1, 28, 28)`` inputs. It is trained for two
    epochs on the module-level ``X_train`` / ``Y_train`` with SGD
    (learning rate 0.01, Nesterov momentum 0.9) and a 25% validation split.
    """
    from keras.optimizers import SGD

    input_image = Input(shape=(1, 28, 28))
    pooled = MaxPooling2D(pool_size=(2, 2), data_format='channels_first')(input_image)
    flattened = Flatten()(pooled)
    probabilities = Dense(10, activation='softmax')(flattened)
    model = Model(inputs=input_image, outputs=probabilities)

    sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    # The training history is not needed by any caller, so it is not kept.
    model.fit(X_train, Y_train, validation_split=0.25,
              batch_size=32, epochs=2, verbose=1)
    return model

3.8.0


In [None]:
def get_our_model(keras_model):
    """Build the hand-written network that mirrors ``keras_model``.

    The dense layer (196 inputs, 10 outputs) is initialised from the first
    two weight arrays of ``keras_model`` (kernel and bias). The returned
    ``Network`` has no loss attached.
    """
    weights = keras_model.get_weights()
    layers = [
        MaxPooling(),
        FlattenLayer(),
        DenseLayer(196, 10, W_init=weights[0], b_init=weights[1]),
        Softmax(),
    ]
    return Network(layers)

In [None]:
# Train the Keras reference model, then copy its weights into our network.
keras_model = get_keras_model()
our_model = get_our_model(keras_model)

Epoch 1/2
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.7662 - loss: 0.8570 - val_accuracy: 0.8926 - val_loss: 0.3806
Epoch 2/2
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8913 - loss: 0.3867 - val_accuracy: 0.9008 - val_loss: 0.3447


In [None]:
# Run both models on the same test images.
keras_prediction = keras_model.predict(X_test)
our_model_prediction = our_model.predict(X_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [None]:
# The forward pass passes when the total absolute difference to Keras is below 0.01.
if np.sum(np.abs(keras_prediction - our_model_prediction)) < 0.01:
    print('Test PASSED')
else:
    print('Something went wrong!')

Test PASSED


In [None]:
# Total absolute difference between the two sets of predictions.
np.sum(np.abs(keras_prediction - our_model_prediction))

np.float64(0.000981925348501486)

### 2. Вычисление производных по входу для слоёв нейронной сети

#### 2.1  Реализация метода forward для класса CrossEntropy
Формула выглядит следующим образом: $$ \mathrm{crossentropy} = L(p, y) = -\sum\limits_i y_i \log p_i, $$
где вектор $(p_1, ..., p_k) $ -  выход классификационного алгоритма, а $(y_1,..., y_k)$ - правильные метки класса в унарной кодировке (one-hot encoding)

In [None]:
class CrossEntropy(object):
    """Cross-entropy loss ``L(p, y) = -sum_i y_i * log(p_i)`` per sample.

    Args:
        eps: Labels are clipped to ``[eps, 1 - eps]`` before use.

    The clipping is applied to a float copy of the labels, so the caller's
    label array is never modified and integer labels are handled correctly.
    Predictions are used as given: a zero probability still produces an
    infinite loss or a division by zero.
    """

    def __init__(self, eps=0.00001):
        self.name = 'CrossEntropy'
        self.eps = eps

    def _clipped_labels(self, labels):
        """Return a new float array with labels clipped to ``[eps, 1 - eps]``."""
        return np.clip(np.asarray(labels, dtype=float), self.eps, 1 - self.eps)

    def forward(self, input_data, labels):
        """Return the loss of every sample as a 1-D array of length ``batch``.

        Args:
            input_data: Predicted values, first axis is the batch.
            labels: Target distribution with the same shape as ``input_data``.
        """
        terms = self._clipped_labels(labels) * np.log(input_data)
        # Sum over every axis except the batch axis.
        return -np.sum(terms, axis=tuple(range(1, terms.ndim)))

    def calculate_loss(self, input_data, labels):
        """Alias of :meth:`forward`."""
        return self.forward(input_data, labels)

    def grad_x(self, input_data, lables):
        """Return dL/dp, i.e. ``-y / p`` with clipped labels ``y``."""
        return -self._clipped_labels(lables) / input_data

#### 2.2  Реализация метода grad_x класса CrossEntropy, который возвращает $\frac{\partial L}{\partial p}$

Проверить работоспособность кода поможет следующий тест:

In [None]:
def numerical_diff_net(net, x, labels):
    """Estimate d(loss)/dx with central differences.

    Each feature is shifted by ``+/-eps`` for all samples at once and
    ``net.calculate_loss`` is evaluated on both sides.

    Returns:
        Array with one row per sample and one column per feature.
    """
    step = 0.00001
    n_features = len(x[0])
    shifts = step * np.eye(n_features)
    columns = [
        (net.calculate_loss(x + shift, labels)
         - net.calculate_loss(x - shift, labels)) / (2 * step)
        for shift in shifts
    ]
    return np.array(columns).T

def test_net(net):
    """Compare ``net.grad_x`` with a numerical gradient on a fixed batch.

    Prints ``Test PASSED`` when the total absolute difference is below 0.01;
    otherwise prints both gradients for inspection.
    """
    x = np.array([[1, 2, 3], [2, 3, 4]])
    labels = np.array([[0.3, 0.2, 0.5], [0.3, 0.2, 0.5]])
    num_grad = numerical_diff_net(net, x, labels)
    grad = net.grad_x(x, labels)
    mismatch = np.sum(np.abs(num_grad - grad))
    if mismatch < 0.01:
        print('Test PASSED')
        return
    report = ('Something went wrong!', 'Numerical grad is', num_grad,
              'Your gradiend is ', grad)
    for item in report:
        print(item)

# Gradient check of the loss on its own: CrossEntropy exposes the same
# calculate_loss / grad_x methods that test_net uses.
loss = CrossEntropy()
test_net(loss)

Test PASSED


#### 2.3  Реализация метода grad_x класса Softmax, который возвращает $\frac{\partial Softmax}{\partial x}$

Проверить работоспособность кода поможет следующий тест:

In [None]:
def numerical_diff_layer(layer, x):
    """Estimate a layer's Jacobian d(output)/d(input) with central differences.

    Returns:
        Array indexed (sample, output, input).
    """
    step = 0.00001
    n_inputs = len(x[0])
    slices = []
    for shift in step * np.eye(n_inputs):
        ahead = layer.forward(x + shift)
        behind = layer.forward(x - shift)
        slices.append(((ahead - behind) / (2 * step)).T)
    return np.array(slices).T

def test_layer(layer):
    """Compare ``layer.grad_x`` with a numerical Jacobian on a fixed batch.

    Prints ``Test PASSED`` when the total absolute difference is below 0.01;
    otherwise prints both Jacobians for inspection.
    """
    x = np.array([[1, 2, 3], [2, -3, 4]])
    num_grad = numerical_diff_layer(layer, x)
    grad = layer.grad_x(x)
    mismatch = np.sum(np.abs(num_grad - grad))
    if mismatch < 0.01:
        print('Test PASSED')
        return
    report = ('Something went wrong!', 'Numerical grad is', num_grad,
              'Your gradiend is ', grad)
    for item in report:
        print(item)

# Gradient check for the Softmax Jacobian.
layer = Softmax()
test_layer(layer)

Test PASSED


#### 2.4  Реализация метода grad_x для классов ReLU и DenseLayer

In [None]:
# Gradient check for the ReLU Jacobian.
layer = ReLU()
test_layer(layer)

Test PASSED


In [None]:
# Gradient check for a randomly initialised 3 -> 4 dense layer.
layer = DenseLayer(3,4)
test_layer(layer)

Test PASSED


#### 2.5 Для класса Network реализуется метод grad_x, который должен осуществлять взятие производной от лосса по входу

In [None]:
# End-to-end gradient check: loss gradient w.r.t. the input of a small MLP.
net = Network([DenseLayer(3, 10), ReLU(), DenseLayer(10, 3), Softmax()], loss=CrossEntropy())
test_net(net)

Test PASSED


### 3. Реализация градиентов по параметрам и метода обратного распространения ошибки с обновлением параметров сети

#### 3.1  Реализация функции grad_b и grad_W.

In [None]:
def numerical_grad_b(input_size, output_size, b, W, x):
    """Estimate d(output)/d(b) of a dense layer with central differences.

    For every bias component two layers are built with that component
    shifted by ``+/-eps`` and their outputs on ``x`` are compared.

    Returns:
        Array indexed (sample, output, bias component).
    """
    step = 0.00001
    slices = []
    for idx in range(len(b)):
        shift = np.zeros(b.shape)
        shift[idx] = step
        raised = DenseLayer(input_size, output_size, W_init=W, b_init=b + shift)
        lowered = DenseLayer(input_size, output_size, W_init=W, b_init=b - shift)
        slope = (raised.forward(x) - lowered.forward(x)) / (2 * step)
        slices.append(slope.T)
    return np.array(slices).T

def test_grad_b():
    """Check ``DenseLayer.grad_b`` against a numerical estimate.

    Uses a random 3 -> 4 layer and a random batch of two samples. Prints
    ``Test PASSED`` when the total absolute difference is below 0.01;
    otherwise prints both gradients.
    """
    input_size, output_size = 3, 4
    W_init = np.random.random((input_size, output_size))
    b_init = np.random.random((output_size,))
    x = np.random.random((2, input_size))

    grad = DenseLayer(input_size, output_size, W_init, b_init).grad_b(x)
    num_grad = numerical_grad_b(input_size, output_size, b_init, W_init, x)

    mismatch = np.sum(np.abs(num_grad - grad))
    if mismatch < 0.01:
        print('Test PASSED')
        return
    report = ('Something went wrong!', 'Numerical grad is', num_grad,
              'Your gradiend is ', grad)
    for item in report:
        print(item)

# Run the bias-gradient check.
test_grad_b()

Test PASSED


In [None]:
def numerical_grad_W(input_size, output_size, b, W, x):
    """Estimate d(output)/d(W) of a dense layer with central differences.

    Weight entries are visited in row-major order; for each one two layers
    are built with that entry shifted by ``+/-eps`` and their outputs on
    ``x`` are compared.

    Returns:
        Array indexed (sample, output, flattened weight index).
    """
    step = 0.00001
    slices = []
    for position in np.ndindex(W.shape[0], W.shape[1]):
        shift = np.zeros(W.shape)
        shift[position] = step
        raised = DenseLayer(input_size, output_size, W_init=W + shift, b_init=b)
        lowered = DenseLayer(input_size, output_size, W_init=W - shift, b_init=b)
        slope = (raised.forward(x) - lowered.forward(x)) / (2 * step)
        slices.append(slope.T)
    return np.array(slices).T

def test_grad_W():
    """Check ``DenseLayer.grad_W`` against a numerical estimate.

    Uses a random 3 -> 4 layer and a random batch of two samples. Prints
    ``Test PASSED`` when the total absolute difference is below 0.01;
    otherwise prints both gradients.
    """
    input_size, output_size = 3, 4
    W_init = np.random.random((input_size, output_size))
    b_init = np.random.random((4,))
    x = np.random.random((2, input_size))

    grad = DenseLayer(input_size, output_size, W_init, b_init).grad_W(x)
    num_grad = numerical_grad_W(input_size, output_size, b_init, W_init, x)

    mismatch = np.sum(np.abs(num_grad - grad))
    if mismatch < 0.01:
        print('Test PASSED')
        return
    report = ('Something went wrong!', 'Numerical grad is', num_grad,
              'Your gradiend is ', grad)
    for item in report:
        print(item)

# Run the weight-gradient check.
test_grad_W()

Test PASSED


#### 3.2 Полная реализация метода обратного распространения ошибки в функции train_step класса Network


Сначала напишем реализацию функции Network.grad_param(), которая возвращает список длиной в количество слоёв и элементом которого является список градиентов по параметрам.
После чего, имея список градиентов, напишем функцию обновления параметров для каждого слоя.

Далее пишется тест для кода подсчета градиента по параметрам, чтобы быть уверенным в том, что градиент через всю сеть считается правильно
    

#### 3.3 Запустим обучение модели. Если всё работает правильно, то точность на валидации должна будет возрастать

In [None]:
# Softmax regression on flattened images, trained on every third sample.
net = Network([DenseLayer(784, 10), Softmax()], loss=CrossEntropy())
trainX = X_train.reshape(len(X_train), -1)
net.fit(trainX[::3], Y_train[::3], validation_split=0.25,
            batch_size=16, nb_epoch=5, learning_rate=0.01)

100%|██████████| 937/937 [02:11<00:00,  7.10it/s]


1 epoch: val 0.72


100%|██████████| 937/937 [02:13<00:00,  7.02it/s]


2 epoch: val 0.80


100%|██████████| 937/937 [02:12<00:00,  7.05it/s]


3 epoch: val 0.83


100%|██████████| 937/937 [02:11<00:00,  7.10it/s]


4 epoch: val 0.84


100%|██████████| 937/937 [02:13<00:00,  7.00it/s]

5 epoch: val 0.85





In [None]:
# One hidden ReLU layer of 20 units, trained on every sixth sample.
net = Network([DenseLayer(784, 20), ReLU(), DenseLayer(20, 10), Softmax()], loss=CrossEntropy())
trainX = X_train.reshape(len(X_train), -1)
net.fit(trainX[::6], Y_train[::6], validation_split=0.25,
            batch_size=16, nb_epoch=5, learning_rate=0.001)

#### 3.5 Посмотрим на возможность нашей нейросети обучать более глубокие нейронные сети

In [None]:
# At first this reports a division-by-zero error, but afterwards everything is fine.
# There was not enough time to run more epochs.
net = Network([DenseLayer(784, 50), ReLU(),
               DenseLayer(50, 40), ReLU(),
               DenseLayer(40, 20), ReLU(),
               DenseLayer(20, 10), Softmax()], loss=CrossEntropy())
trainX = X_train.reshape(len(X_train), -1)
net.fit(trainX, Y_train, validation_split=0.25,
            batch_size=128, nb_epoch=5, learning_rate=0.001)