In [1]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

In [7]:
class TwoLayerNet:
    
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
        
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        
        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x
    
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
    
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        
        return grads
    
    def gradient(self, x, t):
        # forward
        self.loss(x, t)
        
        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        
        for layer in layers:
            dout = layer.backward(dout)
            
        # set
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads

#### 梯度确认

In [11]:
from dataset.mnist import load_mnist
import time

In [16]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

indexes = np.random.choice(50000, 100)
print(indexes)
x_batch = x_train[indexes, :]
t_batch = t_train[indexes, :]

time1 = time.time()
grad_numerical = network.numerical_gradient(x_batch, t_batch)
time2 = time.time()
grad_backprop = network.gradient(x_batch, t_batch)
time3 = time.time()
numerical_time = time2  -time1
bp_time = time3  -time3


print("numerical_time : " + str(numerical_time))
print("BP_time : " + str(bp_time))
# Mean error

for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ':' + str(diff))

[48550  1559  2627  7831  3742 14589  3130 39018  4416 42500 41053 13379
 27479 36670  6505 44528 24121  9818  3254  2269 40879 29709 39629 24468
 28501 45938 41638  2909 33246 40749 43736 37237 10336  7664 43117 44761
 49548 34426 41596 31501 42796 16867 34454 25814   814 31485 28362 47017
 23105 10979  9700 40343 10147 45716 41095 43231 39060 42690 33673  6636
 39915 24206 22279 31214 41876 10652 45907  8992 47157 44460 32565 25877
 39505 21124 45081 24845 14457  5198  2434 31109 13405 37339 22217 35432
 49643 44285 31154  4410  4624 28072 24364 42730 28115 40788  9919 11892
  2457 37376  6745 43163]
numerical_time : 27.76185441017151
BP_time : 0.0
W1:4.260218857633593e-07
b1:3.5495850382466573e-06
W2:1.8631792694221478e-09
b2:2.195566202833725e-08


In [25]:
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
train_loss_list = []
train_acc_list = []
test_acc_list = []
iter_per_epoch = max(train_size / batch_size, 1)
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # gradient:
    grad = network.gradient(x_batch, t_batch)
    
    # update:
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
        
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    if i % (iter_per_epoch / 4) == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(loss, train_acc, test_acc)

0.007309933796620397 0.99865 0.9742
0.005026910107174594 0.9988833333333333 0.9745
0.0064488773844279335 0.9988333333333334 0.9751
0.007542571474249563 0.9988833333333333 0.9749
0.006167556815645264 0.9989166666666667 0.9748
0.0070022759872610295 0.9986333333333334 0.9752
0.007571945699918553 0.9990333333333333 0.9751
0.004933892746092766 0.9986833333333334 0.974
0.0023132983469707825 0.9988833333333333 0.9748
0.00293023534058304 0.99895 0.974
0.007708118955182428 0.9990833333333333 0.9742
0.01478168765111227 0.9989333333333333 0.9744
0.010026757899861427 0.9990833333333333 0.9744
0.007554113899386757 0.999 0.9749
0.01024009711515909 0.9991166666666667 0.9749
0.0023094473887613366 0.99855 0.9746
0.007085184946794808 0.9988166666666667 0.9744
0.005082751855548628 0.9989666666666667 0.9747
0.011319490732121504 0.9992166666666666 0.9753
0.004172708723333639 0.9987 0.9738
0.008020747836345331 0.9987 0.9748
0.0021294373502444624 0.9988833333333333 0.9746
0.004413340644653655 0.9990666666666