In [1]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

In [2]:
class TwoLayerNet:
    
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)
        
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        
        self.lastLayer = SoftmaxWithLoss()
        
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x
    
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy
    
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        
        return grads
    
    def gradient(self, x, t):
        # forward
        self.loss(x, t)
        
        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        
        layers = list(self.layers.values())
        layers.reverse()
        
        for layer in layers:
            dout = layer.backward(dout)
            
        # set
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        
        return grads

#### 梯度确认

In [3]:
from dataset.mnist import load_mnist
import time

In [4]:
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

indexes = np.random.choice(50000, 100)
print(indexes)
x_batch = x_train[indexes, :]
t_batch = t_train[indexes, :]

time1 = time.time()
grad_numerical = network.numerical_gradient(x_batch, t_batch)
time2 = time.time()
grad_backprop = network.gradient(x_batch, t_batch)
time3 = time.time()
numerical_time = time2  -time1
bp_time = time3  -time3


print("numerical_time : " + str(numerical_time))
print("BP_time : " + str(bp_time))
# Mean error

for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ':' + str(diff))

[16107 12781 12557  6049 34402 15843 48445  9072 21290 47734 10041 35318
 35004 10790 17542 35702 37550  6953  4679 37079 45415 47877 17154 42669
 36933 28818 35494  9320 49256 29335 17161 23977 31298 23371 37776 13468
  8766 13352 17169 46303 49629 10099 25365 45289 37873 37886  9703 28121
 37609  6314  2915 28608 16053  3551 18577 19276 30369  3002  3194 48871
  3723 25163 37406  2936 39485  5863 35043 31025  8094 20687 10231 35207
 23834 48354 16614  1324 15476 28410 30090 26092  6707 26295 28565 19802
 40082 28971 25088 45074 27760 17889 12838 28084 12597 43536 20113 25686
 15944 26128 24606 20128]
numerical_time : 84.00448298454285
BP_time : 0.0
W1:1.7600107295764059e-07
b1:2.1474304002527827e-06
W2:1.7126002678570633e-09
b2:2.006908181983319e-08


In [5]:
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
train_loss_list = []
train_acc_list = []
test_acc_list = []
iter_per_epoch = max(train_size / batch_size, 1)
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # gradient:
    grad = network.gradient(x_batch, t_batch)
    
    # update:
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
        
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    if i % (iter_per_epoch / 4) == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(loss, train_acc, test_acc)

2.3025260178134186 0.04858333333333333 0.0501
0.6733151602256428 0.8089833333333334 0.8098
0.47639194390273326 0.8799333333333333 0.8836
0.3143458453912024 0.8959333333333334 0.8978
0.2283272557276022 0.9036166666666666 0.9089
0.21045668149406482 0.9121666666666667 0.9154
0.15755812587387696 0.9119166666666667 0.9127
0.20717389675586056 0.9182666666666667 0.9195
0.2828981964389914 0.9230833333333334 0.9247
0.1986073866460853 0.9283833333333333 0.9295
0.21721443967296225 0.9324666666666667 0.9314
0.2954557405700793 0.9341 0.9335
0.1825131274001982 0.93525 0.9339
0.3063480638980788 0.9395 0.9376
0.1685143282054244 0.9415333333333333 0.9399
0.2008980707383593 0.9434 0.9391
0.13086515328498016 0.94685 0.945
0.18851917454882625 0.9485666666666667 0.9451
0.24403248354521104 0.9493 0.948
0.14917134199727364 0.9510166666666666 0.9493
0.11251579918281532 0.952 0.9497
0.1564599309017001 0.9532166666666667 0.9496
0.1170702554889439 0.9555333333333333 0.9515
0.1249576208537722 0.95575 0.9536
0.168