In [27]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

In [5]:
# Load the MNIST train/test splits through the project-local dataset.mnist helper.
# normalize=True / one_hot_label=True are forwarded as-is; presumably they scale the
# pixel values and one-hot encode the labels — TODO confirm against dataset/mnist.py.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

Downloading train-images-idx3-ubyte.gz ... 
Done
Converting train-images-idx3-ubyte.gz to NumPy Array ...
Done
Converting train-labels-idx1-ubyte.gz to NumPy Array ...
Done
Converting t10k-images-idx3-ubyte.gz to NumPy Array ...
Done
Converting t10k-labels-idx1-ubyte.gz to NumPy Array ...
Done
Creating pickle file ...
Done!


In [28]:
# Gradient check: compare the backpropagation gradients with the ones
# obtained by numerical differentiation on the same mini-batch.
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Use only the first three samples (indices 0, 1, 2) as the batch.
x_batch, t_batch = x_train[:3], t_train[:3]
grad_numerical = network.numerical_gradient(x_batch, t_batch)  # numerical-differentiation gradients
grad_backprop = network.gradient(x_batch, t_batch)  # backpropagation gradients

# For every parameter, print the mean absolute difference between the two
# gradients. Very small values mean backpropagation and numerical
# differentiation agree.
for key in grad_numerical:
    diff = np.mean(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key, diff, sep=":")

W1:6.066200085335175e-10
b1:3.756395308839207e-09
W2:8.690894735537518e-09
b2:1.4012953090392078e-07


In [29]:
# Hyperparameters
iters_num = 10000  # number of training iterations
batch_size = 100
learning_rate = 0.1

# Number of training samples (length of the first axis of x_train).
train_size = len(x_train)
print(x_train.shape)
print(train_size)

# Histories filled in while training: loss values and train/test accuracies.
train_loss_list, train_acc_list, test_acc_list = [], [], []

(60000, 784)
60000


In [31]:
# Iterations per epoch. Floor division keeps this an int, so the
# `i % iter_per_epoch == 0` check below fires once per epoch even when
# train_size is not a multiple of batch_size (true division would give a
# fractional period that the integer counter almost never lands on).
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch; np.random.choice samples indices with
    # replacement by default.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute gradients with backpropagation.
    # network.numerical_gradient(x_batch, t_batch) is the numerical-differentiation
    # alternative; backpropagation is used here because it is much faster.
    grad = network.gradient(x_batch, t_batch)

    # Gradient-descent update: step each parameter against its gradient.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the mini-batch loss measured after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print('i: {:4d}\tTrain acc: {:.5f}\tTest acc: {:.5f}\tLoss: {:f}'.format(i,train_acc,test_acc,loss))

i:    0	Train acc: 0.13107	Test acc: 0.13510	Loss: 2.296889
i:  600	Train acc: 0.90578	Test acc: 0.90890	Loss: 0.270781
i: 1200	Train acc: 0.92403	Test acc: 0.92690	Loss: 0.212668
i: 1800	Train acc: 0.93620	Test acc: 0.93740	Loss: 0.182416
i: 2400	Train acc: 0.94532	Test acc: 0.94390	Loss: 0.104016
i: 3000	Train acc: 0.95263	Test acc: 0.95100	Loss: 0.143427
i: 3600	Train acc: 0.95767	Test acc: 0.95570	Loss: 0.070179
i: 4200	Train acc: 0.96168	Test acc: 0.95670	Loss: 0.061682
i: 4800	Train acc: 0.96492	Test acc: 0.96140	Loss: 0.051916
i: 5400	Train acc: 0.96792	Test acc: 0.96270	Loss: 0.116484
i: 6000	Train acc: 0.96953	Test acc: 0.96540	Loss: 0.122131
i: 6600	Train acc: 0.97262	Test acc: 0.96700	Loss: 0.061088
i: 7200	Train acc: 0.97445	Test acc: 0.96750	Loss: 0.047682
i: 7800	Train acc: 0.97612	Test acc: 0.96880	Loss: 0.058185
i: 8400	Train acc: 0.97725	Test acc: 0.96950	Loss: 0.107972
i: 9000	Train acc: 0.97883	Test acc: 0.97000	Loss: 0.093513
i: 9600	Train acc: 0.97952	Test acc: 0.9