# ゼロから作るdeeplearning 5章 誤差逆伝搬法

## 5.1 計算グラフ
計算グラフめっちゃわかりやすいなあ。

急に道ゆく人に誤差逆伝搬法ってなんですか？って聞かれて、紙とペンさえ持っていれば説明できるくらいになれば理解したって言えると思う。

In [50]:
import numpy as np

## 5.4 単純なレイヤの実装
### 5.4.1 乗算レイヤの実装

In [51]:
class MulLayer:
	def __init__(self):
		self.x = None
		self.y = None
	
	def forward(self, x, y):
		self.x = x
		self.y = y
		out = x * y
		return out

	def backward(self, dout):	# doutは逆伝播してきた微分
		dx = dout * self.y
		dy = dout * self.x
		return dx, dy

In [52]:
apple = 100
apple_num = 2
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price)

220.00000000000003


In [53]:
# backward（forwardの時と逆順に呼び出していく）
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dapple, dapple_num, dtax)

2.2 110.00000000000001 200


### 5.4.2 加算レイヤの実装


In [54]:
class AddLayer:
	def __init__(self):
		pass

	def forward(self, x, y):
		out = x + y
		return out

	def backward(self, dout):
		dx = dout * 1
		dy = dout * 1
		return dx, dy

In [55]:
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# layer
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

# back_propagation
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

print(price)
print(dapple_num, dapple, dorange, dorange_num, dtax)

715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


## 5.5 活性化関数レイヤの実装
### 5.5.1 ReLUレイヤ

In [56]:
class ReLU:
	def __init__(self):
		self.mask = None

	def forward(self, x):
		self.mask = (x <= 0)
		out = x.copy()
		out[self.mask] = 0
		return out

	def backward(self, dout):
		dout[self.mask] = 0
		dx = dout
		return dx

In [57]:
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
print(x)

[[ 1.  -0.5]
 [-2.   3. ]]


### 5.5.2 Sigmoidレイヤ

$$
\sigma'(x) = \sigma(x) \cdot (1 - \sigma(x))
$$
これは、シグモイド関数の出力を$y$とした場合、その微分が$y \cdot (1 - y)$になることを意味します。

In [58]:
class Sigmoid:
	def __init__(self):
		self.out = None

	def forward(self, x):
		out = 1 / (1 + np.exp(-x))
		self.out = out
		return out

	def backward(self, dout):
		dx = dout * (1.0 - self.out) * self.out
		return dx

## 5.6 Affine / Softmaxレイヤの実装
### 5.6.1 Affineレイヤ
### 5.6.2 バッチ版Affineレイヤ

In [59]:
# テンソル(4次元データ)に対応しているバージョンは参考資料に
class Affine:
	def __init__(self, W, b):
		self.W = W
		self.b = b
		self.x = None
		self.dW = None
		self.db = None

	def forward(self, x):
		self.x = x
		out = np.dot(x, self.W) + self.b
		return out

	def backward(self, dout):
		dx = np.dot(dout, self.W.T)
		self.dW = np.dot(self.x.T, dout)
		self.db = np.sum(dout, axis=0)
		return dx

### 5.6.3 Softmax-with-Lossレイヤ

In [60]:
# 再掲：ソフトマックス関数（オーバーフロー対策済）
def softmax(a):
	c = np.max(a)
	exp_a = np.exp(a - c)
	sum_exp_a = np.sum(exp_a)
	return exp_a / sum_exp_a

# バッチ対応版の交差エントロピー誤差を実装する（one-hot表現の場合）
def cross_entropy_error(y, t):
	# 1次元の場合は、reshapeで2次元に変換する
	if y.ndim == 1:
		t = t.reshape(1, t.size)
		y = y.reshape(1, y.size)
	batch_size = y.shape[0]
	return -np.sum(t * np.log(y + 1e-7)) / batch_size

In [61]:
class SoftmaxWithLoss:
	def __init__(self):
		self.loss = None
		self.y = None
		self.t = None

	def forward(self, x, t):
		self.t = t
		self.y = softmax(x)
		self.loss = cross_entropy_error(self.y, self.t)
		return self.loss

	def backward(self, dout=1):
		batch_size = self.t.shape[0]
		dx = (self.y - self.t) / batch_size
		return dx

# 5.7 誤差逆伝播法の実装

In [62]:
import numpy as np
from collections import OrderedDict

class TwoLayerNet:
	def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
		self.params = {}
		self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
		self.params['b1'] = np.zeros(hidden_size)
		self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
		self.params['b2'] = np.zeros(output_size)

		self.layers = OrderedDict()
		self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
		self.layers['Relu1'] = ReLU()
		self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

		self.lastLayer = SoftmaxWithLoss()
	
	def predict(self, x):
		for layer in self.layers.values():
			x = layer.forward(x)
		return x

	def loss(self, x, t):
		y = self.predict(x)
		return self.lastLayer.forward(y, t)
	
	def accuracy(self, x, t):
		y = self.predict(x)
		y = np.argmax(y, axis=1)
		if t.ndim != 1:
			t = np.argmax(t, axis=1)
		accuracy = np.sum(y == t) / float(x.shape[0])
		return accuracy
	
	def numerical_gradient(self, x, t):
		loss_W = lambda W: self.loss(x, t)
		grads = {}
		grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
		grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
		grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
		grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
		return grads

	def gradient(self, x, t):
		self.loss(x, t)

		dout = 1
		dout = self.lastLayer.backward(dout)

		layers = list(self.layers.values())
		layers.reverse()
		for layer in layers:
			dout = layer.backward(dout)

		grads = {}
		grads['W1'] = self.layers['Affine1'].dW
		grads['b1'] = self.layers['Affine1'].db
		grads['W2'] = self.layers['Affine2'].dW
		grads['b2'] = self.layers['Affine2'].db
		return grads

### 5.7.3 誤差逆伝播法の勾配確認

In [63]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# TensorFlowのデータセットAPIを使ってMNISTデータセットを読み込む
mnist = tf.keras.datasets.mnist

# データセットを訓練データとテストデータに分割
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# flatten処理を適用して、28x28の画像を1次元配列（784次元）に変換
x_train = x_train.reshape(-1, 28*28)
x_test = x_test.reshape(-1, 28*28)

# 画像データを標準化（0から1の範囲にスケーリング）
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# ラベルデータをone-hotエンコード
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# データの形状を確認
print(f'x_train shape: {x_train.shape}, y_train shape: {y_train.shape}')
print(f'x_test shape: {x_test.shape}, y_test shape: {y_test.shape}')


x_train shape: (60000, 784), y_train shape: (60000, 10)
x_test shape: (10000, 784), y_test shape: (10000, 10)


In [64]:
# ハイパーパラメータ
iters_num = 10000  # 繰り返しの回数
train_size = x_train.shape[0]
batch_size = 100  # バッチサイズ
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []
# 1epocあたりの繰り返し回数
iter_per_epoch = max(train_size / batch_size, 1)

# 先ほど定義した2層のネットワークを使用
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
	# ミニバッチの取得
	batch_mask = np.random.choice(train_size, batch_size)
	x_batch = x_train[batch_mask]
	y_batch = y_train[batch_mask]

	# 勾配の計算（逆伝搬法に変更している！！！！）
	grad = network.gradient(x_batch, y_batch)

	# パラメータの更新
	for key in ('W1', 'b1', 'W2', 'b2'):
		network.params[key] -= learning_rate * grad[key]

	# 学習経過の記録
	loss = network.loss(x_batch, y_batch)
	train_loss_list.append(loss)

	# 1エポックごとに認識精度を計算
	if i % iter_per_epoch == 0:
		train_acc = network.accuracy(x_train, y_train)
		test_acc = network.accuracy(x_test, y_test)
		train_acc_list.append(train_acc)
		test_acc_list.append(test_acc)
		print(f'epoch: {i//iter_per_epoch}, train acc: {train_acc:.4f}, test acc: {test_acc:.4f}')

epoch: 0.0, train acc: 0.1103, test acc: 0.1053
epoch: 1.0, train acc: 0.0987, test acc: 0.0980
epoch: 2.0, train acc: 0.1022, test acc: 0.1010
epoch: 3.0, train acc: 0.0987, test acc: 0.0980


  exp_a = np.exp(a - c)


epoch: 4.0, train acc: 0.0987, test acc: 0.0980
epoch: 5.0, train acc: 0.0987, test acc: 0.0980
epoch: 6.0, train acc: 0.0987, test acc: 0.0980
epoch: 7.0, train acc: 0.0987, test acc: 0.0980
epoch: 8.0, train acc: 0.0987, test acc: 0.0980
epoch: 9.0, train acc: 0.0987, test acc: 0.0980
epoch: 10.0, train acc: 0.0987, test acc: 0.0980
epoch: 11.0, train acc: 0.0987, test acc: 0.0980
epoch: 12.0, train acc: 0.0987, test acc: 0.0980
epoch: 13.0, train acc: 0.0987, test acc: 0.0980
epoch: 14.0, train acc: 0.0987, test acc: 0.0980
epoch: 15.0, train acc: 0.0987, test acc: 0.0980
epoch: 16.0, train acc: 0.0987, test acc: 0.0980
