In [1]:
import numpy as np

def relu(X):
    return np.maximum(0, X)

def softmax(X):
    X = X - np.max(X, axis=1, keepdims=True)
    return np.exp(X) / np.sum(np.exp(X), axis=1, keepdims=True)

def relu_backward(Z, delta):
    delta[Z == 0] = 0

def cross_entropy_error(y, t):
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7) / batch_size)

In [26]:
class FullyConnectedNeuralNetwork():
    def __init__(self, layer_units):
        '''
        layer_units: list, 各層のノード数を格納したリスト
        '''
        self.n_iter = 0
        self.t_ = 0
        self.layer_units = layer_units
        self.n_layers_ = len(layer_units)

        # パラメータの初期化
        self.coefs_ = []
        self.intercepts_ = []
        for i in range(self.n_layers_ - 1):
            # coef_init, intercept_init = self._init_coef(( ア ))
            coef_init, intercept_init = self._init_coef(layer_units[i], layer_units[i + 1])
            self.coefs_.append(coef_init)
            self.intercepts_.append(intercept_init)
        
        # 勾配の初期化
        self.coef_grads_ = [np.empty((n_in_, n_out_)) for n_in_, n_out_ in zip(layer_units[:-1], layer_units[1:])]
        self.intercept_grads_ = [np.empty(n_out_) for n_out_ in layer_units[1:]]
    
    def _init_coef(self, n_in, n_out):
        '''
        ある層間のパラメータを初期化するメソッド
        n_in: int, 入力側のノード数
        n_out: int, 出力側のノード数
        '''
        std = np.sqrt(2 / n_in)
        coef_init = np.random.randn(n_in, n_out) * std
        intercept_init = np.zeros(n_out)
        return coef_init, intercept_init
    
    def _forward(self, activations):
        '''
        順伝播処理を行うメソッド
        activartions: list, 各層の出力を納めたリスト
                     activation[0]は入力データ
                     activation[i].shape=(バッチサイズ、ノード数)
        '''
        affine = [None] * (self.n_layers_ - 1)
        for i in range(self.n_layers_ - 1):
            # アフィン変換
            affine[i] = np.dot(activations[i], self.coefs_[i]) + self.intercepts_[i]

            # if (i + 1) == (( イ )):
            if (i + 1) == (self.n_layers_ - 1):
                '''
                出力層の場合
                '''
                activations[i + 1] = softmax(affine[i])
            else:
                '''
                隠れ層の場合
                '''
                activations[i + 1] = relu(affine[i])

        return activations
    
    def _grad(self, j, activations, deltas):
        '''
        各パラメータの勾配を算出するメソッド
        j: int, アフィンの番号
        activations: list, 各層の出力を納めたメソッド
        deltas: list, 出力層側から伝わってきた勾配を納めたリスト
        '''
        # self.coef_grads_[j] = ( ウ )
        self.coef_grads_[j] = np.dot(activations[j].T, deltas[j])
        # self.intercept_grads_[j] = ( エ )
        self.intercept_grads_[j] = np.sum(deltas[j], axis=0)
    
    def _backward(self, t, activations):
        '''
        逆伝播処理を行うメソッド
        t: array-like, 正解ラベル, t.shape=(バッチサイズ、出力層ノード数)
        activations: list, 各層の出力を納めたリスト
        '''
        deltas = [None] * (self.n_layers_ - 1)
        last = self.n_layers_ - 2
    
        # 交差エントロピー誤差とソフトマックス関数を合わせて勾配を算出
        n_samples = t.shape[0]
        # deltas[last] = ( オ )
        deltas[last] = (activations[-1] - t) / n_samples

        # 出力層の1つ手前のパラメータの勾配を算出
        # self._grad(( カ ), activations, deltas)
        self._grad(last, activations, deltas)

        # 残りのパラメータの勾配を算出
        for i in range(self.n_layers_ - 2, 0, -1):
            # 入力(activations)の勾配を算出
            # deltas[i - 1] = ( キ )
            deltas[i - 1] = np.dot(deltas[i], self.coefs_[i].T)

            # 活性化関数ReLuの勾配を算出
            # relu_backward(( ク ), deltas[i - 1])
            relu_backward(activations[i], deltas[i - 1])

            # パラメータの勾配を算出
            # self._grad(( ケ ), activations, deltas)
            self._grad(i - 1, activations, deltas)
        
        return
    
    def _forward_and_backward(self, x, t):
        '''
        順伝播処理を実行した後、逆伝播処理を実行するメソッド
        x: array-like, 入力データ, x.shape=(バッチサイズ、入力層ノード数)
        t: array-like, 正解ラベル, t.shape=(バッチサイズ、出力層ノード数)
        '''
        activations = [x] + [None] * (self.n_layers_ - 1)

        # 順伝播
        activations = self._forward(activations)
        loss = cross_entropy_error(activations[-1], t)

        # 逆伝播
        self._backward(t, activations)

        return loss

In [27]:
input_size = 4
hidden_size1 = 5
hidden_size2 = 6
output_size = 3
model = FullyConnectedNeuralNetwork([input_size, hidden_size1, hidden_size2, output_size])

In [28]:
model.coef_grads_

[array([[-1.19897959, -0.46702807,  0.52269926, -1.20833537, -0.1275274 ],
        [-0.94053577, -1.58957394,  2.31645543,  0.51739648, -1.79909498],
        [-0.85631736,  0.43865856,  0.40488838,  0.61185538,  1.82566929],
        [ 1.48899123, -2.45059145, -0.79075758, -0.21225175,  0.80824117]]),
 array([[ 0.72298665,  0.94782164, -0.40360625, -0.55172785,  1.04519843,
          0.35008679],
        [-0.74859471,  2.13167966, -0.91472293,  1.68269645,  0.84424921,
         -0.86993306],
        [-1.28074674, -0.88278232,  1.24007604,  1.20207476, -0.24599205,
          0.25912972],
        [ 0.21588501, -1.63960296,  1.22335373, -1.20307058, -0.2938278 ,
          0.72623751],
        [ 1.69887768,  0.21116005,  1.39495492,  0.01233449,  0.91227218,
          0.78441844]]),
 array([[-1.40769308, -1.12623854,  1.2356705 ],
        [ 0.92650778, -1.13727534,  1.58293537],
        [ 1.54833549, -0.03011686, -0.77829346],
        [ 1.0279668 ,  0.63063554, -0.16355393],
        [-2.393

In [29]:
model.intercept_grads_

[array([3.73968564e-316, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000]),
 array([2.06777698e-316, 2.19229575e-316, 2.88174341e-316, 2.88173867e-316,
        2.06777817e-316, 2.06767619e-316]),
 array([2.20397704e-316, 2.33419537e-312, 5.73116149e-322])]

In [30]:
batch_size = 5
x = np.random.randn(batch_size, input_size)
t = np.random.randn(batch_size, output_size)

In [31]:
model._forward_and_backward(x, t)

(5, 4)
(5, 5)
(5, 6)
(5, 3)
backward2
backward1


0.39887732817503485