In [34]:
import numpy as np

def relu(X):
    return np.maximum(0, X)

def softmax(X):
    X = X - np.max(X, axis=1, keepdims=True)
    return np.exp(X) / np.sum(np.exp(X), axis=1, keepdims=True)

def relu_backward(Z, delta):
    delta[Z == 0] = 0

def cross_entropy_error(y, t):
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7) / batch_size)

In [80]:
class FullyConnectedNeuralNetwork():
    def __init__(self, layer_units):
        '''
        layer_units: list, 各層のノード数を格納したリスト
        '''
        self.n_iter = 0
        self.t_ = 0
        self.layer_units = layer_units
        self.n_layers_ = len(layer_units)

        # パラメータの初期化
        self.coefs_ = []
        self.intercepts_ = []
        for i in range(self.n_layers_ - 1):
            # coef_init, intercept_init = self._init_coef(( ア ))
            coef_init, intercept_init = self._init_coef(layer_units[i], layer_units[i + 1])
            self.coefs_.append(coef_init)
            self.intercepts_.append(intercept_init)
        
        # 勾配の初期化
        self.coef_grads_ = [np.empty((n_in_, n_out_)) for n_in_, n_out_ in zip(layer_units[:-1], layer_units[1:])]
        self.intercept_grads_ = [np.empty(n_out_) for n_out_ in layer_units[1:]]
    
    def _init_coef(self, n_in, n_out):
        '''
        ある層間のパラメータを初期化するメソッド
        n_in: int, 入力側のノード数
        n_out: int, 出力側のノード数
        '''
        std = np.sqrt(2 / n_in)
        coef_init = np.random.randn(n_in, n_out) * std
        intercept_init = np.zeros(n_out)
        return coef_init, intercept_init
    
    def _forward(self, activations):
        '''
        順伝播処理を行うメソッド
        activartions: list, 各層の出力を納めたリスト
                     activation[0]は入力データ
                     activation[i].shape=(バッチサイズ、ノード数)
        '''
        affine = [None] * (self.n_layers_ - 1)
        for i in range(self.n_layers_ - 1):
            # アフィン変換
            affine[i] = np.dot(activations[i], self.coefs_[i]) + self.intercepts_[i]

            # if (i + 1) == (( イ )):
            if (i + 1) == (self.n_layers_ - 1):
                '''
                出力層の場合
                '''
                activations[i + 1] = softmax(affine[i])
            else:
                '''
                隠れ層の場合
                '''
                activations[i + 1] = relu(affine[i])

        return activations
    
    def _grad(self, j, activations, deltas):
        '''
        各パラメータの勾配を算出するメソッド
        j: int, アフィンの番号
        activations: list, 各層の出力を納めたメソッド
        deltas: list, 出力層側から伝わってきた勾配を納めたリスト
        '''
        # self.coef_grads_[j] = ( ウ )
        self.coef_grads_[j] = np.dot(activations[j].T, deltas[j])
        # self.intercept_grads_[j] = ( エ )
        self.intercept_grads_[j] = np.sum(deltas[j], axis=0)
    
    def _backward(self, t, activations):
        '''
        逆伝播処理を行うメソッド
        t: array-like, 正解ラベル, t.shape=(バッチサイズ、出力層ノード数)
        activations: list, 各層の出力を納めたリスト
        '''
        deltas = [None] * (self.n_layers_ - 1)
        last = self.n_layers_ - 2
    
        # 交差エントロピー誤差とソフトマックス関数を合わせて勾配を算出
        n_samples = t.shape[0]
        # deltas[last] = ( オ )
        deltas[last] = (activations[-1] - t) / n_samples

        # 出力層の1つ手前のパラメータの勾配を算出
        # self._grad(( カ ), activations, deltas)
        self._grad(last, activations, deltas)

        # 残りのパラメータの勾配を算出
        for i in range(self.n_layers_ - 2, 0, -1):
            # 入力(activations)の勾配を算出
            # deltas[i - 1] = ( キ )
            deltas[i - 1] = np.dot(deltas[i], self.coefs_[i].T)

            # 活性化関数ReLuの勾配を算出
            # relu_backward(( ク ), deltas[i - 1])
            relu_backward(activations[i], deltas[i - 1])

            # パラメータの勾配を算出
            # self._grad(( ケ ), activations, deltas)
            self._grad(i - 1, activations, deltas)
        
        return
    
    def _forward_and_backward(self, x, t):
        '''
        順伝播処理を実行した後、逆伝播処理を実行するメソッド
        x: array-like, 入力データ, x.shape=(バッチサイズ、入力層ノード数)
        t: array-like, 正解ラベル, t.shape=(バッチサイズ、出力層ノード数)
        '''
        activations = [x] + [None] * (self.n_layers_ - 1)

        # 順伝播
        activations = self._forward(activations)
        loss = cross_entropy_error(activations[-1], t)

        # 逆伝播
        self._backward(t, activations)

        return loss

In [81]:
input_size = 4
hidden_size = 5
output_size = 3
model = FullyConnectedNeuralNetwork([input_size, hidden_size, output_size])

In [82]:
model.coef_grads_

[array([[-0.69807499, -0.7039001 , -0.24161202, -0.64442406, -0.90971141],
        [ 0.03167245,  0.300309  , -0.91035939,  0.30489739, -0.98991501],
        [ 0.15885928, -0.93877761,  0.41244743, -0.26122254, -0.42368744],
        [-0.09389548, -0.43755282, -0.44632541, -1.43483421,  0.54631262]]),
 array([[ 0.5306221 ,  1.02066049,  0.52098645],
        [ 1.09989768,  1.8157602 ,  1.64059741],
        [-0.63043139, -0.39035527,  0.73134284],
        [ 0.05149311,  0.71845009,  0.8621749 ],
        [ 1.03280966,  1.31582013, -0.57742054]])]

In [83]:
model.intercept_grads_

[array([3.6563664e-316, 0.0000000e+000, 0.0000000e+000, 0.0000000e+000,
        0.0000000e+000]),
 array([3.85156186e-316, 2.14465755e-316, 3.65634506e-316])]

In [84]:
batch_size = 5
x = np.random.randn(batch_size, input_size)
t = np.random.randn(batch_size, output_size)

In [85]:
model._forward_and_backward(x, t)

-0.08504708986421507