In [1]:
import mnist
import numpy as np
from tqdm.auto import tqdm

W - [W_L of shape (num_inputs, num_neurons)] of length l

In [2]:
train_images = mnist.train_images()
imean = np.mean(train_images, axis=0)
istd = np.std(train_images, axis=0)
istd[istd == 0] = 1
train_images = (train_images - imean) / istd
# train_images = train_images / 255

train_labels = mnist.train_labels()

In [3]:
train_images.shape

(60000, 28, 28)

In [4]:
N1 = 128
N2 = 10
W = [np.random.uniform(-1, 1, size=(28 * 28, N1)), np.random.uniform(-1, 1, size=(N1, N2))]
B = [np.random.uniform(-1, 1, size=(N1,)), np.random.uniform(-1, 1, size=(N2,))]

In [5]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [6]:
def forward(x):
    y = np.array(x.ravel())
    y_arr = [y]
    for l in range(2):
        v = W[l].T @ y + B[l]
        if l == 0:
            y = np.tanh(v)
        else:
            y = softmax(v)
        y_arr.append(y)
    return y_arr

In [7]:
forward(train_images[5])

[array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -4.41807799e-03, -5.75481961e-03, -4.08251693e-03, -4.08251693e-03,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -4.08251693e-03, -4.70968827e-03, -8.79934640e-03, -1.15905591e-02,
        -1.47589798e-02, -1.92848546e-02, -2.46717975e-02, -2.90710271e-02,
        -3.05926583e-02, -3.11640127e-02, -3.19628719e-02, -3.02025355e-02,
        -3.13102175e-02, -2.83833960e-02, -2.31191906e-02, -1.91666260e-02,
        -1.67723008e-02, -1.09963601e-02, -8.32486080e-03, -4.38069356e-03,
         0.0

In [8]:
def evaluate():
    correct = 0
    loss = []
    for x, l in tqdm(zip(train_images, train_labels), total=train_images.shape[0]):
        f = forward(x)
        y = f[2]
        d = np.zeros(10)
        d[l] = 1
        loss.append((d - y) ** 2)
        y = np.argmax(y)
        if y == l:
            correct += 1
    # MSE loss
    return np.mean(loss), correct * 100 / train_images.shape[0]

In [9]:
evaluate()

  0%|          | 0/60000 [00:00<?, ?it/s]

(0.1590329168051026, 8.555)

In [10]:
def epoch(eta=0.001):
    it_indices = np.arange(train_images.shape[0])
    np.random.shuffle(it_indices)
    for i in tqdm(it_indices, total=it_indices.shape[0]):
        x = train_images[i]
        l = train_labels[i]
        f = forward(x)
        d = np.zeros(10)
        d[l] = 1
        # shape - (10,)
        delta_2 = (d - f[2]) * f[2] * (1 - f[2])
        # shape - (128,)
        delta_1 = (1 - f[1] ** 2) * (W[1] @ delta_2)
        del_W_1 = eta * np.outer(f[0], delta_1)
        del_B_1 = eta * delta_1
        del_W_2 = eta * np.outer(f[1], delta_2)
        del_B_2 = eta * delta_2
        W[0] += del_W_1
        B[0] += del_B_1
        W[1] += del_W_2
        B[1] += del_B_2
    return

In [None]:
for _ in range(5):
    epoch(1e-2)
    print(evaluate())

  0%|          | 0/60000 [00:00<?, ?it/s]

  0%|          | 0/60000 [00:00<?, ?it/s]

(0.02936698327764336, 81.4)


  0%|          | 0/60000 [00:00<?, ?it/s]

  0%|          | 0/60000 [00:00<?, ?it/s]

(0.020799643208663843, 86.655)


  0%|          | 0/60000 [00:00<?, ?it/s]

  0%|          | 0/60000 [00:00<?, ?it/s]

(0.017340237231933984, 88.89833333333333)


  0%|          | 0/60000 [00:00<?, ?it/s]

  0%|          | 0/60000 [00:00<?, ?it/s]

(0.015346565455352125, 90.115)


  0%|          | 0/60000 [00:00<?, ?it/s]

In [24]:
np.zeros(10).dtype

dtype('float64')

In [27]:
mnist.train_images().dtype

dtype('uint8')