In [46]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [47]:
# Disable NumPy's summarisation of large arrays: with an infinite threshold,
# every array printed from here on is shown in full (no "..." elision).
np.set_printoptions(threshold=np.inf)

### 准备数据

In [61]:
# Load the training CSV. Column 0 is used below as the label and the remaining
# columns as pixel intensities (presumably integers in 0-255 — confirm against the file).
data = pd.read_csv("mnist_train.csv")
# Shuffle the rows with a fixed seed so the train/test split is the same on every run.
data = shuffle(data, random_state=0)
# Scale pixels to [0, 1]. The pixel columns are cast to float and *replaced*
# (column assignment) instead of written in place through .iloc, because
# writing floats into integer columns relies on implicit upcasting that newer
# pandas versions deprecate.
pixel_columns = data.columns[1:]
data[pixel_columns] = data[pixel_columns].astype(np.float64) / 255

In [62]:
# Rows 0..49999 form the training set and rows 50000..59999 the test set.
# Feature matrices are transposed so that every column is one sample;
# labels become a single row of shape (1, n_samples).
train_x = data.iloc[:50000, 1:].to_numpy(copy=True).T
test_x = data.iloc[50000:60000, 1:].to_numpy(copy=True).T
train_y = data.iloc[:50000, 0].to_numpy(copy=True).reshape(1, -1)
test_y = data.iloc[50000:60000, 0].to_numpy(copy=True).reshape(1, -1)

In [63]:
def onehotencoding(x, num_classes=10):
    """One-hot encode a row of class labels.

    Args:
        x: array of labels with shape (1, n_samples). A 1-D array of length
            n_samples is accepted as well.
        num_classes: number of classes, i.e. number of rows in the result.
            Defaults to 10 (the value that was previously hard-coded).

    Returns:
        Array of shape (num_classes, n_samples) with the same dtype as ``x``.
        Entry [i, j] is 1 when the label of sample j equals i, otherwise 0.
    """
    x = np.asarray(x)
    # Column vector of class ids; comparing it against the label row
    # broadcasts to the full (num_classes, n_samples) membership mask.
    class_ids = np.arange(num_classes).reshape(-1, 1)
    return (x == class_ids).astype(x.dtype)

# Alternative implementation
def one_hot_(Y):
    """One-hot encode integer labels, inferring the class count from the data.

    Args:
        Y: integer label array with ``Y.size`` entries (1-D or a single row).

    Returns:
        Float array of shape (Y.max() + 1, Y.size) in which column j holds a
        single 1 at row Y[j].
    """
    sample_count = Y.size
    class_count = Y.max() + 1
    # Fill the class-major matrix directly: row = label, column = sample index.
    encoded = np.zeros((class_count, sample_count))
    encoded[Y, np.arange(sample_count)] = 1
    return encoded

In [64]:
# Replace both (1, n_samples) label rows with their one-hot matrices.
# NOTE: this cell overwrites its own inputs, so running it a second time would
# feed the already-encoded matrices back into onehotencoding.
train_y, test_y = onehotencoding(train_y), onehotencoding(test_y)

### 定义函数、参数

In [65]:
def init_parameters(n_inputs=784, n_hidden=10, n_outputs=10, rng=None):
    """Draw initial weights and biases for the two-layer network.

    Every entry is sampled uniformly from [-0.5, 0.5).

    Args:
        n_inputs: number of input features (default 784).
        n_hidden: number of hidden units (default 10).
        n_outputs: number of output classes (default 10).
        rng: optional random source exposing ``uniform(low, high, size)``,
            e.g. ``np.random.default_rng(seed)``. When omitted, NumPy's global
            generator is used, so ``np.random.seed`` still controls the draw.

    Returns:
        Tuple ``(w1, b1, w2, b2)`` with shapes (n_hidden, n_inputs),
        (n_hidden, 1), (n_outputs, n_hidden) and (n_outputs, 1).
    """
    if rng is None:
        rng = np.random  # module-level global generator, as before
    # Draw order (w1, b1, w2, b2) is kept so seeded runs give the same values.
    w1 = rng.uniform(low=-0.5, high=0.5, size=(n_hidden, n_inputs))
    b1 = rng.uniform(low=-0.5, high=0.5, size=(n_hidden, 1))
    w2 = rng.uniform(low=-0.5, high=0.5, size=(n_outputs, n_hidden))
    b2 = rng.uniform(low=-0.5, high=0.5, size=(n_outputs, 1))
    return w1, b1, w2, b2

In [66]:
def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def softmax(x):
    """Column-wise softmax.

    Args:
        x: 2-D array of scores with shape (n_classes, n_samples); each column
            is normalised independently.

    Returns:
        float64 array of the same shape whose columns are non-negative and
        sum to 1.
    """
    x = np.asarray(x, dtype=float)
    # Subtracting the per-column maximum leaves the result unchanged
    # mathematically but keeps exp() from overflowing on large scores.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

# Alternative implementations
def softmax_1(Z):
    """Softmax along axis 0 using NumPy's reduction (no overflow protection)."""
    exp_Z = np.exp(Z)
    column_totals = np.sum(exp_Z, axis=0)  # one total per column
    return exp_Z / column_totals

def softmax_2(Z):
    """Softmax along axis 0 using the builtin ``sum`` (no overflow protection)."""
    exp_Z = np.exp(Z)
    # The builtin sum adds the rows together, which yields one total per column.
    column_totals = sum(exp_Z)
    return exp_Z / column_totals

In [67]:
def forward(x, w1, b1, w2, b2):
    """Run one forward pass through the two-layer network.

    Args:
        x: input matrix; in this notebook each column is one sample.
        w1, b1: weights and bias of the hidden layer.
        w2, b2: weights and bias of the output layer.

    Returns:
        Tuple ``(a1, a2, z1, z2)``: hidden activations, output probabilities,
        hidden pre-activations and output pre-activations, in that order.
    """
    # Hidden layer: affine map followed by ReLU.
    hidden_pre = np.dot(w1, x) + b1
    hidden_act = ReLU(hidden_pre)
    # Output layer: affine map followed by softmax.
    output_pre = np.dot(w2, hidden_act) + b2
    output_prob = softmax(output_pre)
    return hidden_act, output_prob, hidden_pre, output_pre

In [68]:
def predict(a2):
    """Turn per-class scores into one-hot predictions.

    Args:
        a2: 2-D array of shape (n_classes, n_samples); each column holds the
            scores of one sample.

    Returns:
        float64 array of the same shape with a single 1 per column, placed at
        the row of that column's largest score (first one on ties), and 0
        elsewhere.
    """
    a2 = np.asarray(a2)
    winners = np.argmax(a2, axis=0)  # winning class index for every sample
    y_pre = np.zeros(a2.shape)
    y_pre[winners, np.arange(a2.shape[1])] = 1
    return y_pre

In [69]:
def accuracy(y_pre, y):
    """Fraction of samples whose predicted column equals the target column.

    Args:
        y_pre: predicted one-hot matrix, shape (n_classes, n_samples).
        y: target one-hot matrix with the same shape.

    Returns:
        Accuracy as a float rounded to 5 decimal places.

    Raises:
        ValueError: if the two matrices differ in shape (previously a
            differing sample count was silently truncated).
        ZeroDivisionError: if there are no samples.
    """
    y_pre = np.asarray(y_pre)
    y = np.asarray(y)
    if y_pre.shape != y.shape:
        raise ValueError(
            f"shape mismatch: y_pre has shape {y_pre.shape}, y has shape {y.shape}"
        )
    # A sample counts as correct only when every entry of its column matches.
    correct = int(np.count_nonzero(np.all(y_pre == y, axis=0)))
    return round(correct / y.shape[1], 5)

#### b应该是(1,n)，即每层b相同
#### 也有人不这样做
#### 实际上肯定是层内b不同收敛效率高

In [70]:
def backward(a2, a1, y, x, w1, b1, w2, b2, z1, z2):
    """Back-propagate through the two-layer network.

    The output error is taken as ``(a2 - y) / n_samples``, which is the
    gradient with respect to the output pre-activations of a mean
    cross-entropy loss on a softmax output (the loss value itself is never
    computed in this file).

    Args:
        a2: output probabilities, shape (n_outputs, n_samples).
        a1: hidden activations, shape (n_hidden, n_samples).
        y: one-hot targets, same shape as ``a2``.
        x: inputs, shape (n_inputs, n_samples).
        w2: output-layer weights, shape (n_outputs, n_hidden).
        z1: hidden pre-activations, same shape as ``a1``.
        w1, b1, b2, z2: unused; kept so existing callers keep working.

    Returns:
        Tuple ``(dw1, db1, dw2, db2)`` shaped like w1, b1, w2 and b2.
    """
    n_samples = x.shape[1]
    dz2 = (a2 - y) / n_samples
    dw2 = np.dot(dz2, a1.T)
    # keepdims keeps the bias gradient a column vector for any layer width
    # (the former reshape(10, 1) only worked for layers of exactly 10 units).
    db2 = np.sum(dz2, axis=1, keepdims=True)
    relu_grad = z1 > 0  # ReLU derivative: 1 where the unit was active, else 0
    dz1 = np.dot(w2.T, dz2) * relu_grad
    dw1 = np.dot(dz1, x.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)
    return dw1, db1, dw2, db2

In [71]:
def update_params(dw1, db1, dw2, db2, w1, b1, w2, b2, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter ``p`` with gradient ``g`` becomes ``p - alpha * g``.

    Returns:
        Tuple ``(w1, b1, w2, b2)`` holding the updated values.
    """
    parameters = (w1, b1, w2, b2)
    gradients = (dw1, db1, dw2, db2)
    return tuple(
        param - alpha * grad for param, grad in zip(parameters, gradients)
    )

In [72]:
def gradient_descent(x, y, iter = 20, alpha = 0.1):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        x: training inputs, one sample per column.
        y: one-hot training targets, one sample per column.
        iter: number of update steps. The name shadows the builtin ``iter``
            but is kept because callers pass it by keyword.
        alpha: learning rate.

    Returns:
        Tuple ``(w1, b1, w2, b2)`` of trained parameters.
    """
    w1, b1, w2, b2 = init_parameters()
    for i in range(iter):
        a1, a2, z1, z2 = forward(x, w1, b1, w2, b2)
        if i % 10 == 0:
            # Predictions are only needed for the progress line, so they are
            # computed on logging iterations only instead of on every step.
            y_pre = predict(a2)
            print(f"iteration :{i}, accuracy: {accuracy(y_pre, y)}")
        dw1, db1, dw2, db2 = backward(a2, a1, y, x, w1, b1, w2, b2, z1, z2)
        w1, b1, w2, b2 = update_params(dw1, db1, dw2, db2, w1, b1, w2, b2, alpha)
    return w1, b1, w2, b2

In [73]:
# Train on the whole training set for 500 iterations with learning rate 0.1;
# accuracy on the training data is printed every 10th iteration.
w1, b1, w2, b2 = gradient_descent(train_x, train_y, iter = 500, alpha = 0.1);

iteration :0, accuracy: 0.09234
iteration :10, accuracy: 0.22648
iteration :20, accuracy: 0.29192
iteration :30, accuracy: 0.33382
iteration :40, accuracy: 0.36482
iteration :50, accuracy: 0.3973
iteration :60, accuracy: 0.43698
iteration :70, accuracy: 0.47722
iteration :80, accuracy: 0.52384
iteration :90, accuracy: 0.57558
iteration :100, accuracy: 0.61448
iteration :110, accuracy: 0.64626
iteration :120, accuracy: 0.67054
iteration :130, accuracy: 0.69182
iteration :140, accuracy: 0.70938
iteration :150, accuracy: 0.72472
iteration :160, accuracy: 0.73786
iteration :170, accuracy: 0.7492
iteration :180, accuracy: 0.75884
iteration :190, accuracy: 0.7679
iteration :200, accuracy: 0.77568
iteration :210, accuracy: 0.78266
iteration :220, accuracy: 0.78882
iteration :230, accuracy: 0.79478
iteration :240, accuracy: 0.79942
iteration :250, accuracy: 0.80376
iteration :260, accuracy: 0.80806
iteration :270, accuracy: 0.81216
iteration :280, accuracy: 0.81554
iteration :290, accuracy: 0.

In [74]:
# Evaluate the trained parameters on the held-out test split.
a1, a2, z1, z2 = forward(x=test_x, w1=w1, b1=b1, w2=w2, b2=b2)
y_pre = predict(a2=a2)
print(f"accuracy: {accuracy(y_pre=y_pre, y=test_y)}")

accuracy: 0.8552
