## 准备数据

In [1]:
import os

# TF_CPP_MIN_LOG_LEVEL is consumed by TensorFlow's native logging layer, so it
# is exported *before* tensorflow is imported; setting it afterwards may be too
# late to silence the messages emitted during import/initialisation.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # or any {'0', '1', '2'}

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers, datasets

def mnist_dataset():
    """Load MNIST through Keras and rescale the images.

    Returns:
        ((x, y), (x_test, y_test)) where the image arrays have been divided
        by 255.0 and the label arrays are returned exactly as loaded.
    """
    train_split, test_split = datasets.mnist.load_data()
    train_images, train_targets = train_split
    test_images, test_targets = test_split

    # Normalise pixel intensities by the constant 255.0.
    pixel_scale = 255.0
    train_images = train_images / pixel_scale
    test_images = test_images / pixel_scale

    return (train_images, train_targets), (test_images, test_targets)

## Demo: NumPy-based automatic differentiation

In [2]:
import numpy as np

class Matmul:
    """Matrix-product node ``h = x @ W`` with a hand-written backward pass."""

    def __init__(self):
        # Operands of the most recent forward() call, needed by backward().
        self.mem = {}

    def forward(self, x, W):
        """Return ``np.matmul(x, W)`` and remember both operands.

        x: shape (N, d)
        W: shape (d, d')
        """
        product = np.matmul(x, W)
        self.mem = {'x': x, 'W': W}
        return product

    def backward(self, grad_y):
        """Propagate the upstream gradient to both operands.

        grad_y: shape (N, d'), gradient of the loss w.r.t. the forward output.

        Returns ``(grad_x, grad_W)`` with shapes (N, d) and (d, d').
        """
        cached_x, cached_W = self.mem['x'], self.mem['W']

        # For h = x W:  dL/dx = dL/dh @ W^T   and   dL/dW = x^T @ dL/dh.
        # Note the transposes: each gradient must match its operand's shape.
        wrt_x = np.matmul(grad_y, cached_W.T)
        wrt_W = np.matmul(cached_x.T, grad_y)
        return wrt_x, wrt_W


class Relu:
    """Element-wise rectified linear unit with a hand-written backward pass."""

    def __init__(self):
        # Holds the input of the most recent forward() call under key 'x'.
        self.mem = {}

    def forward(self, x):
        """Return ``x`` where it is strictly positive and 0 elsewhere."""
        self.mem['x'] = x
        is_positive = x > 0
        return np.where(is_positive, x, np.zeros_like(x))

    def backward(self, grad_y):
        """Gate the upstream gradient by the sign of the cached input.

        grad_y: same shape as the forward input.
        """
        pre_activation = self.mem['x']

        # d relu(x)/dx is 1 for x > 0 and 0 for x <= 0, so the upstream
        # gradient passes through only at strictly positive inputs.
        gate = np.greater(pre_activation, 0).astype(np.float32)
        return gate * grad_y
    


class Softmax:
    '''
    Softmax over the last dimension, with a hand-written backward pass.
    '''
    def __init__(self):
        # Added to the normaliser so the division can never be by exactly 0.
        self.epsilon = 1e-12
        self.mem = {}

    def forward(self, x):
        '''
        x: shape (..., c); typically (N, c).

        Returns an array of the same shape whose last axis sums to ~1.
        '''
        # Subtract the per-row maximum before exponentiating. Softmax is
        # invariant to this shift, and it keeps np.exp from overflowing to
        # inf (which would turn the output into NaN) for large logits.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        x_exp = np.exp(shifted)
        partition = np.sum(x_exp, axis=-1, keepdims=True)
        out = x_exp / (partition + self.epsilon)

        self.mem['out'] = out
        # Exponentials of the max-shifted input (not of the raw input).
        self.mem['x_exp'] = x_exp
        return out

    def backward(self, grad_y):
        '''
        grad_y: same shape as x, gradient of the loss w.r.t. the softmax output.

        Returns the gradient w.r.t. x (same shape).
        '''
        s = self.mem['out']
        # Jacobian-vector product with J_ij = s_i * (delta_ij - s_j):
        #   grad_x_j = s_j * (grad_y_j - sum_i grad_y_i * s_i)
        # Same result as building the full (N, c, c) outer product s s^T,
        # but without materialising it.
        weighted_sum = np.sum(grad_y * s, axis=-1, keepdims=True)
        return s * (grad_y - weighted_sum)
    
class Log:
    '''
    Element-wise natural logarithm with a small epsilon added to the input,
    plus a hand-written backward pass.
    '''
    def __init__(self):
        # Offset added to the input in both forward() and backward().
        self.epsilon = 1e-12
        self.mem = {}

    def forward(self, x):
        '''
        x: shape(N, c)

        Returns log(x + epsilon), with the argument floored at 1e-12.
        '''
        # np.log is the natural logarithm; the clip keeps its argument
        # strictly positive even when x + epsilon is not.
        out = np.log(np.clip(x + self.epsilon, 1e-12, None))

        self.mem['x'] = x
        return out

    def backward(self, grad_y):
        '''
        grad_y: same shape as x

        Returns grad_y / (x + epsilon), using the x cached by forward().
        '''
        x = self.mem['x']

        # Use the same epsilon as forward() instead of a separate literal so
        # the two passes stay consistent if self.epsilon is changed.
        return 1. / (x + self.epsilon) * grad_y
    


## Gradient check

In [3]:
import tensorflow as tf

# Gradient check for Matmul: compare the hand-written backward pass with
# TensorFlow's autodiff on the same random operands.
x = np.random.normal(size=[5, 6])
W = np.random.normal(size=[6, 4])
aa = Matmul()
out = aa.forward(x, W) # shape(5, 4)
# An upstream gradient of ones corresponds to loss = sum(out).
# `grad` is the tuple (grad_x, grad_W).
grad = aa.backward(np.ones_like(out))
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x and W are rebound to tensors here and stay that way afterwards.
    x, W = tf.constant(x), tf.constant(W)
    # Only x is watched, so only grad_x is compared against TensorFlow.
    tape.watch(x)
    y = tf.matmul(x, W)
    loss = tf.reduce_sum(y)
    grads = tape.gradient(loss, x)
    print (grads)


(array([[-2.4636022 ,  1.80103617,  1.99570916, -0.38092546,  1.71228433,
        -0.54229294],
       [-2.4636022 ,  1.80103617,  1.99570916, -0.38092546,  1.71228433,
        -0.54229294],
       [-2.4636022 ,  1.80103617,  1.99570916, -0.38092546,  1.71228433,
        -0.54229294],
       [-2.4636022 ,  1.80103617,  1.99570916, -0.38092546,  1.71228433,
        -0.54229294],
       [-2.4636022 ,  1.80103617,  1.99570916, -0.38092546,  1.71228433,
        -0.54229294]]), array([[-2.88993631, -2.88993631, -2.88993631, -2.88993631],
       [-3.10542765, -3.10542765, -3.10542765, -3.10542765],
       [ 0.61375094,  0.61375094,  0.61375094,  0.61375094],
       [-4.2787534 , -4.2787534 , -4.2787534 , -4.2787534 ],
       [-0.14273893, -0.14273893, -0.14273893, -0.14273893],
       [ 2.03290746,  2.03290746,  2.03290746,  2.03290746]]))
tf.Tensor(
[[-2.4636022   1.80103617  1.99570916 -0.38092546  1.71228433 -0.54229294]
 [-2.4636022   1.80103617  1.99570916 -0.38092546  1.71228433 -0.542

In [4]:

import tensorflow as tf

# Gradient check for Relu against TensorFlow's autodiff.
x = np.random.normal(size=[5, 6])
aa = Relu()
out = aa.forward(x) # shape (5, 6), same as x
# An upstream gradient of ones corresponds to loss = sum(out).
grad = aa.backward(np.ones_like(out))
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x is rebound to a tensor here and stays that way afterwards.
    x= tf.constant(x)
    tape.watch(x)
    y = tf.nn.relu(x)
    loss = tf.reduce_sum(y)
    grads = tape.gradient(loss, x)
    print (grads)


[[1. 1. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1.]]
tf.Tensor(
[[1. 1. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1.]], shape=(5, 6), dtype=float64)


In [5]:

import tensorflow as tf
# Gradient check for Softmax against TensorFlow's autodiff.
# Inputs are drawn with loc=1 and scale=5.0.
x = np.random.normal(size=[5, 6], scale=5.0, loc=1)
# `label` doubles as the upstream gradient: loss = sum(softmax(x) * label).
# Row 1 deliberately has two non-zero entries.
label = np.zeros_like(x)
label[0, 1]=1.
label[1, 0]=1
label[1, 1]=1
label[2, 3]=1
label[3, 5]=1
label[4, 0]=1
print(label)
aa = Softmax()
out = aa.forward(x) # shape(5, 6)
grad = aa.backward(label)
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x is rebound to a tensor here and stays that way afterwards.
    x= tf.constant(x)
    tape.watch(x)
    y = tf.nn.softmax(x)
    loss = tf.reduce_sum(y*label)
    grads = tape.gradient(loss, x)
    print (grads)


[[0. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]]
[[-2.74965356e-10  5.81656649e-05 -1.82813294e-08 -5.81212343e-05
  -2.58719258e-08 -2.44474335e-12]
 [ 5.12497657e-02  1.18279295e-01 -1.82093117e-04 -1.68685691e-01
  -6.60978876e-04 -2.97057201e-07]
 [-5.45729493e-04 -3.06407696e-05 -5.68152229e-07  8.93217131e-04
  -3.62346212e-05 -2.80044095e-04]
 [-1.64026596e-12 -1.96655794e-08 -4.15381373e-16 -4.41349034e-12
  -2.42612198e-08  4.39328534e-08]
 [ 2.35008010e-04 -1.75962244e-04 -4.26936055e-09 -1.35149532e-07
  -7.96914128e-06 -5.09372062e-05]]
tf.Tensor(
[[-2.74965356e-10  5.81656649e-05 -1.82813294e-08 -5.81212343e-05
  -2.58719258e-08 -2.44474335e-12]
 [ 5.12497657e-02  1.18279295e-01 -1.82093117e-04 -1.68685691e-01
  -6.60978876e-04 -2.97057201e-07]
 [-5.45729493e-04 -3.06407696e-05 -5.68152229e-07  8.93217131e-04
  -3.62346212e-05 -2.80044095e-04]
 [-1.64026596e-12 -1.96655794e-08 -4.15381373e-16 -4.41349034e-12
  -2.426

In [6]:

import tensorflow as tf

# Gradient check for Log against TensorFlow's autodiff.
# NOTE(review): x is a standard-normal draw, so it can contain negative
# values, which are outside the domain of log; only the gradients are compared.
x = np.random.normal(size=[5, 6])
aa = Log()
out = aa.forward(x) # shape (5, 6), same as x
# `label` is not defined in this cell; it is the array left over from the
# Softmax check cell and is used here as the upstream gradient.
grad = aa.backward(label)
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x is rebound to a tensor here and stays that way afterwards.
    x= tf.constant(x)
    tape.watch(x)
    y = tf.math.log(x)
    loss = tf.reduce_sum(y*label)
    grads = tape.gradient(loss, x)
    print (grads)



[[ -0.          -1.72938047   0.           0.           0.
   -0.        ]
 [  2.1304925    0.47598226   0.           0.           0.
   -0.        ]
 [  0.          -0.           0.          -0.64248564   0.
   -0.        ]
 [ -0.           0.          -0.           0.          -0.
   -2.05678731]
 [-12.23128757  -0.          -0.           0.           0.
   -0.        ]]
tf.Tensor(
[[ -0.          -1.72938047   0.           0.           0.
   -0.        ]
 [  2.1304925    0.47598226   0.           0.           0.
   -0.        ]
 [  0.          -0.           0.          -0.64248564   0.
   -0.        ]
 [ -0.           0.          -0.           0.          -0.
   -2.05678731]
 [-12.23128757  -0.          -0.           0.           0.
   -0.        ]], shape=(5, 6), dtype=float64)


In [7]:
import tensorflow as tf

# Repeat of the Matmul gradient check with a fresh random draw: compare the
# hand-written backward pass with TensorFlow's autodiff.
x = np.random.normal(size=[5, 6])
W = np.random.normal(size=[6, 4])
aa = Matmul()
out = aa.forward(x, W) # shape(5, 4)
# An upstream gradient of ones corresponds to loss = sum(out).
# `grad` is the tuple (grad_x, grad_W).
grad = aa.backward(np.ones_like(out))
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x and W are rebound to tensors here and stay that way afterwards.
    x, W = tf.constant(x), tf.constant(W)
    # Only x is watched, so only grad_x is compared against TensorFlow.
    tape.watch(x)
    y = tf.matmul(x, W)
    loss = tf.reduce_sum(y)
    grads = tape.gradient(loss, x)
    print (grads)


(array([[-2.04200585,  2.69909189,  2.03763021, -2.65600201, -1.39219717,
         2.41633745],
       [-2.04200585,  2.69909189,  2.03763021, -2.65600201, -1.39219717,
         2.41633745],
       [-2.04200585,  2.69909189,  2.03763021, -2.65600201, -1.39219717,
         2.41633745],
       [-2.04200585,  2.69909189,  2.03763021, -2.65600201, -1.39219717,
         2.41633745],
       [-2.04200585,  2.69909189,  2.03763021, -2.65600201, -1.39219717,
         2.41633745]]), array([[-0.28465906, -0.28465906, -0.28465906, -0.28465906],
       [-3.10877052, -3.10877052, -3.10877052, -3.10877052],
       [-0.92527423, -0.92527423, -0.92527423, -0.92527423],
       [ 0.62070863,  0.62070863,  0.62070863,  0.62070863],
       [-1.0259893 , -1.0259893 , -1.0259893 , -1.0259893 ],
       [ 3.57369924,  3.57369924,  3.57369924,  3.57369924]]))
tf.Tensor(
[[-2.04200585  2.69909189  2.03763021 -2.65600201 -1.39219717  2.41633745]
 [-2.04200585  2.69909189  2.03763021 -2.65600201 -1.39219717  2.416

In [8]:

import tensorflow as tf

# Repeat of the Relu gradient check with a fresh random draw.
x = np.random.normal(size=[5, 6])
aa = Relu()
out = aa.forward(x) # shape (5, 6), same as x
# An upstream gradient of ones corresponds to loss = sum(out).
grad = aa.backward(np.ones_like(out))
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x is rebound to a tensor here and stays that way afterwards.
    x= tf.constant(x)
    tape.watch(x)
    y = tf.nn.relu(x)
    loss = tf.reduce_sum(y)
    grads = tape.gradient(loss, x)
    print (grads)



[[0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0.]
 [0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 1. 1. 0.]
 [0. 1. 0. 0. 0. 0.]]
tf.Tensor(
[[0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1. 0.]
 [0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 1. 1. 0.]
 [0. 1. 0. 0. 0. 0.]], shape=(5, 6), dtype=float64)


In [9]:

import tensorflow as tf
# Repeat of the Softmax gradient check with a fresh random draw.
# Inputs are drawn with loc=1 and scale=5.0.
x = np.random.normal(size=[5, 6], scale=5.0, loc=1)
# `label` doubles as the upstream gradient: loss = sum(softmax(x) * label).
# Row 1 deliberately has two non-zero entries.
label = np.zeros_like(x)
label[0, 1]=1.
label[1, 0]=1
label[1, 1]=1
label[2, 3]=1
label[3, 5]=1
label[4, 0]=1
print(label)
aa = Softmax()
out = aa.forward(x) # shape(5, 6)
grad = aa.backward(label)
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x is rebound to a tensor here and stays that way afterwards.
    x= tf.constant(x)
    tape.watch(x)
    y = tf.nn.softmax(x)
    loss = tf.reduce_sum(y*label)
    grads = tape.gradient(loss, x)
    print (grads)



[[0. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]]
[[-3.27499895e-03  8.12916363e-03 -3.91626803e-03 -9.07217181e-04
  -3.06777126e-05 -1.75213264e-09]
 [ 5.10753278e-08  6.21793119e-05 -9.57894200e-08 -5.73405052e-05
  -4.73906468e-06 -5.50278583e-08]
 [-4.16597328e-06 -1.72809315e-11 -7.10652935e-16  4.16599426e-06
  -2.67724614e-15 -3.69944638e-12]
 [-5.27632936e-06 -1.98331179e-05 -4.14196554e-07 -1.55420439e-09
  -4.29242674e-06  2.98176248e-05]
 [ 2.48603403e-01 -4.53264001e-02 -4.95703493e-06 -3.38321934e-05
  -1.94963597e-01 -8.27461690e-03]]
tf.Tensor(
[[-3.27499895e-03  8.12916363e-03 -3.91626803e-03 -9.07217181e-04
  -3.06777126e-05 -1.75213264e-09]
 [ 5.10753278e-08  6.21793119e-05 -9.57894200e-08 -5.73405052e-05
  -4.73906468e-06 -5.50278583e-08]
 [-4.16597328e-06 -1.72809315e-11 -7.10652935e-16  4.16599426e-06
  -2.67724614e-15 -3.69944638e-12]
 [-5.27632936e-06 -1.98331179e-05 -4.14196554e-07 -1.55420439e-09
  -4.292

In [10]:

import tensorflow as tf

# Repeat of the Log gradient check with a fresh random draw.
# NOTE(review): x is a standard-normal draw, so it can contain negative
# values, which are outside the domain of log; only the gradients are compared.
x = np.random.normal(size=[5, 6])
aa = Log()
out = aa.forward(x) # shape (5, 6), same as x
# `label` is not defined in this cell; it is the array left over from the
# preceding Softmax check cell and is used here as the upstream gradient.
grad = aa.backward(label)
print (grad)

with tf.GradientTape() as tape:
    # NOTE: x is rebound to a tensor here and stays that way afterwards.
    x= tf.constant(x)
    tape.watch(x)
    y = tf.math.log(x)
    loss = tf.reduce_sum(y*label)
    grads = tape.gradient(loss, x)
    print (grads)



[[-0.          0.83788002 -0.          0.         -0.         -0.        ]
 [ 0.57278804 -1.03727467  0.          0.          0.          0.        ]
 [-0.          0.          0.          0.91012273 -0.          0.        ]
 [-0.         -0.         -0.          0.         -0.         -0.46430231]
 [ 1.21136885  0.          0.          0.         -0.          0.        ]]
tf.Tensor(
[[-0.          0.83788002 -0.          0.         -0.         -0.        ]
 [ 0.57278804 -1.03727467  0.          0.          0.          0.        ]
 [-0.          0.          0.          0.91012273 -0.          0.        ]
 [-0.         -0.         -0.          0.         -0.         -0.46430231]
 [ 1.21136885  0.          0.          0.         -0.          0.        ]], shape=(5, 6), dtype=float64)


# Final Gradient Check

In [11]:
import tensorflow as tf

# End-to-end gradient check: chain Matmul -> Relu -> Matmul -> Softmax -> Log
# by hand and compare d(loss)/d(prob) with TensorFlow's autodiff.

# Draw this cell's own inputs first, so nothing below depends on whatever `x`
# a previously executed cell happened to leave behind.
x = np.random.normal(size=[5, 6])
W1 = np.random.normal(size=[6, 5])
W2 = np.random.normal(size=[5, 6])

# One non-zero entry per row; shaped like the network output
# (batch size x number of output columns of W2).
label = np.zeros([x.shape[0], W2.shape[1]])
label[0, 1]=1.
label[1, 0]=1
label[2, 3]=1
label[3, 5]=1
label[4, 0]=1

mul_h1 = Matmul()
mul_h2 = Matmul()
relu = Relu()
softmax = Softmax()
log = Log()

# Forward pass through the hand-written nodes.
h1 = mul_h1.forward(x, W1) # shape(5, 5)
h1_relu = relu.forward(h1)
h2 = mul_h2.forward(h1_relu, W2) # shape(5, 6)
h2_soft = softmax.forward(h2)
h2_log = log.forward(h2_soft)

# Backward pass in reverse order; `label` is the upstream gradient, which
# corresponds to loss = sum(label * log_prob).
h2_log_grad = log.backward(label)
h2_soft_grad = softmax.backward(h2_log_grad)
h2_grad, W2_grad = mul_h2.backward(h2_soft_grad)
h1_relu_grad = relu.backward(h2_grad)
h1_grad, W1_grad = mul_h1.backward(h1_relu_grad)

print(h2_log_grad)
print('--'*20)

with tf.GradientTape() as tape:
    x, W1, W2, label = tf.constant(x), tf.constant(W1), tf.constant(W2), tf.constant(label)
    tape.watch(W1)
    tape.watch(W2)
    h1 = tf.matmul(x, W1)
    h1_relu = tf.nn.relu(h1)
    h2 = tf.matmul(h1_relu, W2)
    prob = tf.nn.softmax(h2)
    log_prob = tf.math.log(prob)
    loss = tf.reduce_sum(label * log_prob)
    # Gradient w.r.t. the intermediate `prob`, to be compared with h2_log_grad.
    grads = tape.gradient(loss, [prob])
    print (grads[0].numpy())

[[0.00000000e+00 1.23913706e+03 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.43162321e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.18357548e+05
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.04823606e+04]
 [2.73320031e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]
----------------------------------------
[[0.00000000e+00 1.23913706e+03 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.43162321e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.18357596e+05
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.04823622e+04]
 [2.73320031e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


## 建立模型

In [12]:
class myModel:
    """Two-layer perceptron assembled from the hand-written autodiff nodes.

    Pipeline: flatten -> append bias column -> Matmul(W1) -> Relu ->
    Matmul(W2) -> Softmax -> Log. Every intermediate activation and gradient
    is stored as an attribute (e.g. ``h2_log``, ``W1_grad``, ``W2_grad``) so
    callers can read them after ``forward`` / ``backward``.

    Args:
        input_dim: number of features per example after flattening
            (default 28*28).
        hidden_dim: width of the hidden layer (default 100).
        num_classes: number of output classes (default 10).
    """

    def __init__(self, input_dim=28*28, hidden_dim=100, num_classes=10):
        self.input_dim = input_dim

        # W1 has one extra input row that acts as the first-layer bias:
        # forward() appends a column of ones to the flattened input.
        self.W1 = np.random.normal(size=[input_dim + 1, hidden_dim])
        self.W2 = np.random.normal(size=[hidden_dim, num_classes])

        self.mul_h1 = Matmul()
        self.mul_h2 = Matmul()
        self.relu = Relu()
        self.softmax = Softmax()
        self.log = Log()

    def forward(self, x):
        """Run the forward pass and cache every intermediate activation.

        x: array reshapeable to (N, input_dim).

        Returns the log-probabilities, shape (N, num_classes); the same array
        is also stored as ``self.h2_log``.
        """
        x = x.reshape(-1, self.input_dim)
        bias = np.ones(shape=[x.shape[0], 1])
        x = np.concatenate([x, bias], axis=1)

        self.h1 = self.mul_h1.forward(x, self.W1)  # shape (N, hidden_dim)
        self.h1_relu = self.relu.forward(self.h1)
        self.h2 = self.mul_h2.forward(self.h1_relu, self.W2)  # shape (N, num_classes)
        self.h2_soft = self.softmax.forward(self.h2)
        self.h2_log = self.log.forward(self.h2_soft)
        return self.h2_log

    def backward(self, label):
        """Back-propagate through the nodes in reverse order.

        label: array with the same shape as the log-probabilities.

        The upstream gradient is ``-label``, i.e. the gradient of
        ``-sum(label * log_prob)`` summed (not averaged) over the batch.
        Results are stored in ``self.W1_grad`` and ``self.W2_grad``; must be
        called after ``forward``.
        """
        self.h2_log_grad = self.log.backward(-label)
        self.h2_soft_grad = self.softmax.backward(self.h2_log_grad)
        self.h2_grad, self.W2_grad = self.mul_h2.backward(self.h2_soft_grad)
        self.h1_relu_grad = self.relu.backward(self.h2_grad)
        self.h1_grad, self.W1_grad = self.mul_h1.backward(self.h1_relu_grad)

model = myModel()


## 计算 loss

In [13]:
def compute_loss(log_prob, labels):
    """Mean over examples of the label-weighted negative log-probability.

    log_prob: shape (N, c) log-probabilities.
    labels:   shape (N, c) weights (one-hot in this notebook).
    """
    per_example_nll = np.sum(-log_prob * labels, axis=1)
    return np.mean(per_example_nll)
    

def compute_accuracy(log_prob, labels):
    """Fraction of rows whose arg-max in `log_prob` equals the arg-max in `labels`.

    Both arguments have shape (N, c); the arg-max is taken along axis 1.
    """
    predicted_class = np.argmax(log_prob, axis=1)
    true_class = np.argmax(labels, axis=1)
    hits = predicted_class == true_class
    return np.mean(hits)

def train_one_step(model, x, y, learning_rate=1e-5):
    """Run one gradient-descent step on `model` and report loss and accuracy.

    Args:
        model: object exposing forward(x), backward(y) and the attributes
            W1, W2, W1_grad, W2_grad and h2_log.
        x: input batch passed to model.forward.
        y: target array passed to model.backward and to the metrics.
        learning_rate: step size applied to both weight matrices
            (default 1e-5, the value previously hard-coded here).

    Returns:
        (loss, accuracy), computed from the activations of the forward pass
        that ran *before* the weight update.
    """
    model.forward(x)
    model.backward(y)
    # In-place update, so the arrays referenced by model.W1 / model.W2 change.
    model.W1 -= learning_rate * model.W1_grad
    model.W2 -= learning_rate * model.W2_grad
    loss = compute_loss(model.h2_log, y)
    accuracy = compute_accuracy(model.h2_log, y)
    return loss, accuracy

def test(model, x, y):
    """Evaluate `model` on (x, y) without updating its weights.

    Runs model.forward(x), then returns (loss, accuracy) computed from the
    resulting model.h2_log and the targets y.
    """
    model.forward(x)
    log_prob = model.h2_log
    return compute_loss(log_prob, y), compute_accuracy(log_prob, y)

## 实际训练

In [16]:
# Load MNIST and one-hot encode the integer class labels by looking each one
# up as a row of the 10x10 identity matrix.
train_data, test_data = mnist_dataset()
train_label = np.eye(10)[np.array(train_data[1])]
test_label = np.eye(10)[np.array(test_data[1])]

# Each "epoch" is a single full-batch gradient step on the whole training set.
# NOTE(review): the recorded output already starts at ~76% accuracy, which
# suggests this cell was run on a `model` that had been trained before —
# confirm with Restart & Run All.
for epoch in range(50):
    loss, accuracy = train_one_step(model, train_data[0], train_label)
    print('epoch', epoch, ': loss', loss, '; accuracy', accuracy)

# Final evaluation on the held-out test split.
loss, accuracy = test(model, test_data[0], test_label)

print('test loss', loss, '; accuracy', accuracy)

epoch 0 : loss 5.388262811579957 ; accuracy 0.7609666666666667
epoch 1 : loss 5.386883093160101 ; accuracy 0.7617666666666667
epoch 2 : loss 5.402465284092803 ; accuracy 0.7603333333333333
epoch 3 : loss 5.393351168381061 ; accuracy 0.7616833333333334
epoch 4 : loss 5.412196987505331 ; accuracy 0.76035
epoch 5 : loss 5.395024497830504 ; accuracy 0.7616
epoch 6 : loss 5.410223664120058 ; accuracy 0.7598166666666667
epoch 7 : loss 5.38128559378601 ; accuracy 0.7624666666666666
epoch 8 : loss 5.383882364628721 ; accuracy 0.7610166666666667
epoch 9 : loss 5.341240117149945 ; accuracy 0.7643666666666666
epoch 10 : loss 5.3278874927143205 ; accuracy 0.7638166666666667
epoch 11 : loss 5.280329354121197 ; accuracy 0.767
epoch 12 : loss 5.252856342101264 ; accuracy 0.7675833333333333
epoch 13 : loss 5.210260919176116 ; accuracy 0.7706333333333333
epoch 14 : loss 5.187510274673751 ; accuracy 0.77095
epoch 15 : loss 5.15782066191714 ; accuracy 0.77335
epoch 16 : loss 5.140615965386487 ; accuracy 