In [1]:
import json
import sys
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import gzip
import pickle
import random

In [2]:
# Cross-entropy cost
class CrossEntropyCost(object):
    """Cross-entropy cost function and its output-layer error term."""

    @staticmethod
    def fn(a, y):
        """Return the cross-entropy cost of output ``a`` against target ``y``.

        The element-wise terms go through np.nan_to_num before they are
        summed.  This matters when ``a`` and ``y`` are both 1.0 in the
        same slot: (1-y)*np.log(1-a) is then ``0 * -inf == nan``, and
        np.nan_to_num turns it into the correct contribution, 0.0.

        """
        target_on_term = -y*np.log(a)
        target_off_term = (1-y)*np.log(1-a)
        per_slot_cost = np.nan_to_num(target_on_term - target_off_term)
        return np.sum(per_slot_cost)

    @staticmethod
    def delta(z, a, y):
        """Return the output-layer error, ``a - y``.

        ``z`` is accepted but never read; it is present only so that all
        cost classes expose the same ``delta(z, a, y)`` signature.

        """
        output_error = a - y
        return output_error

In [3]:
# Quadratic (squared-error) cost
class QuadraticCost(object):
    """Quadratic cost function and its output-layer error term."""

    @staticmethod
    def fn(a, y):
        """Return half the squared Euclidean distance between output ``a``
        and desired output ``y``.

        """
        distance = np.linalg.norm(a-y)
        return 0.5*distance**2

    @staticmethod
    def delta(z, a, y):
        """Return the output-layer error: ``(a - y)`` scaled element-wise by
        the sigmoid derivative evaluated at the weighted input ``z``."""
        output_error = a - y
        return output_error * sigmoid_prime(z)

In [4]:
def vectorized_result(j):
    """Return a (10, 1) integer one-hot column vector for digit ``j``.

    The result holds a 1 in row ``j`` and 0 elsewhere; it is used to turn
    a digit label (0...9) into the desired output of the network.

    ``j`` may be a plain Python int or a NumPy integer scalar (as found
    in the MNIST label arrays).  It is coerced with ``np.asarray`` first,
    so built-in ints, which have no ``.astype`` method, are accepted too.
    An index outside the vector raises IndexError, as NumPy indexing does.
    """
    one_hot = np.zeros((10, 1), dtype=int)
    # np.asarray gives every accepted input type an .astype method.
    index = np.asarray(j).astype(int)
    one_hot[index] = 1
    return one_hot


def backprop(x, y, biases, weights, num_layers, cost):
    """Compute the gradient of the cost for one training example.

    ``x`` is a single input and ``y`` its desired output.  ``biases`` and
    ``weights`` are the per-layer parameter lists, ``num_layers`` is the
    layer count used to bound the backward loop, and ``cost`` supplies
    ``delta(z, a, y)`` for the output layer.

    Returns a tuple ``(nabla_b, nabla_w)`` of layer-by-layer lists of
    numpy arrays with the same shapes as ``biases`` and ``weights``.
    """
    # Gradient holders, zero-filled and shaped like the parameters.
    grad_b = [np.zeros(b.shape) for b in biases]
    grad_w = [np.zeros(w.shape) for w in weights]

    # ---- Forward pass ----
    # layer_outputs[i] is the activation leaving layer i (the input x is
    # layer 0); weighted_inputs holds every z = w . a + b, layer by layer.
    layer_outputs = [x]
    weighted_inputs = []
    for bias, weight in zip(biases, weights):
        z = np.dot(weight, layer_outputs[-1])+bias
        weighted_inputs.append(z)
        # For large positive z, sigmoid(z) is close to 1 and for large
        # negative z it is close to 0, so a sigmoid neuron behaves much
        # like a perceptron there; only for moderate z does it differ
        # noticeably.
        layer_outputs.append(sigmoid(z))

    # ---- Backward pass ----
    # Error of the output layer, as defined by the chosen cost.
    delta = cost.delta(weighted_inputs[-1], layer_outputs[-1], y)
    grad_b[-1] = delta
    grad_w[-1] = np.dot(delta, layer_outputs[-2].transpose())

    # ``depth`` counts layers from the end: 1 is the last layer of
    # neurons, 2 the second-last, and so on, which lets Python's
    # negative list indices address the layers directly.
    for depth in range(2, num_layers):
        slope = sigmoid_prime(weighted_inputs[-depth])
        delta = np.dot(weights[-depth+1].transpose(), delta) * slope
        grad_b[-depth] = delta
        grad_w[-depth] = np.dot(delta, layer_outputs[-depth-1].transpose())
    return (grad_b, grad_w)

#### Miscellaneous functions
def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``; ``z`` may be a scalar or a
    numpy array, in which case it is applied element-wise."""
    denominator = 1.0+np.exp(-z)
    return 1.0/denominator

def sigmoid_prime(z):
    """First derivative of the sigmoid, ``sigmoid(z) * (1 - sigmoid(z))``."""
    s = sigmoid(z)
    return s*(1-s)

def feedforward(a, biases, weights):
    """Propagate input ``a`` through every layer and return the final
    activation of the network."""
    activation = a
    for bias, weight in zip(biases, weights):
        weighted_input = np.dot(weight, activation)+bias
        activation = sigmoid(weighted_input)
    return activation


In [5]:
# Load data
# The context manager closes the gzip handle even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load an MNIST archive obtained from a trusted source.
with gzip.open('data_sets/mnist.pkl.gz', "rb") as f:
    training_data, validation_data, test_data = pickle.load(f, encoding='iso-8859-1')

# Training images: turn the (50000, 784) array into a list of 50000 column vectors of shape (784, 1).
training_inputs = [np.reshape(x, (784, 1)) for x in training_data[0]]
# Training labels: turn the (50000,) digit array into a list of 50000 one-hot vectors of shape (10, 1).
training_results = [vectorized_result(y) for y in training_data[1]]
# Pair each image with its one-hot label: a list of 50000 (input, target) tuples.
vtraining_data = list(zip(training_inputs, training_results))

# Validation set: inputs are reshaped to (784, 1); labels stay as the raw digits.
# Note that ``validation_data`` is rebound from the loaded tuple to a list of (input, label) pairs.
validation_inputs = [np.reshape(x, (784, 1)) for x in validation_data[0]]
validation_data = list(zip(validation_inputs, validation_data[1]))

# Test set: same treatment as the validation set; ``test_data`` is rebound likewise.
test_inputs = [np.reshape(x, (784, 1)) for x in test_data[0]]
test_data = list(zip(test_inputs, test_data[1]))

# print(len(training_inputs))
# print(len(validation_inputs))
# print(len(test_inputs))
 
# 显示数据集图片
# img = training_inputs[0]
# img = validation_inputs[0]
# img = training_inputs[1]
# img = img.reshape(28,-1)
# print(type(img))

# 设置图形
# plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
# plt.rcParams['image.interpolation'] = 'nearest'
# plt.rcParams['image.cmap'] = 'gray'
# plt.imshow(img)

# Build a three-layer network: 784 input neurons, 30 hidden neurons, 10 output neurons.
sizes = [784,30,10]
num_layers = len(sizes)
# Cost class used by backprop (output-layer delta) and by the cost monitoring below.
cost=CrossEntropyCost

# Disabled alternative initialisation; the weights line scales each matrix by 1/sqrt(fan-in).
# biases = [np.random.randn(y, 1) for y in sizes[1:]]
# weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(sizes[:-1], sizes[1:])]
# Biases for the hidden and output layers: a list of 2 arrays with shapes (30, 1) and (10, 1),
# filled with Gaussian random numbers of mean 0 and standard deviation 1.
biases = [np.random.randn(y, 1) for y in sizes[1:]]
# Weights between consecutive layers: a list of 2 arrays with shapes (30, 784) and (10, 30),
# filled with Gaussian random numbers of mean 0 and standard deviation 1 (no fan-in scaling).
weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

# print(sizes, num_layers, biases, weights)

# Number of training epochs (full passes over the training set).
epochs = 5
# Number of samples in each mini-batch.
mini_batch_size = 10
# Learning rate.
eta = 3.0
# Regularization parameter.
lmbda = 0.1
# Data set used for the "evaluation" cost/accuracy reports.
evaluation_data=test_data
# Report the cost on the evaluation data after each epoch.
monitor_evaluation_cost=True
# Report the accuracy on the evaluation data after each epoch.
monitor_evaluation_accuracy=True
# Report the cost on the training data after each epoch.
monitor_training_cost=True
# Report the accuracy on the training data after each epoch.
monitor_training_accuracy=True

# Record the evaluation-set size when evaluation data is present
# (``n_data`` stays undefined otherwise).
if evaluation_data:
    n_data = len(evaluation_data)
# Number of training samples.
n = len(vtraining_data)
# Per-epoch histories filled in by the training loop.
evaluation_cost, evaluation_accuracy = [], []
training_cost, training_accuracy = [], []

# Training: run ``epochs`` passes of mini-batch stochastic gradient descent,
# then report the monitored cost/accuracy figures after each pass.
for j in range(epochs):
    # epochss.append(epochs)
    # Shuffle the training data in place so every epoch sees different mini-batches.
    random.shuffle(vtraining_data)
    # Slice the shuffled data into consecutive mini-batches of ``mini_batch_size`` samples.
    mini_batches = [
        vtraining_data[k:k+mini_batch_size]
        for k in range(0, n, mini_batch_size)]
    # One parameter update per mini-batch.
    for mini_batch in mini_batches:
        # print(mini_batch)
        
        # Zero-filled accumulator for the bias gradients of this mini-batch.
        nabla_b = [np.zeros(b.shape) for b in biases]
        # Zero-filled accumulator for the weight gradients of this mini-batch.
        nabla_w = [np.zeros(w.shape) for w in weights]
        
        for x, y in mini_batch:
            # Backpropagate one sample and add its gradient to the running sums.
            delta_nabla_b, delta_nabla_w = backprop(x, y, biases, weights, num_layers, cost)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        
        # Weight update with weight decay as the regularizer against overfitting:
        # each weight is first shrunk by the factor (1 - eta*lmbda/n), then the
        # summed gradient, scaled by eta/len(mini_batch), is subtracted
        # (the gradient-descent step).
        weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw for w, nw in zip(weights, nabla_w)]
        # Biases are updated without the decay factor.  In practice regularizing
        # them changes the results very little, so whether to do it is largely a
        # matter of convention.  A large bias does not make a neuron sensitive to
        # its inputs the way large weights do, so large biases are not a concern
        # for learning noise in the training data.  Allowing large biases also
        # makes the network more flexible: they make it easier for neurons to
        # saturate, which is sometimes desirable.  Hence biases are usually not
        # regularized.
        biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(biases, nabla_b)]
        
    print ("Epoch %s training complete" % j)
    
    # Monitor the cost on the training data.
    if monitor_training_cost:
        Tcost = 0.0
        for x, y in vtraining_data:
            a = feedforward(x, biases, weights)
            # if convert: y = vectorized_result(y)
            
            # Cross-entropy measures the distance between the actual output and the
            # desired output: the smaller it is, the closer the two distributions.
            # np.sum(np.nan_to_num(-y*np.log(a)-(1-y)*np.log(1-a)))
            # Accumulate each sample's cost divided by the data-set size, i.e. the mean cost.
            Tcost += cost.fn(a, y)/len(vtraining_data)
        
        # Add the regularization term: lmbda/(2n) times the sum of squared weights.
        Tcost += 0.5*(lmbda/len(vtraining_data))*sum(np.linalg.norm(w)**2 for w in weights)
        # Record the training cost for this epoch.
        training_cost.append(Tcost)
        
        print ("Cost on training data: {}".format(Tcost))
    if monitor_training_accuracy:
        # Feed the training inputs through the updated network; the training targets are
        # one-hot vectors, so both prediction and target are reduced with argmax.
        results = [(np.argmax(feedforward(x, biases, weights)), np.argmax(y)) for (x, y) in vtraining_data]
        # Count the samples whose predicted digit equals the target digit.
        accuracy1 = sum(int(x == y) for (x, y) in results)
        training_accuracy.append(accuracy1)
        print ("Accuracy on training data: : {} / {}".format(accuracy1, n))
    # Monitor the cost on the evaluation data; its labels are raw digits, so each one
    # is converted to a one-hot vector before the cost is computed.
    if monitor_evaluation_cost:
        Tcost2 = 0.0
        for c, v in evaluation_data:
            o = feedforward(c, biases, weights)
            v = vectorized_result(v)
            Tcost2 += cost.fn(o, v)/len(evaluation_data)
        # Regularization term, here normalised by the evaluation-set size.
        Tcost2 += 0.5*(lmbda/len(evaluation_data))*sum(np.linalg.norm(w)**2 for w in weights)
        evaluation_cost.append(Tcost2)
        print ("Cost on evaluation data: {}".format(Tcost2))
    # Monitor the accuracy on the evaluation data: the predicted digit (argmax of the
    # output) is compared directly with the raw digit label.
    if monitor_evaluation_accuracy:
        results = [(np.argmax(feedforward(x, biases, weights)), y) for (x, y) in evaluation_data]
        accuracy = sum(int(x == y) for (x, y) in results)
        evaluation_accuracy.append(accuracy)
        print ("Accuracy on evaluation data: {} / {}".format(accuracy, n_data))
    



Epoch 0 training complete
Cost on training data: 0.7908196016850911
Accuracy on training data: : 43927 / 50000
Cost on evaluation data: 0.9243150918820529
Accuracy on evaluation data: 8824 / 10000
Epoch 1 training complete
Cost on training data: 0.6501215336870318
Accuracy on training data: : 45504 / 50000
Cost on evaluation data: 0.8396558155714646
Accuracy on evaluation data: 9099 / 10000
Epoch 2 training complete
Cost on training data: 0.6996304236443324
Accuracy on training data: : 45352 / 50000
Cost on evaluation data: 0.946881925627924
Accuracy on evaluation data: 9040 / 10000
Epoch 3 training complete
Cost on training data: 0.5972582326726212
Accuracy on training data: : 45673 / 50000
Cost on evaluation data: 0.863305249183705
Accuracy on evaluation data: 9115 / 10000
Epoch 4 training complete
Cost on training data: 0.5062093225616258
Accuracy on training data: : 46657 / 50000
Cost on evaluation data: 0.8076429407178052
Accuracy on evaluation data: 9299 / 10000
