In [1]:
%matplotlib inline

# Python实现神经网络
推荐资料：Michael Nielsen的[*Neural Networks and Deep Learning*](http://neuralnetworksanddeeplearning.com/)
<br>

In [2]:
# -*- coding: utf-8 -*-
# !/usr/bin/env python

from __future__ import print_function

In [3]:
import numpy as np
from collections import OrderedDict
import time

高级张量乘法

In [4]:
def tensorMul(tensor1, tensor2, axis):
    """Per-sample product of two batched tensors after adding a singleton axis to each.

    A length-1 axis is inserted into each tensor, then ``np.dot`` is applied to
    every pair of samples along the leading (batch) axis. With
    ``axis=(-1, 1)`` and inputs of shape ``(batch, m)`` and ``(batch, n)`` the
    result holds one outer product per sample, shape ``(batch, m, n)``.

    Args:
        tensor1: array whose first axis is the batch axis.
        tensor2: array whose first axis is the batch axis; only its first
            ``tensor1.shape[0]`` samples are read.
        axis: pair ``(a_extend, b_extend)`` giving the position at which the
            singleton axis is inserted into ``tensor1`` and ``tensor2``
            respectively; ``-1`` means "append as the last axis".

    Returns:
        ``np.ndarray`` stacking the per-sample ``np.dot`` results.
    """
    (a_extend, b_extend) = axis

    def extendTensor(tensor, index):
        # Build the new shape with a 1 inserted at `index`; -1 is special-cased
        # because list.insert(-1, ...) would insert *before* the last entry.
        shape = list(tensor.shape)
        index = len(shape) if index == -1 else index
        shape.insert(index, 1)
        # Shape is passed positionally: the `newshape=` keyword is deprecated
        # in recent NumPy releases.
        return np.reshape(tensor, shape)

    tensor1 = extendTensor(tensor1, a_extend)
    tensor2 = extendTensor(tensor2, b_extend)
    batch_size = tensor1.shape[0]
    # `range` works on both Python 2 and 3 (`xrange` is Python-2-only).
    tensor = np.asarray([np.dot(tensor1[i], tensor2[i]) for i in range(batch_size)])
    return tensor

## 全连接网络层

In [5]:
class Dense:
    """Fully connected layer: ``activation(x . weight + bias)``.

    Weights and biases start at zero. The layer caches its last input and
    pre-activation output in ``forward`` so that ``backprop`` can use them.
    """

    def __init__(self, input_size, output_size, bias_available=True, activation=None):
        """Create the layer.

        Args:
            input_size: length of the last axis of the expected input.
            output_size: number of output units.
            bias_available: whether a bias vector is added to the output.
            activation: callable ``f(x, diff=False)`` returning the activation
                (or its element-wise derivative when ``diff=True``). ``None``
                selects the identity, so the layer stays usable by default.
        """
        self.mapping_size = (input_size, output_size,)
        self.input = None
        # NOTE(review): zero initialisation is fine for a single layer, but
        # stacked layers would presumably need random initialisation
        # (e.g. np.random.randn) to break symmetry.
        self.weight = np.zeros(shape=(input_size, output_size))
        self.bias_available = bias_available
        # `bias` is always defined (None when disabled) so code that reads
        # `layer.bias` does not fail on bias-less layers.
        self.bias = np.zeros(shape=(output_size,)) if self.bias_available else None
        self.activation = activation if activation is not None else self._identity
        self.output = None

    def _identity(self, x, diff=False):
        # Fallback activation used when none was supplied: f(x) = x, f'(x) = 1.
        return np.ones_like(x) if diff else x

    def forward(self, x):
        """Compute the layer output for ``x`` and cache ``x`` and the pre-activation."""
        self.input = x
        self.output = np.matmul(self.input, self.weight)
        if self.bias_available:
            self.output += self.bias
        return self.activation(self.output)

    def backprop(self, gard, optimizer):
        """Compute parameter updates and the gradient handed to the previous layer.

        Must be called after ``forward``, which caches the input and output.

        Args:
            gard: gradient of the cost with respect to this layer's activated
                output (the parameter name is kept as-is for compatibility).
            optimizer: callable turning an averaged gradient into an update
                step that ``updata`` adds to the parameters.

        Returns:
            ``(next_grad, delta_weight, delta_bias)``; ``delta_bias`` is
            ``None`` when the layer has no bias.
        """
        # Chain rule through the activation: dC/dz = dC/da * f'(z).
        dgrad_dy = gard * self.activation(self.output, diff=True)
        # weight: per-sample outer product input x dC/dz, averaged over the batch
        delta_weight = tensorMul(self.input, dgrad_dy, axis=(-1, 1))
        delta_weight = self.average(delta_weight)
        delta_weight = optimizer(delta_weight)
        # previous delta
        next_grad = np.matmul(dgrad_dy, self.weight.T)
        # NOTE(review): averaging here collapses the batch axis, so an upstream
        # layer receives a batch-mean gradient rather than per-sample
        # gradients. Kept to preserve the returned shape; confirm before
        # stacking several layers.
        next_grad = self.average(next_grad)
        # bias
        if self.bias_available:
            delta_bias = self.average(dgrad_dy)
            delta_bias = optimizer(delta_bias)
            return next_grad, delta_weight, delta_bias
        # No bias to update: return an explicit None instead of the undefined
        # local name `_` the original code referenced.
        return next_grad, delta_weight, None

    def updata(self, delta_weight, delta_bias):
        """Add the given update steps to the weight (and bias, if enabled) in place."""
        self.weight += delta_weight
        if self.bias_available:
            self.bias += delta_bias

    def average(self, x, axis=0):
        """Mean of ``x`` along ``axis`` (the batch axis by default)."""
        # Divide by the length of the reduced axis, not unconditionally by
        # shape[0], so non-default axes are averaged correctly.
        return np.sum(x, axis=axis) / float(x.shape[axis])

    def size(self):
        """Return the ``(input_size, output_size)`` pair of this layer."""
        return self.mapping_size


## 激励函数

In [6]:

class Activation:
    """Collection of element-wise activation functions.

    Every method has the signature ``f(x, diff=False)``: it returns the
    activation of ``x``, or the element-wise derivative evaluated at ``x``
    when ``diff=True``.
    """

    def __init__(self):
        pass

    def relu(self, x, diff=False):
        """Rectified linear unit ``max(x, 0)``; works on arrays of any rank."""
        x = np.asarray(x)
        if diff:
            # Derivative is 1 where x > 0 and 0 elsewhere (0 is used at x == 0).
            return (x > 0).astype(float)
        # Vectorised: the previous per-element Python loop failed on 2-D input.
        return np.maximum(x, 0)

    def linear(self, x, diff=False):
        """Identity activation; its derivative is 1 everywhere."""
        if diff:
            return np.ones_like(x)
        else:
            return x

    def sigmoid(self, x, diff=False):
        """Logistic function ``1 / (1 + exp(-x))``."""
        value = 1.0 / (1 + np.exp(-x))
        if diff:
            # sigma'(x) = sigma(x) * (1 - sigma(x)); sigma(x) is computed once.
            return value * (1 - value)
        return value

    def softmax(self, x, diff=False):
        """Softmax over the last axis, so each sample is normalised on its own.

        Raises:
            NotImplementedError: when ``diff=True``. The softmax derivative is
                a full Jacobian per sample, which cannot be expressed as the
                element-wise factor the other activations return.
        """
        if diff:
            raise NotImplementedError(
                'softmax derivative is a Jacobian, not an element-wise factor')
        x = np.asarray(x, dtype=float)
        # Subtracting the row maximum leaves the result unchanged but keeps
        # exp() from overflowing on large inputs.
        x_exp = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return x_exp / np.sum(x_exp, axis=-1, keepdims=True)


## 优化器

In [7]:
class Optimizer:
    """Turns gradients into parameter update steps.

    Each method takes a gradient and returns the step that the caller adds to
    the parameter. Only plain SGD is implemented.
    """

    def __init__(self, learning_rate):
        self.learning_rate = learning_rate

    def SGD(self, grad, **kwargs):
        """Return the gradient-descent step ``-learning_rate * grad``."""
        return -self.learning_rate * grad

    def Adam(self, grad, **kwargs):
        """Placeholder for Adam.

        Raises:
            NotImplementedError: always. Failing here is clearer than
                returning ``None`` and breaking later in the weight update.
        """
        raise NotImplementedError('Adam optimizer is not implemented')

    def Adagrad(self, grad, **kwargs):
        """Placeholder for Adagrad.

        Raises:
            NotImplementedError: always. Failing here is clearer than
                returning ``None`` and breaking later in the weight update.
        """
        raise NotImplementedError('Adagrad optimizer is not implemented')


## 损失函数

In [8]:
class CostFunction:
    """Cost functions with the signature ``f(y_true, y_pred, diff=False)``.

    With ``diff=False`` a scalar cost is returned; with ``diff=True`` the
    gradient of the cost with respect to ``y_pred`` is returned.
    """

    def __init__(self):
        pass

    def MSE(self, y_true, y_pred, diff=False):
        """Squared-error cost.

        Returns:
            ``diff=False``: the squared error summed over the last axis and
            averaged over the remaining axes.
            ``diff=True``: ``y_pred - y_true``, the gradient of
            ``1/2 * (y_pred - y_true) ** 2``.

        Note:
            The reported value omits the 1/2 factor, so it is twice the
            quantity whose gradient is returned. This is kept as-is so
            reported costs and training dynamics do not change.

        Raises:
            AssertionError: if the two arrays differ in shape.
        """
        assert y_pred.shape == y_true.shape
        if diff:
            # cost = 1 / 2 * (y - y_) ** 2
            # so, dcost = 2 * 1 / 2 * (y - y_) * (-1)
            return y_pred - y_true
        else:
            return np.mean(np.sum(np.power(y_true - y_pred, 2), axis=-1))

    def CrossEntropy(self, y_true, y_pred, diff=False):
        """Placeholder for a cross-entropy cost.

        Raises:
            NotImplementedError: always. Failing here is clearer than
                returning ``None`` and breaking later in the caller.
        """
        raise NotImplementedError('CrossEntropy cost is not implemented')

## 高级模型

In [9]:
class Sequential:
    """Ordered stack of layers trained by mini-batch gradient descent on the MSE cost.

    Layers are kept in an ``OrderedDict`` keyed by name. ``forward`` feeds the
    input through them in insertion order and ``backprop`` walks them in
    reverse.
    """

    def __init__(self, learning_rate=1e-3):
        self.layer = OrderedDict()  # layer name -> layer object, in forward order
        self.optimizer = Optimizer(learning_rate).SGD
        self.cost_function = CostFunction()
        self.batch_size = 1

    def __add__(self, layer, name=None):
        """Append ``layer`` under ``name`` (default ``'layerNN'``, NN = current length).

        Returns:
            The model itself, so ``model + layer`` and chained calls yield a
            usable object instead of ``None``.
        """
        if name is None:
            name = 'layer%02d' % len(self)
        self.layer[name] = layer
        return self

    def __len__(self):
        """Number of layers in the model."""
        return len(self.layer)

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        for layer in self.layer.values():
            x = layer.forward(x)
        return x

    def predict(self, x):
        """Alias of ``forward``."""
        return self.forward(x)

    def evaluate(self, y_pred, y_true):
        """Fraction of samples whose arg-max along the last axis matches in both arrays."""
        y_pred = np.argmax(y_pred, axis=-1)
        y_true = np.argmax(y_true, axis=-1)
        return 1.0 * np.sum(y_true == y_pred) / y_pred.shape[0]

    def backprop(self, grad):
        """Propagate ``grad`` from the last layer to the first, updating each layer."""
        # list(...) keeps this portable: reversed() on a dict view is not
        # supported by every Python 3 release.
        for layer in reversed(list(self.layer.values())):
            grad, delta_weight, delta_bias = layer.backprop(grad, self.optimizer)
            layer.updata(delta_weight, delta_bias)

    def train(self, x, y, episode=1, parenthesis=1):
        """Train on ``(x, y)`` and return per-epoch cost and accuracy.

        Args:
            x: training inputs, sliceable along the first axis.
            y: training targets aligned with ``x``.
            episode: number of passes over the data.
            parenthesis: a progress line is printed every ``parenthesis``
                epochs.

        Returns:
            ``{'cost': [...], 'accuracy': [...]}`` with one entry per epoch,
            each the mean over that epoch's batches.

        Raises:
            ValueError: if ``x`` is empty.

        Note:
            Samples beyond the last full batch are skipped. A dataset smaller
            than ``batch_size`` is processed as one short batch.
        """
        num_samples = len(x)
        if num_samples == 0:
            raise ValueError('cannot train on an empty dataset')
        # max(1, ...) avoids the zero batch count (and the division by zero
        # below) when there are fewer samples than batch_size.
        batch_episode = max(1, num_samples // self.batch_size)

        history = {'cost': [], 'accuracy': []}
        start = time.time()  # timer
        for i in range(1, 1 + episode):
            cost_value = 0.0
            accuracy_value = 0.0
            for j in range(batch_episode):
                lower = j * self.batch_size
                upper = lower + self.batch_size
                batch_x = x[lower:upper]
                batch_y = y[lower:upper]
                batch_y_ = self.forward(batch_x)
                batch_cost_value = self.cost_function.MSE(y_true=batch_y, y_pred=batch_y_)
                batch_accuracy = self.evaluate(y_pred=batch_y_, y_true=batch_y)
                grad = self.cost_function.MSE(y_true=batch_y, y_pred=batch_y_, diff=True)
                self.backprop(grad)
                cost_value += batch_cost_value
                accuracy_value += batch_accuracy
            cost_value /= batch_episode
            accuracy_value /= batch_episode
            if i % parenthesis == 0:
                time_cost = time.time() - start
                print('Epoch %s / %s, cost is %.5f, accuracy is %.5f, time cost is %.5f. (parenthesis=%s)' %
                      (i, episode, cost_value, accuracy_value, time_cost, parenthesis))
                start = time.time()  # timer
            history['cost'].append(cost_value)
            history['accuracy'].append(accuracy_value)
        return history

    def summary(self):
        """Print one line per layer describing its input, weight, bias and output sizes."""
        print("\n" + "=" * 80)
        for name, layer in self.layer.items():
            input_size, output_size = layer.mapping_size
            if layer.bias_available:
                print('Linear Network', name,
                      '\n{input: [Batch, %s] × weight: [%s, %s] + bias: [%s]-> [Batch, %s]}' %
                      (input_size, input_size, output_size, output_size, output_size))
            else:
                print('Linear Network', name,
                      '\n{input: [Batch, %s] × weight: [%s, %s] -> [Batch, %s]}' %
                      (input_size, input_size, output_size, output_size))
        print("=" * 80 + "\n ")

    def save(self, filename):
        """Write every layer's ``(weight, bias)`` pair to ``filename`` in ``.npy`` format.

        Note:
            ``np.save`` appends ``.npy`` when ``filename`` lacks that
            extension, so pass a name ending in ``.npy`` to be able to
            ``load`` it under the same name.
        """
        layers = list(self.layer.values())
        # Weight and bias shapes differ, so the pairs go into an explicit
        # object array; recent NumPy refuses to build a ragged array
        # implicitly from a nested list.
        parameters = np.empty(len(layers), dtype=object)
        for i, layer in enumerate(layers):
            # getattr: layers created without a bias may not define `bias`.
            parameters[i] = (layer.weight, getattr(layer, 'bias', None))
        np.save(file=filename, arr=parameters)

    def load(self, filename):
        """Restore layer parameters written by ``save``, matching layers by position."""
        # SECURITY: object arrays are stored with pickle, and unpickling can
        # execute arbitrary code. Only load files from a trusted source.
        parameters = np.load(file=filename, allow_pickle=True)
        for i, name in enumerate(self.layer.keys()):
            weight, bias = parameters[i]
            self.layer[name].weight = weight
            self.layer[name].bias = bias



## 参数

In [10]:
# hyper-parameters
# Input width of the network; presumably a 28 x 28 image flattened to 784 values.
features = 28 ** 2
# Not referenced by any code visible in this notebook.
hidden = 64
# Width of the one-hot label vectors and of the output layer.
num_classes = 10

# Bound activation callables to pass as Dense(activation=...).
sigmoid = Activation().sigmoid
linear = Activation().linear

导入MNIST数据集

In [11]:
try:
    import cPickle
except ImportError:  # Python 3 has no cPickle module; pickle is the equivalent
    import pickle as cPickle
import gzip

def load_data(path='data/mnist.pkl.gz'):
    """Load the three dataset splits from a gzip-compressed pickle file.

    Args:
        path: location of the archive; defaults to the path this notebook
            has always used.

    Returns:
        Tuple ``(training_data, validation_data, test_data)`` exactly as
        stored in the archive.
    """
    # Function-scope import so the loader works on both Python 2 (cPickle)
    # and Python 3 (pickle).
    try:
        import cPickle as pickle_module
        load_kwargs = {}
    except ImportError:
        import pickle as pickle_module
        # Presumably the archive was written by Python 2; Python 3 then needs
        # latin1 to decode its byte strings -- TODO confirm for other archives.
        load_kwargs = {'encoding': 'latin1'}

    # SECURITY: unpickling can execute arbitrary code. Only open archives
    # obtained from a trusted source.
    with gzip.open(path, 'rb') as reader:
        training_data, validation_data, test_data = pickle_module.load(reader, **load_kwargs)
    return (training_data, validation_data, test_data)

def onehot_encoding(x, num_classes=num_classes):
    """Convert class indices to one-hot rows.

    Args:
        x: array of integer class indices; ``x.shape[0]`` is the sample count.
        num_classes: width of each one-hot row. The default is the value of
            the module-level ``num_classes`` at the time this function is
            defined.

    Returns:
        Float array of shape ``(x.shape[0], num_classes)`` where row ``i`` is
        zero except for a 1 at column ``x[i]``.
    """
    size = [x.shape[0], num_classes]
    y = np.zeros(size)
    # `range` works on both Python 2 and 3 (`xrange` is Python-2-only).
    for i in range(size[0]):
        y[i, x[i]] = 1
    return y


# Keep the training and test splits; the middle (validation) split is bound
# to `_` and is not used by the cells below.
(train_x, train_y), _, (test_x, test_y) = load_data()
# Replace the label arrays by their one-hot encoding (presumably the raw
# labels are integer class indices -- verify against the archive).
train_y = onehot_encoding(train_y)
test_y = onehot_encoding(test_y)
print('training data is %s; test data is %s' % (len(train_x), len(test_x)))

training data is 50000; test data is 10000


In [12]:
# Define the model: a single Dense layer mapping `features` inputs to
# `num_classes` sigmoid outputs, trained in batches of 50.
model = Sequential(learning_rate=1e-2)
model.batch_size = 50
model.__add__(Dense(features, num_classes, activation=sigmoid), name='linear1')

# Print the model structure
model.summary()

# Train the model; a progress line is printed every `parenthesis` epochs
history = model.train(train_x, train_y, episode=100, parenthesis=10)

# Save the parameters, then read them back from the same file
model.save(filename='model.npy')
model.load(filename='model.npy')

# Evaluate the model on the test split
print('Accuracy:', model.evaluate(y_pred=model.predict(test_x), y_true=test_y))


Linear Network linear1 
{input: [Batch, 784] × weight: [784, 10] + bias: [10]-> [Batch, 10]}
 
Epoch 10 / 100, cost is 0.31343, accuracy is 0.86226, time cost is 23.45271. (parenthesis=10)
Epoch 20 / 100, cost is 0.26475, accuracy is 0.87766, time cost is 23.23998. (parenthesis=10)
Epoch 30 / 100, cost is 0.24368, accuracy is 0.88416, time cost is 23.41176. (parenthesis=10)
Epoch 40 / 100, cost is 0.23100, accuracy is 0.88854, time cost is 23.32661. (parenthesis=10)
Epoch 50 / 100, cost is 0.22219, accuracy is 0.89186, time cost is 23.21651. (parenthesis=10)
Epoch 60 / 100, cost is 0.21557, accuracy is 0.89376, time cost is 23.17856. (parenthesis=10)
Epoch 70 / 100, cost is 0.21033, accuracy is 0.89578, time cost is 23.35666. (parenthesis=10)
Epoch 80 / 100, cost is 0.20604, accuracy is 0.89766, time cost is 23.42308. (parenthesis=10)
Epoch 90 / 100, cost is 0.20244, accuracy is 0.89878, time cost is 23.52733. (parenthesis=10)
Epoch 100 / 100, cost is 0.19936, accuracy is 0.89998, tim

推荐资料：<br>
[Stanford UFLDL课程，神经网络](http://deeplearning.stanford.edu/wiki/index.php/Neural_Networks)<br>
[Stanford UFLDL课程，backpropagation算法](http://deeplearning.stanford.edu/wiki/index.php/Backpropagation_Algorithm)<br>
[Stanford UFLDL课程，梯度检验与高级优化](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)<br>
[Michael Nielsen, *Neural Networks and Deep Learning*](http://neuralnetworksanddeeplearning.com/index.html)<br>