In [1]:
%matplotlib inline

递归神经网络 - Recurrent Neural Network
====
>Python2.7 + Pytorch 1.2.0 backened
>
>text

In [2]:
# -*- coding: utf-8 -*-
# !/usr/bin/env python
'''
@author: deep learning textbook of whut
@date: 2017-10-31
'''
from __future__ import print_function

In [3]:
sentence = """
Deep learning (also known as deep structured learning or hierarchical learning)
is part of a broader family of machine learning methods based on learning data
representations, as opposed to task-specific algorithms. Learning can be supervised,
semi-supervised or unsupervised. Deep learning models are loosely related to information
processing and communication patterns in a biological nervous system, such as neural
coding that attempts to define a relationship between various stimuli and associated
neuronal responses in the brain. Deep learning architectures such as deep neural
networks, deep belief networks and recurrent neural networks have been applied to
fields including computer vision, speech recognition, natural language processing,
audio recognition, social network filtering, machine translation, bioinformatics
and drug design,[5] where they have produced results comparable to and in some
cases superior[6] to human experts.
""".split()
# from wikipedia https://en.wikipedia.org/wiki/Deep_learning

vocab = set(sentence)
word2ind = {word: i for i, word in enumerate(vocab)}
ind2word = {i: word for i, word in enumerate(vocab)}

# hyper-parameter
input_timesteps = 2
output_timesteps = 1
vocab_size = len(vocab)
embedding_size = 100

hidden_size = 60
layers_num = 2
training_epochs = 10000

In [4]:
data_num = len(sentence) - input_timesteps
x_data = [[word2ind[ch] for ch in sentence[i:i + input_timesteps]]
          for i in xrange(len(sentence) - input_timesteps)]
y_data = [[word2ind[sentence[i]]] for i in xrange(input_timesteps, len(sentence))]

In [5]:
import tensorflow as tf
X = tf.placeholder(dtype=tf.int32, shape=[None, input_timesteps])
Y = tf.placeholder(dtype=tf.int32, shape=[None, output_timesteps])

onehot_encoding = lambda tensor: tf.one_hot(tensor, depth=vocab_size, axis=-1)
output_tensor = onehot_encoding(Y)

In [None]:
embedding_layer = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embedding_layer, X)

## RNN中的Dropout
link: https://stackoverflow.com/questions/45917464/tensorflow-whats-the-difference-between-tf-nn-dropout-and-tf-contrib-rnn-dropo<br>
tensorflow中有两种Dropout手段：<br>
1.`tf.nn.droupout`：以上一个网络的输出的部分作为下一层网络的输入。适用于一切网络。<br>
2.`tf.contrib.rnn.DropoutWrapper`：在RNN cell内部实现dropout，可以控制RNN网络的输入和输出dropout。只适用于RNN内部。

## TensorFlow中创建多层RNN
`tensorflow`中有2种方法可以实现多层RNN：<br>
### 1.利用`rnn.MultiRNNCell`和`rnn.static_rnn`/`tf.nn.static_rnn`/`tf.nn.dynamic_rnn`的组合实现<br>
#### i).`tf.nn.dynamic_rnn`<br>
    不需要拆分<br>
``
cell = rnn.MultiRNNCell([rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob) \
                             for _ in xrange(num_layers)])
outputs, state = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)
``
<br>
#### ii).`rnn.static_rnn` <=> `tf.nn.static_rnn`<br>
需要拆分<br>
``
x = tf.unstack(x, timesteps, 1)
cell = rnn.MultiRNNCell([rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob) \
                             for _ in xrange(num_layers)])
outputs, state = tf.nn.static_rnn/rnn.static_rnn(cell, x, dtype=tf.float32)
outputs = tf.concat(outputs, axis=-1)
``<br>
### 2.通过`tf.variable_scope`循环模拟多层RNN<br>
#### i).`tf.nn.dynamic_rnn`<br>
不需要拆分<br>
``
for _ in xrange(num_layers):
    with tf.variable_scope(None, default_name="rnn"):
        x, state = tf.nn.dynamic_rnn(
            rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob),
            inputs=x, dtype=tf.float32)
outputs = x
``<br>
#### ii).`rnn.static_rnn` <=> `tf.nn.static_rnn`<br>
``
x = tf.unstack(x, timesteps, 1)
for _ in xrange(num_layers):
    with tf.variable_scope(None, default_name="rnn"):
        x, state = tf.nn.static_rnn(
            rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob),
            inputs=x, dtype=tf.float32)
outputs = tf.concat(x, axis=-1)
``

## TensorFlow中创建多层RNN和多层BiRNN

BiRNN不能像RNN那样灵活，需要控制输入和输出的流程，代码写起来比较冗长；不能直接使用for循环BiRNN，需要使用函数实现多层BiRNN<br>
tensorflow中biRNN共有5个接口:<br>
### 1.tensorflow.contrib.rnn.stack_bidirectional_rnn(cells_fw, cells_bw, ...)
需要`tf.unstack`，将`[batch, timestep, length]`的`timestep`拆分为`list`<br>
`cells_fw`, `cells_bw`必须为`list`，`list`的长度为RNN网络层数<br>
``
x = tf.unstack(x, timesteps, 1)
cell_fw = [rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob) \
           for _ in xrange(num_layers)]
cell_bw = [rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob) \
           for _ in xrange(num_layers)]
x = tf.unstack(x, timesteps, 1)
outputs, state_fw, state_bw = rnn.stack_bidirectional_rnn(cell_fw, cell_bw, inputs=x, dtype=tf.float32)
outputs = tf.stack(outputs, axis=1)
``
### 2.tensorflow.contrib.rnn.stack_bidirectional_dynamic_rnn
不需要拆分<br>
``
cell_fw = [rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob) \
           for _ in xrange(num_layers)]
cell_bw = [rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob) \
           for _ in xrange(num_layers)]
outputs, state_fw, state_bw = rnn.stack_bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs=x, dtype=tf.float32)
``
### 3.tf.nn.bidirectional_dynamic_rnn
不需要拆分；通过`tf.variable_scope`循环模拟多层RNN<br>
``
for _ in xrange(num_layers):
    with tf.variable_scope(None, default_name="bidirectional-rnn"):
        cell_fw = rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob)
        cell_bw = rnn.DropoutWrapper(cell(units, activation=activation), output_keep_prob=dropout_prob)
        x, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs=x, dtype=tf.float32)
    x = tf.concat(x, axis=-1)
outputs = x
``
### 4.tensorflow.contrib.rnn.static_bidirectional_rnn = 5.tf.nn.static_bidirectional_rnn
需要拆分；通过`tf.variable_scope`循环模拟多层RNN<br>
``
x = tf.unstack(x, timesteps, 1)
for _ in xrange(num_layers):
    with tf.variable_scope(None, default_name="bidirectional-rnn"):
        cell_fw = rnn.DropoutWrapper(cell(units, activation=activation))
        cell_bw = rnn.DropoutWrapper(cell(units, activation=activation))
        x, state_fw, state_bw = rnn.static_bidirectional_rnn(cell_fw, cell_bw, inputs=x, dtype=tf.float32)
outputs = tf.stack(x, axis=1)
``

In [6]:
from tensorflow.contrib import rnn


def RNN(x, num_hidden,
        cell_type=rnn.BasicLSTMCell,
        activation=tf.nn.relu,
        dropout_prob=1.0,
        num_layers=1):
    assert cell_type in [rnn.BasicLSTMCell, rnn.BasicRNNCell, rnn.GRUCell], \
        'RNN cell is wrong, must be in "rnn.BasicLSTMCell, rnn.BasicRNNCell, rnn.GRUCell", but it is %s.' % (cell_type)
    assert type(num_layers) == int and num_layers >= 1
    assert 0.0 < dropout_prob <= 1.0

    # RNN
    def mRNN(x, units, cell=cell_type, activation=activation, num_layers=num_layers, dropout_prob=dropout_prob):
        pass

    # BiRNN
    def mBiRNN(x, units, cell=cell_type, activation=activation, num_layers=num_layers, dropout_prob=dropout_prob):
        pass

    cell_fw = [rnn.DropoutWrapper(cell_type(num_hidden, activation=activation), output_keep_prob=dropout_prob) \
               for _ in xrange(num_layers)]
    cell_bw = [rnn.DropoutWrapper(cell_type(num_hidden, activation=activation), output_keep_prob=dropout_prob) \
               for _ in xrange(num_layers)]
    outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs=x, dtype=tf.float32)

    return outputs

mLSTM = RNN(embed, hidden_size, dropout_prob=0.8, num_layers=2)
mLSTM = tf.reshape(mLSTM, [-1, output_timesteps, input_timesteps * hidden_size * 2])
fc1 = tf.layers.dense(inputs=mLSTM, units=vocab_size, activation=tf.nn.softmax)
y_pred = fc1
y_pred_max = tf.argmax(y_pred, axis=-1)

loss_op = tf.losses.softmax_cross_entropy(output_tensor, y_pred)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(loss_op)

In [7]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in xrange(1, 1 + training_epochs):
        _, cost = session.run([optimizer, loss_op],
                              feed_dict={X: x_data, Y: y_data})
        if i % 1000 == 0:
            print('Epoch %s / %s, training cost: %s' % (i, training_epochs, cost))

    context_idxs = [word2ind['D'], word2ind['e']]
    logue = context_idxs
    for i in xrange(data_num):
        y_ = y_pred_max.eval({X: [context_idxs], Y: y_data})[0, 0]
        logue.append(y_)
        context_idxs = logue[-2:]

    sentence = ' '.join(sentence)
    pred_sentence = ' '.join([ind2word[i] for i in logue])

    import editdistance

    print('Distance between these two sentences is %s' % (editdistance.eval(sentence, pred_sentence)))
    print("\033[1;31;40m %s \033[0m" % (sentence))
    print(pred_sentence)

Epoch 100 / 10000, training cost: 3.4513
Epoch 200 / 10000, training cost: 3.3945
Epoch 300 / 10000, training cost: 3.36933
Epoch 400 / 10000, training cost: 3.36965
Epoch 500 / 10000, training cost: 3.3688
Epoch 600 / 10000, training cost: 3.3538
Epoch 700 / 10000, training cost: 3.35243
Epoch 800 / 10000, training cost: 3.34151
Epoch 900 / 10000, training cost: 3.33744
Epoch 1000 / 10000, training cost: 3.33535
Epoch 1100 / 10000, training cost: 3.33682
Epoch 1200 / 10000, training cost: 3.33514
Epoch 1300 / 10000, training cost: 3.33557
Epoch 1400 / 10000, training cost: 3.33478
Epoch 1500 / 10000, training cost: 3.33454
Epoch 1600 / 10000, training cost: 3.33479
Epoch 1700 / 10000, training cost: 3.33421
Epoch 1800 / 10000, training cost: 3.33564
Epoch 1900 / 10000, training cost: 3.32918
Epoch 2000 / 10000, training cost: 3.32464
Epoch 2100 / 10000, training cost: 3.3251
Epoch 2200 / 10000, training cost: 3.32479
Epoch 2300 / 10000, training cost: 3.32483
Epoch 2400 / 10000, train