# 1 导入所需模块

In [4]:
import numpy as np
import tensorflow as tf
from collections import Counter
import os

  from ._conv import register_converters as _register_converters


# 2 预处理语句

#### 读取文本文件

In [5]:
def _read_words(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read().replace('\n', '<eos>').split()

#### 分词成list并赋予id

In [6]:
def _build_vocab(filename):
    """Build a frequency-ranked vocabulary from the tokens of a file.

    Tokens are ordered from most to least frequent; tokens with equal
    counts keep the order in which they were first counted. Each token's
    id is its position in that ranking.

    Args:
        filename: Path of the text file to tokenise with ``_read_words``.

    Returns:
        A pair ``(words, word_to_id)`` where ``words`` is a tuple of tokens
        in ranked order and ``word_to_id`` maps each token to its index.

    Raises:
        ValueError: If the file yields no tokens (nothing to unpack).
    """
    frequencies = Counter(_read_words(filename))

    # most_common() sorts by count, descending, with a stable sort.
    ranked = frequencies.most_common()

    words, _counts = zip(*ranked)
    word_to_id = {token: index for index, token in enumerate(words)}

    return words, word_to_id

#### 读取文件对应的id

In [7]:
def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of a file into their vocabulary ids.

    Tokens that are not keys of ``word_to_id`` are silently dropped.

    Args:
        filename: Path of the text file to tokenise with ``_read_words``.
        word_to_id: Mapping from token string to integer id.

    Returns:
        A list of ids, in file order, for the tokens found in the mapping.
    """
    ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

#### 将id序列转化为句子

In [8]:
def to_words(sentence, words):
    """Translate a sequence of ids back into their tokens.

    Args:
        sentence: Iterable of integer ids.
        words: Sequence indexable by id that yields the token for each id.

    Returns:
        A list holding ``words[i]`` for every id ``i`` in ``sentence``.
    """
    return [words[token_id] for token_id in sentence]

#### 生成训练、验证与测试数据

In [9]:
def _raw_data(data_path):
    """Load the train / valid / test splits found under ``data_path``.

    The vocabulary is built from ``train.txt`` only; all three files are
    then converted to id lists with that vocabulary.

    Args:
        data_path: Directory containing ``train.txt``, ``valid.txt`` and
            ``test.txt``.

    Returns:
        A tuple ``(train_data, valid_data, test_data, words, word_to_id)``.
    """
    train_path, valid_path, test_path = (
        os.path.join(data_path, name)
        for name in ('train.txt', 'valid.txt', 'test.txt')
    )

    words, word_to_id = _build_vocab(train_path)

    train_data, valid_data, test_data = (
        _file_to_word_ids(path, word_to_id)
        for path in (train_path, valid_path, test_path)
    )

    return train_data, valid_data, test_data, words, word_to_id

#### 产生训练batch

In [10]:
def _producer(raw_data, batch_size=64, num_steps=20, stride=1):
    data_len = len(raw_data)

    sentences = []
    next_words = []
    for i in range(0, data_len - num_steps, stride):
        sentences.append(raw_data[i:(i + num_steps)])
        next_words.append(raw_data[i + num_steps])

    sentences = np.array(sentences)
    next_words = np.array(next_words)

    batch_len = len(sentences) // batch_size
    x = np.reshape(sentences[:(batch_len * batch_size)], \
        [batch_len, batch_size, -1])

    y = np.reshape(next_words[:(batch_len * batch_size)], \
        [batch_len, batch_size])

    return x, y

#### 观察x，y

In [13]:
# Load the corpus, batch the training and validation id streams with the
# default _producer settings, and show the resulting array shapes.
# NOTE(review): the data directory is an absolute, machine-specific path.
train_data, valid_data, _, words, word_to_id = _raw_data(
    '/Users/Optimus-Prime/Documents/My_Jupyter/Language Model')
(x_train, y_train), (x_valid, y_valid) = (
    _producer(ids) for ids in (train_data, valid_data))
print(x_train.shape, y_valid.shape, sep='\n')

(14524, 64, 20)
(1152, 64)


# 3 构建模型
### 模型参数

In [16]:
class LMConfig(object):
    """Configuration options for the language model."""
    batch_size = 64       # number of samples in each batch
    num_steps = 20        # number of tokens in each input window
    stride = 3            # step between consecutive windows when slicing data
                          # NOTE(review): _Input calls _producer without this value
    vocab_size = 10000       # vocabulary size; run_epoch overwrites it with len(words)

    embedding_dim = 64    # word-embedding dimension
    hidden_dim = 128      # hidden size of each RNN layer
    num_layers = 2        # number of stacked RNN layers

    learning_rate = 0.05  # learning rate handed to the optimizer
    dropout = 0.8         # fed into Model.dropout, which is used as output_keep_prob
                          # (i.e. a keep probability, not a drop probability)
    rnn_model = 'gru'     # 'lstm' selects LSTM cells; any other value selects GRU cells

### 读取数据

In [15]:
class _Input(object):
    """按批次读取数据"""
    def __init__(self, config, data):
        self.batch_size = config.batch_size
        self.num_steps = config.num_steps
        self.vocab_size = config.vocab_size # 词汇表大小

        self.input_data, self.targets = _producer(data,
            self.batch_size, self.num_steps)

        self.batch_len = self.input_data.shape[0] # 总批次
        self.cur_batch = 0  # 当前批次

    def next_batch(self):
        """读取下一批次"""
        x = self.input_data[self.cur_batch]
        y = self.targets[self.cur_batch]

        # 转换为one-hot编码
        y_ = np.zeros((y.shape[0], self.vocab_size), dtype=np.bool)
        for i in range(y.shape[0]):
            y_[i][y[i]] = 1

        # 如果到最后一个批次，则回到最开头
        self.cur_batch = (self.cur_batch +1) % self.batch_len

        return x, y_

### 模型——LSTM 或 GRU

In [17]:
class Model(object):
    """Multi-layer RNN language model that predicts the next word.

    Building an instance constructs the whole TensorFlow graph: input
    placeholders, embedding lookup, stacked LSTM/GRU cells with dropout,
    a dense projection to the vocabulary, the cross-entropy loss, an Adam
    training op and an error-rate tensor.

    Tensors exposed on the instance after construction:
        _inputs, _targets, dropout  -- placeholders to feed
        _logits, _pred              -- raw scores and softmax probabilities
        cost                        -- mean cross-entropy (replaces the
                                       ``cost`` method on the instance)
        optim                       -- training op
        errors                      -- fraction of wrong predictions
    """

    def __init__(self, config, is_training=True):
        """Copy hyper-parameters from ``config`` and build the graph.

        ``is_training`` is accepted but not read anywhere in this class.
        """
        self.num_steps = config.num_steps
        self.vocab_size = config.vocab_size

        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim
        self.num_layers = config.num_layers
        self.rnn_model = config.rnn_model

        self.learning_rate = config.learning_rate

        # Each step relies on attributes set by the previous ones, so the
        # order matters. The bound methods are collected before any of them
        # runs because cost() rebinds self.cost to a tensor.
        build_steps = (
            self.placeholders,  # input placeholders
            self.rnn,           # recurrent network
            self.cost,          # loss
            self.optimize,      # optimizer
            self.error,         # error rate
        )
        for build in build_steps:
            build()

    def placeholders(self):
        """Create the placeholders for inputs, targets and keep probability."""
        input_shape = [None, self.num_steps]
        target_shape = [None, self.vocab_size]
        self._inputs = tf.placeholder(tf.int32, input_shape)
        self._targets = tf.placeholder(tf.int32, target_shape)
        self.dropout = tf.placeholder(tf.float32)

    def input_embedding(self):
        """Look up the word vectors for the input ids."""
        with tf.device("/cpu:0"):
            table = tf.get_variable(
                "embedding",
                [self.vocab_size, self.embedding_dim],
                dtype=tf.float32)
            return tf.nn.embedding_lookup(table, self._inputs)

    def rnn(self):
        """Build the stacked recurrent network and the output projection."""
        def make_cell():
            # 'lstm' selects a basic LSTM cell; anything else selects a GRU
            # cell. Either way the cell's output is wrapped with dropout.
            if self.rnn_model == 'lstm':
                base = tf.contrib.rnn.BasicLSTMCell(
                    self.hidden_dim, state_is_tuple=True)
            else:
                base = tf.contrib.rnn.GRUCell(self.hidden_dim)
            return tf.contrib.rnn.DropoutWrapper(
                base, output_keep_prob=self.dropout)

        layers = []
        for _ in range(self.num_layers):
            layers.append(make_cell())
        stacked = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)

        embedded = self.input_embedding()
        outputs, _ = tf.nn.dynamic_rnn(
            cell=stacked, inputs=embedded, dtype=tf.float32)

        # outputs is [batch_size, num_steps, hidden_dim]; only the output of
        # the final time step is used for the prediction.
        final_step = outputs[:, -1, :]

        # Dense layer + softmax give a probability for every vocabulary entry.
        self._logits = tf.layers.dense(
            inputs=final_step, units=self.vocab_size)
        self._pred = tf.nn.softmax(self._logits)

    def cost(self):
        """Define the mean softmax cross-entropy loss as ``self.cost``."""
        per_example = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self._logits, labels=self._targets)
        self.cost = tf.reduce_mean(per_example)

    def optimize(self):
        """Create the Adam training op as ``self.optim``."""
        adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optim = adam.minimize(self.cost)

    def error(self):
        """Define the error rate as ``self.errors``."""
        expected = tf.argmax(self._targets, 1)
        predicted = tf.argmax(self._pred, 1)
        wrong = tf.not_equal(expected, predicted)
        self.errors = tf.reduce_mean(tf.cast(wrong, tf.float32))

### 训练及验证

In [34]:
def run_epoch(num_epochs=10,
              data_path='/Users/Optimus-Prime/Documents/My_Jupyter/Language Model'):
    """Train the language model and periodically print progress.

    Every 100 training batches the loss and error on the current training
    batch are printed, followed by the error averaged over the next 20
    validation batches.

    Args:
        num_epochs: Number of passes over the training batches.
        data_path: Directory holding ``train.txt``, ``valid.txt`` and
            ``test.txt``; defaults to the location used originally.
    """
    report_every = 100   # training batches between progress reports
    valid_batches = 20   # validation batches averaged per report

    config = LMConfig()  # load hyper-parameters

    # Load the source data; the training and validation splits are used.
    train_data, valid_data, _, words, _ = _raw_data(data_path)
    config.vocab_size = len(words)

    # Batch the data.
    input_train = _Input(config, train_data)
    input_valid = _Input(config, valid_data)
    batch_len = input_train.batch_len

    # Build the model graph.
    model = Model(config)

    # The context manager closes the session even if training raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        print('Start training...')
        for epoch in range(num_epochs):      # training epochs
            for i in range(batch_len):       # batches within the epoch
                # Take one batch and run an optimisation step on it.
                x_batch, y_batch = input_train.next_batch()
                feed_dict = {model._inputs: x_batch, model._targets: y_batch, model.dropout: config.dropout}
                sess.run(model.optim, feed_dict=feed_dict)

                if i % report_every != 0:
                    continue

                # Fetches must be a single (possibly nested) argument, and the
                # error-rate tensor is model.errors; model.error is the
                # builder method, not a tensor.
                cost, error = sess.run([model.cost, model.errors], feed_dict=feed_dict)
                msg = "Epoch: {0:>3}, batch: {1:>6}, Loss: {2:>6.3}, error: {3:>6.3}"
                print(msg.format(epoch + 1, i + 1, cost, error))

                # Average the error over several *different* validation
                # batches, with dropout disabled (keep probability 1).
                total_error = 0
                for _ in range(valid_batches):
                    x_v_batch, y_v_batch = input_valid.next_batch()
                    feed_dict_v = {model._inputs: x_v_batch, model._targets: y_v_batch, model.dropout: 1}
                    error = sess.run(model.errors, feed_dict=feed_dict_v)
                    total_error += error / valid_batches
                print('validation error: ',total_error)
        print('Finish training...')

In [None]:
# Train for 10 epochs; run_epoch prints the training and validation progress.
run_epoch(10)

Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

Start training...
Epoch:   1, batch:      1, Loss:    9.1
Predicted: <eos> chairman a is is years director years director once as <eos> british director <eos> british the N director <eos> <unk> and N old n.v. dutch british the N <eos> dutch dutch as <eos> group the <eos> <eos> nov. N dutch N nov. <unk> dutch <unk> N rifenburgh years N N <eos> gold N <eos> <eos> nov. director <eos> <eos> nov. as ssangyong the
True: snack-food ssangyong swapo wachter <eos> pierre <unk> N years old will join the board as a nonexecutive director nov. N <eos> mr. <unk> is chairman of <unk> n.v. the dutch publishing group <eos> rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this briti