In [2]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


### 1. 参数设置

In [3]:
# Configuration: file locations and hyperparameters used by the cells below.

# --- Files ---
# Source/target training files; each is read line by line by MakeDataset.
# (Presumably English source and Chinese target, judging by the extensions.)
SRC_TRAIN_DATA = './data/train.en'
TRG_TRAIN_DATA = './data/train.zh'
# Path prefix passed to saver.save() when writing checkpoints.
CHECKPOINT_PATH = './data/seq2seq_ckpt'

# --- Data handling ---
SRC_VOCAB_SIZE = 10000  # rows in the source embedding matrix
TRG_VOCAB_SIZE = 4000   # rows in the target embedding matrix / softmax width
MAX_LEN = 50            # sentence pairs with a side longer than this are dropped
SOS_ID = 1              # id prepended to the target sentence for decoder input
BATCH_SIZE = 100        # batch size handed to padded_batch

# --- Model ---
HIDDEN_SIZE = 1024            # LSTM units and embedding width
NUM_LAYERS = 2                # number of stacked LSTM layers per encoder/decoder
SHARE_EMB_AND_SOFTMAX = True  # reuse the transposed target embedding as softmax weight

# --- Optimisation ---
NUM_EPOCH = 20      # number of passes over the training data
KEEP_PROB = 0.8     # second argument to tf.nn.dropout on the embeddings
MAX_GRAD_NORM = 5   # global-norm threshold for gradient clipping

### 2. 读取训练数据并创建Dataset

In [4]:
# Read one language's data from a single file through the Dataset API.
# The file holds one sentence per line, with words already mapped to ids.
def MakeDataset(file_path):
    """Build a dataset of (word_ids, length) pairs from a text file.

    Args:
        file_path: path handed to tf.data.TextLineDataset.

    Returns:
        A dataset whose elements are (ids, size): the whitespace-separated
        numbers of one line converted to tf.int32, and tf.size of that vector.
    """
    def _split_into_tokens(line):
        # Split on whitespace; .values is the flat vector of token strings.
        return tf.string_split([line]).values

    def _tokens_to_ids(tokens):
        # Convert the string tokens to integer word ids.
        return tf.string_to_number(tokens, tf.int32)

    def _attach_length(ids):
        # Keep the word count next to the sentence itself.
        return (ids, tf.size(ids))

    return (tf.data.TextLineDataset(file_path)
            .map(_split_into_tokens)
            .map(_tokens_to_ids)
            .map(_attach_length))

# Read the source file src_path and the target file trg_path, then filter,
# shuffle, pad and batch the sentence pairs.
def MakeSrcTrgDataset(src_path, trg_path, batch_size,
                      shuffle_buffer_size=10000, max_len=None):
    """Build the padded, batched training dataset of sentence pairs.

    Args:
        src_path: file with source sentences (one line of word ids each).
        trg_path: file with target sentences, line-aligned with src_path.
        batch_size: number of sentence pairs per batch.
        shuffle_buffer_size: buffer size passed to Dataset.shuffle.
            Defaults to 10000, the value that used to be hard-coded.
        max_len: pairs in which either side is longer than this are dropped.
            None (default) uses the module-level MAX_LEN.

    Returns:
        A dataset whose elements are
        ((src_input, src_len), (trg_input, trg_label, trg_len)),
        padded per batch to the longest sentence in that batch.
    """
    if max_len is None:
        max_len = MAX_LEN

    # Load both languages, then zip them so each element ds has four tensors:
    #   ds[0][0] source sentence      ds[0][1] source length
    #   ds[1][0] target sentence      ds[1][1] target length
    src_data = MakeDataset(src_path)
    trg_data = MakeDataset(trg_path)
    dataset = tf.data.Dataset.zip((src_data, trg_data))

    def _length_ok(length):
        # True when 1 < length <= max_len.
        return tf.logical_and(
            tf.greater(length, 1), tf.less_equal(length, max_len))

    # Drop empty sentences (those holding only <eos>) and over-long ones.
    def FilterLength(src_tuple, trg_tuple):
        (_, src_len), (_, trg_len) = src_tuple, trg_tuple
        return tf.logical_and(_length_ok(src_len), _length_ok(trg_len))
    dataset = dataset.filter(FilterLength)

    # The decoder needs the target sentence in two forms:
    #   1. decoder input  (trg_input): "<sos> X Y Z"
    #   2. decoder target (trg_label): "X Y Z <eos>"
    # The file provides "X Y Z <eos>"; derive "<sos> X Y Z" from it by
    # dropping the last token and prepending SOS_ID.
    def MakeTrgInput(src_tuple, trg_tuple):
        (src_input, src_len), (trg_label, trg_len) = src_tuple, trg_tuple
        trg_input = tf.concat([[SOS_ID], trg_label[:-1]], axis=0)
        return ((src_input, src_len), (trg_input, trg_label, trg_len))
    dataset = dataset.map(MakeTrgInput)

    # Randomly shuffle the training pairs.
    dataset = dataset.shuffle(shuffle_buffer_size)

    # Shapes after padding.
    padded_shapes = (
        (tf.TensorShape([None]),      # source sentence: vector of unknown length
         tf.TensorShape([])),         # source length: scalar
        (tf.TensorShape([None]),      # decoder input: vector of unknown length
         tf.TensorShape([None]),      # decoder target: vector of unknown length
         tf.TensorShape([])))         # target length: scalar
    # Pad and batch in one step.
    return dataset.padded_batch(batch_size, padded_shapes)


### 3. 定义翻译模型

In [5]:
# NMTModel describes the translation model.
class NMTModel(object):
    """Encoder-decoder translation model built from stacked LSTM cells.

    The constructor sets up the cells, the embeddings and the softmax
    parameters; forward() builds the training graph on top of them.
    """

    @staticmethod
    def _stacked_lstm():
        """Return a MultiRNNCell of NUM_LAYERS BasicLSTMCells (HIDDEN_SIZE units)."""
        layers = []
        for _ in range(NUM_LAYERS):
            layers.append(tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE))
        return tf.nn.rnn_cell.MultiRNNCell(layers)

    def __init__(self):
        """Define the cells and variables the model uses."""
        # LSTM stacks for the encoder and the decoder.
        self.enc_cell = self._stacked_lstm()
        self.dec_cell = self._stacked_lstm()

        # Separate word embeddings for the source and target languages.
        self.src_embedding = tf.get_variable(
            "src_emb", shape=[SRC_VOCAB_SIZE, HIDDEN_SIZE])
        self.trg_embedding = tf.get_variable(
            "trg_emb", shape=[TRG_VOCAB_SIZE, HIDDEN_SIZE])

        # Softmax layer parameters. When sharing is enabled the softmax weight
        # is the transposed target embedding rather than a new variable.
        if not SHARE_EMB_AND_SOFTMAX:
            self.softmax_weight = tf.get_variable(
                "weight", shape=[HIDDEN_SIZE, TRG_VOCAB_SIZE])
        else:
            self.softmax_weight = tf.transpose(self.trg_embedding)
        self.softmax_bias = tf.get_variable(
            "softmax_bias", shape=[TRG_VOCAB_SIZE])

    def forward(self, src_input, src_size, trg_input, trg_label, trg_size):
        """Build the forward pass, the loss and the training op.

        The five arguments are the five tensors produced by
        MakeSrcTrgDataset.

        Returns:
            (cost_per_token, train_op): the masked cross-entropy summed over
            the batch and divided by the number of non-padding target tokens,
            and the op that applies one clipped gradient-descent update.
        """
        batch_size = tf.shape(src_input)[0]

        # Map word ids to embedding vectors for both sides.
        src_vectors = tf.nn.embedding_lookup(self.src_embedding, src_input)
        trg_vectors = tf.nn.embedding_lookup(self.trg_embedding, trg_input)

        # Dropout on the embeddings.
        src_vectors = tf.nn.dropout(src_vectors, KEEP_PROB)
        trg_vectors = tf.nn.dropout(trg_vectors, KEEP_PROB)

        # Encoder, built with dynamic_rnn.
        # It reads the source embeddings and returns the final hidden state
        # enc_state. With a multi-layer LSTM, enc_state is a tuple holding one
        # LSTMStateTuple per layer. enc_outputs is the top layer's output at
        # every step, shaped [batch_size, max_time, HIDDEN_SIZE]; plain
        # seq2seq does not use it (an attention model would).
        with tf.variable_scope("encoder"):
            enc_outputs, enc_state = tf.nn.dynamic_rnn(
                self.enc_cell, src_vectors, src_size, dtype=tf.float32)

        # Decoder, built with dynamic_rnn.
        # It reads the target embeddings; dec_outputs is the top layer's
        # output at every step, shaped [batch_size, max_time, HIDDEN_SIZE].
        # initial_state=enc_state seeds the decoder with the encoder's final
        # state.
        with tf.variable_scope("decoder"):
            dec_outputs, _ = tf.nn.dynamic_rnn(
                self.dec_cell, trg_vectors, trg_size, initial_state=enc_state)

        # Per-step log perplexity: flatten to one row per time step, project
        # to the target vocabulary and take the cross-entropy against labels.
        flat_outputs = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
        logits = tf.matmul(flat_outputs, self.softmax_weight) + self.softmax_bias
        flat_labels = tf.reshape(trg_label, [-1])
        token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=flat_labels, logits=logits)

        # Padding positions get weight 0 so they do not influence training.
        max_steps = tf.shape(trg_label)[1]
        mask = tf.sequence_mask(trg_size, maxlen=max_steps, dtype=tf.float32)
        mask = tf.reshape(mask, [-1])
        cost = tf.reduce_sum(token_losses * mask)
        cost_per_token = cost / tf.reduce_sum(mask)

        # Backward pass over every trainable variable in the graph.
        params = tf.trainable_variables()

        # Differentiate the per-sentence cost, clip by global norm, and apply
        # plain gradient descent with learning rate 1.0.
        grads = tf.gradients(cost / tf.to_float(batch_size), params)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        train_op = optimizer.apply_gradients(zip(grads, params))
        return cost_per_token, train_op

### 4. 训练过程和主函数

In [6]:
# Train for one epoch and return the updated global step.
# A checkpoint is written whenever the step is a multiple of 200.
def run_epoch(session, cost_op, train_op, saver, step):
    """Run training steps until the input dataset is exhausted.

    Args:
        session: session used to run the ops.
        cost_op: op whose value is reported as the per-token cost.
        train_op: op that performs one parameter update.
        saver: object whose save() writes a checkpoint to CHECKPOINT_PATH.
        step: global step count before this epoch.

    Returns:
        The global step count after this epoch.
    """
    fetches = [cost_op, train_op]
    try:
        # Loop until the dataset raises OutOfRangeError at the end of the data.
        while True:
            # One training step; the input comes from the Dataset set up in main().
            cost, _ = session.run(fetches)
            if step % 10 == 0:
                print("After %d steps, per token cost is %.3f" % (step, cost))
            # Save a checkpoint every 200 steps.
            if step % 200 == 0:
                saver.save(session, CHECKPOINT_PATH, global_step=step)
            step += 1
    except tf.errors.OutOfRangeError:
        # End of the epoch.
        pass
    return step

def main():
    """Build the training graph and train for NUM_EPOCH epochs."""
    tf.reset_default_graph()
    # Initializer set as the default of the "nmt_model" variable scope.
    uniform_init = tf.random_uniform_initializer(-0.05, 0.05)

    # Create the recurrent translation model used for training.
    with tf.variable_scope("nmt_model", reuse=None,
                           initializer=uniform_init):
        train_model = NMTModel()

    # Input pipeline.
    batches = MakeSrcTrgDataset(SRC_TRAIN_DATA, TRG_TRAIN_DATA, BATCH_SIZE)
    iterator = batches.make_initializable_iterator()
    source_part, target_part = iterator.get_next()
    src, src_size = source_part
    trg_input, trg_label, trg_size = target_part

    # Forward graph; the input tensors are passed straight to forward().
    cost_op, train_op = train_model.forward(
        src, src_size, trg_input, trg_label, trg_size)

    # Train the model.
    saver = tf.train.Saver()
    step = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, NUM_EPOCH + 1):
            print("In iteration: %d" % epoch)
            # Rewind the dataset iterator at the start of every epoch.
            sess.run(iterator.initializer)
            step = run_epoch(sess, cost_op, train_op, saver, step)

if __name__ == "__main__":
    main()

In iteration: 1
After 0 steps, per token cost is 8.293
After 10 steps, per token cost is 8.007
After 20 steps, per token cost is 9.576
After 30 steps, per token cost is 7.203
After 40 steps, per token cost is 6.832
After 50 steps, per token cost is 6.671
After 60 steps, per token cost is 6.574
After 70 steps, per token cost is 6.521
After 80 steps, per token cost is 6.653
After 90 steps, per token cost is 6.511
After 100 steps, per token cost is 6.369
After 110 steps, per token cost is 6.364
After 120 steps, per token cost is 6.215
After 130 steps, per token cost is 6.208
After 140 steps, per token cost is 6.086
After 150 steps, per token cost is 6.189
After 160 steps, per token cost is 5.917
After 170 steps, per token cost is 6.015
After 180 steps, per token cost is 6.092
After 190 steps, per token cost is 5.949
After 200 steps, per token cost is 5.960
After 210 steps, per token cost is 5.797
After 220 steps, per token cost is 5.753
After 230 steps, per token cost is 5.876
After 240 s

After 1970 steps, per token cost is 3.545
After 1980 steps, per token cost is 3.384
After 1990 steps, per token cost is 3.377
After 2000 steps, per token cost is 3.397
After 2010 steps, per token cost is 3.451
After 2020 steps, per token cost is 3.414
After 2030 steps, per token cost is 3.349
After 2040 steps, per token cost is 3.461
After 2050 steps, per token cost is 3.475
After 2060 steps, per token cost is 3.292
After 2070 steps, per token cost is 3.557
After 2080 steps, per token cost is 3.399
After 2090 steps, per token cost is 3.207
After 2100 steps, per token cost is 3.318
After 2110 steps, per token cost is 3.440
After 2120 steps, per token cost is 3.307
After 2130 steps, per token cost is 3.534
After 2140 steps, per token cost is 3.467
After 2150 steps, per token cost is 3.429
After 2160 steps, per token cost is 3.379
After 2170 steps, per token cost is 3.383
After 2180 steps, per token cost is 3.406
After 2190 steps, per token cost is 3.397
After 2200 steps, per token cost i

After 3910 steps, per token cost is 2.650
After 3920 steps, per token cost is 2.757
After 3930 steps, per token cost is 2.566
After 3940 steps, per token cost is 2.658
After 3950 steps, per token cost is 2.589
In iteration: 8
After 3960 steps, per token cost is 2.569
After 3970 steps, per token cost is 2.475
After 3980 steps, per token cost is 2.559
After 3990 steps, per token cost is 2.607
After 4000 steps, per token cost is 2.536
After 4010 steps, per token cost is 2.601
After 4020 steps, per token cost is 2.494
After 4030 steps, per token cost is 2.469
After 4040 steps, per token cost is 2.404
After 4050 steps, per token cost is 2.426
After 4060 steps, per token cost is 2.394
After 4070 steps, per token cost is 2.556
After 4080 steps, per token cost is 2.400
After 4090 steps, per token cost is 2.406
After 4100 steps, per token cost is 2.480
After 4110 steps, per token cost is 2.406
After 4120 steps, per token cost is 2.543
After 4130 steps, per token cost is 2.483
After 4140 steps, 

After 5850 steps, per token cost is 1.808
After 5860 steps, per token cost is 1.689
After 5870 steps, per token cost is 1.791
After 5880 steps, per token cost is 1.759
After 5890 steps, per token cost is 1.718
After 5900 steps, per token cost is 1.746
After 5910 steps, per token cost is 1.695
After 5920 steps, per token cost is 1.776
After 5930 steps, per token cost is 1.784
After 5940 steps, per token cost is 1.731
After 5950 steps, per token cost is 1.727
After 5960 steps, per token cost is 1.797
After 5970 steps, per token cost is 1.696
After 5980 steps, per token cost is 1.670
After 5990 steps, per token cost is 1.751
After 6000 steps, per token cost is 1.740
After 6010 steps, per token cost is 1.714
After 6020 steps, per token cost is 1.701
After 6030 steps, per token cost is 1.735
After 6040 steps, per token cost is 1.704
After 6050 steps, per token cost is 1.728
After 6060 steps, per token cost is 1.687
After 6070 steps, per token cost is 1.662
After 6080 steps, per token cost i

After 7800 steps, per token cost is 1.242
After 7810 steps, per token cost is 1.324
After 7820 steps, per token cost is 1.234
After 7830 steps, per token cost is 1.271
After 7840 steps, per token cost is 1.349
After 7850 steps, per token cost is 1.309
After 7860 steps, per token cost is 1.276
After 7870 steps, per token cost is 1.316
After 7880 steps, per token cost is 1.285
After 7890 steps, per token cost is 1.338
After 7900 steps, per token cost is 1.291
In iteration: 15
After 7910 steps, per token cost is 1.330
After 7920 steps, per token cost is 1.254
After 7930 steps, per token cost is 1.222
After 7940 steps, per token cost is 1.222
After 7950 steps, per token cost is 1.136
After 7960 steps, per token cost is 1.125
After 7970 steps, per token cost is 1.236
After 7980 steps, per token cost is 1.169
After 7990 steps, per token cost is 1.231
After 8000 steps, per token cost is 1.168
After 8010 steps, per token cost is 1.225
After 8020 steps, per token cost is 1.188
After 8030 steps,

After 9740 steps, per token cost is 0.954
After 9750 steps, per token cost is 0.962
After 9760 steps, per token cost is 0.881
After 9770 steps, per token cost is 0.914
After 9780 steps, per token cost is 1.000
After 9790 steps, per token cost is 0.943
After 9800 steps, per token cost is 0.934
After 9810 steps, per token cost is 0.869
After 9820 steps, per token cost is 0.911
After 9830 steps, per token cost is 0.965
After 9840 steps, per token cost is 0.965
After 9850 steps, per token cost is 0.954
After 9860 steps, per token cost is 0.927
After 9870 steps, per token cost is 0.895
After 9880 steps, per token cost is 0.968
After 9890 steps, per token cost is 0.902
After 9900 steps, per token cost is 0.909
After 9910 steps, per token cost is 0.881
After 9920 steps, per token cost is 0.870
After 9930 steps, per token cost is 0.924
After 9940 steps, per token cost is 0.922
After 9950 steps, per token cost is 0.932
After 9960 steps, per token cost is 0.953
After 9970 steps, per token cost i