#### 任务: 设计网络结构，反转一个变长序列（最大长度N=20），即246910000反转为196420000，其中1-9为需要反转的有效字符，0为补位字符

**考核点:**
1. 序列长度很长时，如何记住前序信息
2. output structure 是序列的优化方法

**Constraints:**
1. 结构设计，不用调参，固定batch=32， lr=0.02, optimizer=Adam, epoch=1
2. 将序列中的数字映射到8维空间作为输入
3. 禁止直接将input与output层相连

In [28]:
import pandas as pd

In [29]:
df_train_input = pd.read_csv('./dataset/task8_train_input.csv', header=None)

In [30]:
df_train_input.shape

(32000, 20)

In [31]:
df_train_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,3,9,4,1,9,4,6,8,9,8,3,6,1,1,2,1,4,0,0,0
1,1,9,9,4,6,5,4,5,0,0,0,0,0,0,0,0,0,0,0,0
2,5,2,7,7,6,9,4,1,6,6,7,7,4,2,3,0,0,0,0,0
3,8,9,4,6,1,9,3,9,3,2,3,8,9,7,5,0,0,0,0,0
4,1,4,2,9,3,9,5,8,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
df_train_output = pd.read_csv('./dataset/task8_train_output.csv', header=None)

In [33]:
df_train_output.shape

(32000, 20)

In [34]:
df_train_output.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,4,1,2,1,1,6,3,8,9,8,6,4,9,1,4,9,3,0,0,0
1,5,4,5,6,4,9,9,1,0,0,0,0,0,0,0,0,0,0,0,0
2,3,2,4,7,7,6,6,1,4,9,6,7,7,2,5,0,0,0,0,0
3,5,7,9,8,3,2,3,9,3,9,1,6,4,9,8,0,0,0,0,0
4,8,5,9,3,9,2,4,1,0,0,0,0,0,0,0,0,0,0,0,0


#### 导入模块

In [1]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense

import numpy as np
import time

#### 超参数设置

In [53]:
# Number of Epochs
epochs = 2

# Batch Size
batch_size = 128

# RNN Size
rnn_size = 256

# Number of Layers
num_layers = 2

# Embedding Size
encoding_embedding_size = 30
decoding_embedding_size = 30

# Learning Rate
learning_rate = 0.01

In [54]:
with open('./dataset/task8_train_input.csv', 'r') as f:
    source_data = [''.join(line.split(',')).strip('\n').split('0')[0] for line in f.readlines()]
    

with open('./dataset/task8_train_output.csv', 'r') as f:
    target_data = [''.join(line.split(',')).strip('\n').split('0')[0] for line in f.readlines()]

In [55]:
print(source_data[:10])
print(target_data[:10])

['39419468983611214', '19946545', '527769416677423', '894619393238975', '14293958', '3542426796554612', '59454467', '323961935877612843', '95321559', '76339797441895826']
['41211638986491493', '54564991', '324776614967725', '579832393916498', '85939241', '2164556976242453', '76445495', '348216778539169323', '95512359', '62859814479793367']


In [56]:
def extract_character_vocab(data):
    """
    :param data:
    :return: 字符映射表
    """
    special_words = ['<PAD>','<UNK>','<GO>','<EOS>']
    set_words = list(set([character for line in data for character in line]))
    int_to_vocab = {idx:word for idx,word in enumerate(special_words + set_words)}
    vocab_to_int = {word:idx for idx,word in int_to_vocab.items()}

    return int_to_vocab,vocab_to_int

In [57]:
# 得到输入和输出的字符映射表
source_int_to_letter,source_letter_to_int = extract_character_vocab(source_data+target_data)

target_int_to_letter,target_letter_to_int = extract_character_vocab(source_data+target_data)

In [58]:
# 将每一行转换成字符id的list
source_int = [[source_letter_to_int.get(letter,source_letter_to_int['<UNK>'])
               for letter in line] for line in source_data]

In [59]:
target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>'])
               for letter in line] + [target_letter_to_int['<EOS>']] for line in target_data]

In [60]:
len(source_int_to_letter)

13

#### 构建输入层

In [61]:
def get_inputs():

    inputs = tf.placeholder(tf.int32,[None,None],name='inputs')
    
    targets = tf.placeholder(tf.int32,[None,None],name='targets')
    
    learning_rate = tf.placeholder(tf.float32,name='learning_rate')

    # 定义target序列最大长度（之后target_sequence_length和source_sequence_length会作为feed_dict的参数）
    target_sequence_length = tf.placeholder(tf.int32,(None,),name='target_sequence_length')
    
    max_target_sequence_length = tf.reduce_max(target_sequence_length,name='max_target_len')
    
    source_sequence_length = tf.placeholder(tf.int32,(None,),name='source_sequence_length')

    return inputs,targets,learning_rate,target_sequence_length,max_target_sequence_length,source_sequence_length

#### Encoder

In [62]:
"""
在Encoder端，我们需要进行两步，第一步要对我们的输入进行Embedding，再把 Embedding 以后的向量传给RNN进行处理。
在Embedding中，我们使用tf.contrib.layers.embed_sequence，它会对每个batch执行embedding操作。
"""

def get_encoder_layer(input_data,rnn_size,num_layers,source_sequence_length,source_vocab_size,encoding_embedding_size):
    """
    构造Encoder层
    参数说明：
    - input_data: 输入tensor
    - rnn_size: rnn 隐层结点数量
    - num_layers:  堆叠的 rnn cell数量
    - source_sequence_length: 源数据的序列长度
    - source_vocab_size: 源数据的词典大小
    - encoding_embedding_size: embedding的大小
    """
    # https://www.tensorflow.org/versions/r1.4/api_docs/python/tf/contrib/layers/embed_sequence
    """
    embed_sequence(
    ids,
    vocab_size=None,
    embed_dim=None,
    unique=False,
    initializer=None,
    regularizer=None,
    trainable=True,
    scope=None,
    reuse=None
    )
    ids: [batch_size, doc_length] Tensor of type int32 or int64 with symbol ids.
    
    return : Tensor of [batch_size, doc_length, embed_dim] with embedded sequences.
    """
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data,source_vocab_size,encoding_embedding_size)

    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))
        return lstm_cell

    cell =  tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])

    encoder_output , encoder_state = tf.nn.dynamic_rnn(cell,encoder_embed_input,sequence_length=source_sequence_length,dtype=tf.float32)

    return encoder_output,encoder_state

#### Decoder

紧接着构造 Decoder 部分，在将 sequence 送入 decoder 之前，需要做一步处理：

In [75]:
def process_decoder_input(data,vocab_to_int,batch_size):

    ending = tf.strided_slice(data,[0,0],[batch_size,-1],[1,1])
    
    decoder_input = tf.concat([tf.fill([batch_size,1],vocab_to_int['<GO>']),ending],1)

    return decoder_input


def decoding_layer(target_letter_to_int,decoding_embedding_size,num_layers,rnn_size,
                   target_sequence_length,max_target_sequence_length,encoder_state,decoder_input):
    '''
    构造Decoder层
    参数：
    - target_letter_to_int: target数据的映射表
    - decoding_embedding_size: embed向量大小
    - num_layers: 堆叠的RNN单元数量
    - rnn_size: RNN单元的隐层结点数量
    - target_sequence_length: target数据序列长度
    - max_target_sequence_length: target数据序列最大长度
    - encoder_state: encoder端编码的状态向量
    - decoder_input: decoder端输入
    '''

    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size,decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings,decoder_input)

    # 2.构造Decoder中的RNN单元
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2))
        return decoder_cell

    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])

    # 3.Output 全连接层
    # target_vocab_size定义了输出层的大小
    output_layer = Dense(target_vocab_size,kernel_initializer=tf.truncated_normal_initializer(mean=0.1,stddev=0.1))


    # 4. Training decoder 用于训练过程
    with tf.variable_scope("decode"):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs = decoder_embed_input,
                                                            sequence_length = target_sequence_length,
                                                            time_major = False)


        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,training_helper,encoder_state,output_layer)
        training_decoder_output,_,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,impute_finished=True,
                                                                        maximum_iterations = max_target_sequence_length)


    # 5. Predicting decoder
    # 与training共享参数
    with tf.variable_scope("decode", reuse=True):
        # 创建一个常量tensor并复制为batch_size的大小
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']],dtype=tf.int32),[batch_size],name='start_token')
        
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,start_tokens,target_letter_to_int['<EOS>'])

        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                             predicting_helper,
                                                             encoder_state,
                                                             output_layer)
        
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder,impute_finished = True,
                                                                          maximum_iterations = max_target_sequence_length)


    return training_decoder_output,predicting_decoder_output

#### Seq2Seq Model

In [63]:
# 构建完 Encoder 和 Decoder，将这两部分连接起来，构建 seq2seq 模型
def seq2seq_model(input_data,targets,lr,target_sequence_length,max_target_sequence_length,
                  source_sequence_length,source_vocab_size,target_vocab_size,encoder_embedding_size,
                  decoder_embedding_size,rnn_size,num_layers):

    _, encoder_state = get_encoder_layer(input_data,
                                        rnn_size,
                                        num_layers,
                                        source_sequence_length,
                                        source_vocab_size,
                                        encoding_embedding_size)

    decoder_input = process_decoder_input(targets,target_letter_to_int,batch_size)

    training_decoder_output, predicting_decoder_output = decoding_layer(target_letter_to_int,
                                                                       decoding_embedding_size,
                                                                       num_layers,
                                                                       rnn_size,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       encoder_state,
                                                                       decoder_input)

    return training_decoder_output,predicting_decoder_output

#### 构造计算图

In [64]:
train_graph = tf.Graph()

with train_graph.as_default():
    
    input_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_inputs()

    training_decoder_output, predicting_decoder_output = seq2seq_model(input_data,
                                                                       targets,
                                                                       lr,
                                                                       target_sequence_length,
                                                                       max_target_sequence_length,
                                                                       source_sequence_length,
                                                                       len(source_letter_to_int),
                                                                       len(target_letter_to_int),
                                                                       encoding_embedding_size,
                                                                       decoding_embedding_size,
                                                                       rnn_size,
                                                                       num_layers)

    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')

    #mask是权重的意思
    #tf.sequence_mask([1, 3, 2], 5)  # [[True, False, False, False, False],
                                #  [True, True, True, False, False],
                                #  [True, True, False, False, False]]
            
    masks = tf.sequence_mask(target_sequence_length,max_target_sequence_length,dtype=tf.float32,name="masks")

    # logits: A Tensor of shape [batch_size, sequence_length, num_decoder_symbols] and dtype float.
    # The logits correspond to the prediction across all classes at each timestep.
    # targets: A Tensor of shape [batch_size, sequence_length] and dtype int.
    # The target represents the true class at each timestep.
    # weights: A Tensor of shape [batch_size, sequence_length] and dtype float.
    # weights constitutes the weighting of each prediction in the sequence. When using weights as masking,
    # set all valid timesteps to 1 and all padded timesteps to 0, e.g. a mask returned by tf.sequence_mask.
    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks
        )
        
        # 使用 AdamOptimizer 来最小化 loss
        optimizer = tf.train.AdamOptimizer(lr)

        # minimize函数用于添加操作节点，用于最小化loss，并更新var_list.
        # 该函数是简单的合并了compute_gradients()与apply_gradients()函数返回为一个优化更新后的var_list，
        # 如果global_step非None，该操作还会为global_step做自增操作

        #这里将minimize拆解为了以下两个部分：

        # 对var_list中的变量计算loss的梯度 该函数为函数minimize()的第一部分，返回一个以元组(gradient, variable)组成的列表
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        
        # 将计算出的梯度应用到变量上，是函数minimize()的第二部分，返回一个应用指定的梯度的操作Operation，对global_step做自增操作
        train_op = optimizer.apply_gradients(capped_gradients)

In [65]:
def pad_sentence_batch(digits_batch,pad_token_id):
    '''
    对 batch 中的 digits序列进行补全，保证 batch 中的每行都有相同的 sequence_length
    参数：
    - sentence batch
    - pad_int: <PAD>对应索引号
    '''
    max_digits = max([len(digits) for digits in digits_seq_batch])
    return [digits + [pad_token_id] * (max_digits - len(digits)) for digits in digits_seq_batch]


def get_batches(targets,sources,batch_size,source_pad_int,target_pad_int):

    for batch_i in range(0,len(sources)//batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i : start_i + batch_size]
        targets_batch = targets[start_i : start_i + batch_size]

        pad_sources_batch = np.array(pad_sentence_batch(sources_batch,source_pad_int))
        pad_targets_batch = np.array(pad_sentence_batch(targets_batch,target_pad_int))

        targets_lengths = []
        for target in targets_batch:
            targets_lengths.append(len(target))

        source_lengths = []
        for source in sources_batch:
            source_lengths.append(len(source))

        yield pad_targets_batch,pad_sources_batch,targets_lengths,source_lengths

#### 训练

In [66]:
train_source = source_int[batch_size:]
train_target = target_int[batch_size:]

# 留出一个 batch 进行验证
valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]

(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']))

display_step = 50

# 模型保存路径
checkpoint = "./checkpoints/trained_model.ckpt"

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # print()
    
    for epoch_i in range(1,epochs+1):
        for batch_i,(targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(get_batches(
            train_target,train_source,batch_size,source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']
        )):
            _,loss = sess.run([train_op,cost],feed_dict={
                input_data:sources_batch,
                targets:targets_batch,
                lr:learning_rate,
                target_sequence_length:targets_lengths,
                source_sequence_length:sources_lengths
            })

            if batch_i % display_step == 0:
                # 计算validation loss
                validation_loss = sess.run(
                    [cost],
                    {input_data: valid_sources_batch,
                     targets: valid_targets_batch,
                     lr: learning_rate,
                     target_sequence_length: valid_targets_lengths,
                     source_sequence_length: valid_sources_lengths})

                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs,
                              batch_i,
                              (len(train_source) // batch_size),
                              loss,
                              validation_loss[0]))

    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model Trained and Saved')


Epoch   1/2 Batch    0/249 - Training Loss:  2.591  - Validation loss:  2.473
Epoch   1/2 Batch   50/249 - Training Loss:  1.964  - Validation loss:  1.962
Epoch   1/2 Batch  100/249 - Training Loss:  1.798  - Validation loss:  1.753
Epoch   1/2 Batch  150/249 - Training Loss:  1.114  - Validation loss:  1.128
Epoch   1/2 Batch  200/249 - Training Loss:  0.681  - Validation loss:  0.683
Epoch   2/2 Batch    0/249 - Training Loss:  0.416  - Validation loss:  0.435
Epoch   2/2 Batch   50/249 - Training Loss:  0.332  - Validation loss:  0.302
Epoch   2/2 Batch  100/249 - Training Loss:  0.250  - Validation loss:  0.227
Epoch   2/2 Batch  150/249 - Training Loss:  0.176  - Validation loss:  0.174
Epoch   2/2 Batch  200/249 - Training Loss:  0.168  - Validation loss:  0.136
Model Trained and Saved


#### 预测

In [73]:
def source_to_seq(text):
    sequence_length = 24
    return [source_letter_to_int.get(word,source_letter_to_int['<UNK>']) for word in text] + [source_letter_to_int['<PAD>']] * (sequence_length - len(text))


#input_word = '73672498981'
input_word = '98574798281131845498'
text = source_to_seq(input_word)

checkpoint = "checkpoints/trained_model.ckpt"
loaded_graph = tf.Graph()

In [81]:
with tf.Session(graph=loaded_graph) as sess:
    
    # load model
    loader = tf.train.import_meta_graph(checkpoint+'.meta')
    
    # restore model
    loader.restore(sess,checkpoint)

    
    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')

    answer_logits = sess.run(logits, {input_data: [text] * batch_size,
                                      target_sequence_length: [len(input_word)] * batch_size,
                                      source_sequence_length: [len(input_word)] * batch_size})[0]

    pad = source_letter_to_int["<PAD>"]

    print('原始输入:', input_word)

    print('\nSource')
    print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text]).replace('<PAD>', '0' )))
    print('  Word 编号:    {}'.format([i for i in text]))
    

    print('\nTarget')
    print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))
    #print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits])))
    print('  Word 编号:       {}'.format([i for i in answer_logits if i != pad]))
    

INFO:tensorflow:Restoring parameters from checkpoints/trained_model.ckpt
原始输入: 98574798281131845498

Source
  Input Words: 9 8 5 7 4 7 9 8 2 8 1 1 3 1 8 4 5 4 9 8 0 0 0 0
  Word 编号:    [12, 5, 11, 6, 4, 6, 12, 5, 9, 5, 8, 8, 7, 8, 5, 4, 11, 4, 12, 5, 0, 0, 0, 0]

Target
  Response Words: 8 9 4 5 4 8 1 3 1 8 1 2 8 9 7 4 5 7 5 9
  Word 编号:       [5, 12, 4, 11, 4, 5, 8, 7, 8, 5, 8, 9, 5, 12, 6, 4, 11, 6, 11, 12]
