diff --git a/08.machine_translation/README.cn.md b/08.machine_translation/README.cn.md index 887eddf2..0e57d9b9 100644 --- a/08.machine_translation/README.cn.md +++ b/08.machine_translation/README.cn.md @@ -185,16 +185,16 @@ is_generating = False ### 模型结构 1. 首先,定义了一些全局变量。 - ```python - dict_size = 30000 # 字典维度 - source_dict_dim = dict_size # 源语言字典维度 - target_dict_dim = dict_size # 目标语言字典维度 - word_vector_dim = 512 # 词向量维度 - encoder_size = 512 # 编码器中的GRU隐层大小 - decoder_size = 512 # 解码器中的GRU隐层大小 - beam_size = 3 # 柱宽度 - max_length = 250 # 生成句子的最大长度 - ``` + ```python + dict_size = 30000 # 字典维度 + source_dict_dim = dict_size # 源语言字典维度 + target_dict_dim = dict_size # 目标语言字典维度 + word_vector_dim = 512 # 词向量维度 + encoder_size = 512 # 编码器中的GRU隐层大小 + decoder_size = 512 # 解码器中的GRU隐层大小 + beam_size = 3 # 柱宽度 + max_length = 250 # 生成句子的最大长度 + ``` 2. 其次,实现编码器框架。分为三步: @@ -209,9 +209,7 @@ is_generating = False ```python src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + input=src_word_id, size=word_vector_dim) ``` - 用双向GRU编码源语言序列,拼接两个GRU的编码结果得到$\mathbf{h}$。 @@ -228,19 +226,22 @@ is_generating = False - 对源语言序列编码后的结果(见2的最后一步),过一个前馈神经网络(Feed Forward Neural Network),得到其映射。 ```python - encoded_proj = paddle.layer.mixed( - size=decoder_size, - input=paddle.layer.full_matrix_projection(encoded_vector)) + encoded_proj = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size, + bias_attr=False, + input=encoded_vector) ``` - 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列,但在0时刻并没有初始值,所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射,作为该初始值,即$c_0=h_T$。 ```python backward_first = paddle.layer.first_seq(input=src_backward) - decoder_boot = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(backward_first)) + decoder_boot = paddle.layer.fc( + size=decoder_size, + act=paddle.activation.Tanh(), + bias_attr=False, + input=backward_first) 
``` - 定义解码阶段每一个时间步的RNN行为,即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$,来预测第$i+1$个词的概率$p_{i+1}$。 @@ -260,12 +261,13 @@ is_generating = False encoded_proj=enc_proj, decoder_state=decoder_mem) - decoder_inputs = paddle.layer.mixed( + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) gru_step = paddle.layer.gru_step( name='gru_decoder', @@ -285,8 +287,8 @@ is_generating = False ```python decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_input2 = paddle.layer.StaticInput(input=encoded_proj) group_inputs = [group_input1, group_input2] ``` @@ -301,7 +303,7 @@ is_generating = False if not is_generating: trg_embedding = paddle.layer.embedding( input=paddle.layer.data( - name='target_language_word', + name='target_language_word', type=paddle.data_type.integer_value_sequence(target_dict_dim)), size=word_vector_dim, param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) @@ -330,14 +332,13 @@ is_generating = False ```python if is_generating: - # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. + # In generation, the decoder predicts a next target word based on + # the encoded source sequence and the previous generated target word. - # The encoded source sequence (encoder's output) must be specified by - # StaticInput, which is a read-only memory. 
- # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. + # The encoded source sequence (encoder's output) must be specified by + # StaticInput, which is a read-only memory. + # Embedding of the previous generated word is automatically retrieved + # by GeneratedInputs initialized by a start mark . trg_embedding = paddle.layer.GeneratedInput( size=target_dict_dim, @@ -468,36 +469,31 @@ is_generating = False ```python if is_generating: - # get the dictionary + # load the dictionary src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list = [] - seq = [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - for i in xrange(gen_num): - print "\n*******************************************************\n" - print "src:", ' '.join( - [src_dict.get(w) for w in gen_data[i][0]]), "\n" + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(gen_data) * beam_size + + # -1 is the delimiter of generated sequences. + # the first element of each generated sequence is its length.
+ start_pos, end_pos = 1, 0 + for i, sample in enumerate(gen_data): + print(" ".join([src_dict[w] for w in sample[0][1:-1]])) for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") ``` 生成开始后,可以观察到输出的日志如下: ```text - src: Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu - - prob = -19.019573: The will be rotated about the width of the seats , while large orders are at stake . - prob = -19.113066: The will be rotated about the width of the seats , while large commands are at stake . - prob = -19.512890: The will be rotated about the width of the seats , while large commands are at play . + Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu + -19.0196 The will be rotated about the width of the seats , while large orders are at stake . + -19.1131 The will be rotated about the width of the seats , while large commands are at stake . + -19.5129 The will be rotated about the width of the seats , while large commands are at play . ``` ## 总结 diff --git a/08.machine_translation/README.md b/08.machine_translation/README.md index 227492ac..065e06e4 100644 --- a/08.machine_translation/README.md +++ b/08.machine_translation/README.md @@ -230,34 +230,32 @@ is_generating = False decoder_size = 512 # hidden layer size of GRU in decoder beam_size = 3 # expand width in beam search max_length = 250 # a stop condition of sequence generation - ``` + ``` 2. Implement Encoder as follows: - Input is a sequence of words represented by an integer word index sequence. So we define data layer of data type `integer_value_sequence`. 
The value range of each element in the sequence is `[0, source_dict_dim)` ```python - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) ``` - Map the one-hot vector (represented by word index) into a word vector $\mathbf{s}$ in a low-dimensional semantic space ```python - src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + src_embedding = paddle.layer.embedding( + input=src_word_id, size=word_vector_dim) ``` - Use bi-direcitonal GRU to encode the source language sequence, and concatenate the encoding outputs from the two GRUs to get $\mathbf{h}$ ```python - src_forward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size) - src_backward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size, reverse=True) - encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) ``` 3. Implement Attention-based Decoder as follows: @@ -265,19 +263,22 @@ is_generating = False - Get a projection of the encoding (c.f. 
2.3) of the source language sequence by passing it into a feed forward neural network ```python - encoded_proj = paddle.layer.mixed( - size=decoder_size, - input=paddle.layer.full_matrix_projection(encoded_vector)) + encoded_proj = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size, + bias_attr=False, + input=encoded_vector) ``` - Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$ ```python backward_first = paddle.layer.first_seq(input=src_backward) - decoder_boot = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(backward_first)) + decoder_boot = paddle.layer.fc( + size=decoder_size, + act=paddle.activation.Tanh(), + bias_attr=False, + input=backward_first) ``` - Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, hidden state for the decoder $z_i$ and the $i$-th word $u_i$ in the target language to predict the probability $p_{i+1}$ for the $i+1$-th word. 
@@ -298,12 +299,13 @@ is_generating = False encoded_proj=enc_proj, decoder_state=decoder_mem) - decoder_inputs = paddle.layer.mixed( + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) gru_step = paddle.layer.gru_step( name='gru_decoder', @@ -311,11 +313,11 @@ is_generating = False output_mem=decoder_mem, size=decoder_size) - out = paddle.layer.mixed( + out = paddle.layer.fc( size=target_dict_dim, bias_attr=True, act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) + input=gru_step) return out ``` @@ -323,8 +325,8 @@ is_generating = False ```python decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_input2 = paddle.layer.StaticInput(input=encoded_proj) group_inputs = [group_input1, group_input2] ``` @@ -369,13 +371,12 @@ is_generating = False ```python if is_generating: # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. + # the encoded source sequence and the previous generated target word. # The encoded source sequence (encoder's output) must be specified by # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. + # Embedding of the previous generated word is automatically retrieved + # by GeneratedInputs initialized by a start mark . 
trg_embedding = paddle.layer.GeneratedInput( size=target_dict_dim, @@ -504,36 +505,31 @@ Note: Our configuration is based on Bahdanau et al. \[[4](#Reference)\] but with ```python if is_generating: - # get the dictionary - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list = [] - seq = [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - for i in xrange(gen_num): - print "\n*******************************************************\n" - print "src:", ' '.join( - [src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] + # load the dictionary + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) + + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(gen_data) * beam_size + + # -1 is the delimiter of generated sequences. + # the first element of each generated sequence is its length. + start_pos, end_pos = 1, 0 + for i, sample in enumerate(gen_data): + print(" ".join([src_dict[w] for w in sample[0][1:-1]])) + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") ``` The generating log is as follows: ```text - src: Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu - - prob = -19.019573: The will be rotated about the width of the seats , while large orders are at stake . - prob = -19.113066: The will be rotated about the width of the seats , while large commands are at stake . - prob = -19.512890: The will be rotated about the width of the seats , while large commands are at play .
+ Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu + -19.0196 The will be rotated about the width of the seats , while large orders are at stake . + -19.1131 The will be rotated about the width of the seats , while large commands are at stake . + -19.5129 The will be rotated about the width of the seats , while large commands are at play . ``` ## Summary diff --git a/08.machine_translation/index.cn.html b/08.machine_translation/index.cn.html index 1b34fc20..f4986021 100644 --- a/08.machine_translation/index.cn.html +++ b/08.machine_translation/index.cn.html @@ -227,16 +227,16 @@ ### 模型结构 1. 首先,定义了一些全局变量。 - ```python - dict_size = 30000 # 字典维度 - source_dict_dim = dict_size # 源语言字典维度 - target_dict_dim = dict_size # 目标语言字典维度 - word_vector_dim = 512 # 词向量维度 - encoder_size = 512 # 编码器中的GRU隐层大小 - decoder_size = 512 # 解码器中的GRU隐层大小 - beam_size = 3 # 柱宽度 - max_length = 250 # 生成句子的最大长度 - ``` + ```python + dict_size = 30000 # 字典维度 + source_dict_dim = dict_size # 源语言字典维度 + target_dict_dim = dict_size # 目标语言字典维度 + word_vector_dim = 512 # 词向量维度 + encoder_size = 512 # 编码器中的GRU隐层大小 + decoder_size = 512 # 解码器中的GRU隐层大小 + beam_size = 3 # 柱宽度 + max_length = 250 # 生成句子的最大长度 + ``` 2. 
其次,实现编码器框架。分为三步: @@ -251,9 +251,7 @@ ```python src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + input=src_word_id, size=word_vector_dim) ``` - 用双向GRU编码源语言序列,拼接两个GRU的编码结果得到$\mathbf{h}$。 @@ -270,19 +268,22 @@ - 对源语言序列编码后的结果(见2的最后一步),过一个前馈神经网络(Feed Forward Neural Network),得到其映射。 ```python - encoded_proj = paddle.layer.mixed( - size=decoder_size, - input=paddle.layer.full_matrix_projection(encoded_vector)) + encoded_proj = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size, + bias_attr=False, + input=encoded_vector) ``` - 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列,但在0时刻并没有初始值,所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射,作为该初始值,即$c_0=h_T$。 ```python backward_first = paddle.layer.first_seq(input=src_backward) - decoder_boot = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(backward_first)) + decoder_boot = paddle.layer.fc( + size=decoder_size, + act=paddle.activation.Tanh(), + bias_attr=False, + input=backward_first) ``` - 定义解码阶段每一个时间步的RNN行为,即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$,来预测第$i+1$个词的概率$p_{i+1}$。 @@ -302,12 +303,13 @@ encoded_proj=enc_proj, decoder_state=decoder_mem) - decoder_inputs = paddle.layer.mixed( + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) gru_step = paddle.layer.gru_step( name='gru_decoder', @@ -327,8 +329,8 @@ ```python decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_input1 = 
paddle.layer.StaticInput(input=encoded_vector) + group_input2 = paddle.layer.StaticInput(input=encoded_proj) group_inputs = [group_input1, group_input2] ``` @@ -343,7 +345,7 @@ if not is_generating: trg_embedding = paddle.layer.embedding( input=paddle.layer.data( - name='target_language_word', + name='target_language_word', type=paddle.data_type.integer_value_sequence(target_dict_dim)), size=word_vector_dim, param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) @@ -372,14 +374,13 @@ ```python if is_generating: - # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. + # In generation, the decoder predicts a next target word based on + # the encoded source sequence and the previous generated target word. - # The encoded source sequence (encoder's output) must be specified by - # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. + # The encoded source sequence (encoder's output) must be specified by + # StaticInput, which is a read-only memory. + # Embedding of the previous generated word is automatically retrieved + # by GeneratedInputs initialized by a start mark . 
trg_embedding = paddle.layer.GeneratedInput( size=target_dict_dim, @@ -510,36 +511,31 @@ ```python if is_generating: - # get the dictionary + # load the dictionary src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list = [] - seq = [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - for i in xrange(gen_num): - print "\n*******************************************************\n" - print "src:", ' '.join( - [src_dict.get(w) for w in gen_data[i][0]]), "\n" + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(gen_data) * beam_size + + # -1 is the delimiter of generated sequences. + # the first element of each generated sequence is its length. + start_pos, end_pos = 1, 0 + for i, sample in enumerate(gen_data): + print(" ".join([src_dict[w] for w in sample[0][1:-1]])) for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") ``` 生成开始后,可以观察到输出的日志如下: ```text - src: Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu - - prob = -19.019573: The will be rotated about the width of the seats , while large orders are at stake . - prob = -19.113066: The will be rotated about the width of the seats , while large commands are at stake . - prob = -19.512890: The will be rotated about the width of the seats , while large commands are at play . + Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu + -19.0196 The will be rotated about the width of the seats , while large orders are at stake .
+ -19.1131 The will be rotated about the width of the seats , while large commands are at stake . + -19.5129 The will be rotated about the width of the seats , while large commands are at play . ``` ## 总结 diff --git a/08.machine_translation/index.html b/08.machine_translation/index.html index 5d58c9dd..e5255745 100644 --- a/08.machine_translation/index.html +++ b/08.machine_translation/index.html @@ -272,34 +272,32 @@ decoder_size = 512 # hidden layer size of GRU in decoder beam_size = 3 # expand width in beam search max_length = 250 # a stop condition of sequence generation - ``` + ``` 2. Implement Encoder as follows: - Input is a sequence of words represented by an integer word index sequence. So we define data layer of data type `integer_value_sequence`. The value range of each element in the sequence is `[0, source_dict_dim)` ```python - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) ``` - Map the one-hot vector (represented by word index) into a word vector $\mathbf{s}$ in a low-dimensional semantic space ```python - src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + src_embedding = paddle.layer.embedding( + input=src_word_id, size=word_vector_dim) ``` - Use bi-direcitonal GRU to encode the source language sequence, and concatenate the encoding outputs from the two GRUs to get $\mathbf{h}$ ```python - src_forward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size) - src_backward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size, reverse=True) - encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) + 
src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) ``` 3. Implement Attention-based Decoder as follows: @@ -307,19 +305,22 @@ - Get a projection of the encoding (c.f. 2.3) of the source language sequence by passing it into a feed forward neural network ```python - encoded_proj = paddle.layer.mixed( - size=decoder_size, - input=paddle.layer.full_matrix_projection(encoded_vector)) + encoded_proj = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size, + bias_attr=False, + input=encoded_vector) ``` - Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$ ```python backward_first = paddle.layer.first_seq(input=src_backward) - decoder_boot = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(backward_first)) + decoder_boot = paddle.layer.fc( + size=decoder_size, + act=paddle.activation.Tanh(), + bias_attr=False, + input=backward_first) ``` - Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, hidden state for the decoder $z_i$ and the $i$-th word $u_i$ in the target language to predict the probability $p_{i+1}$ for the $i+1$-th word. 
@@ -340,12 +341,13 @@ encoded_proj=enc_proj, decoder_state=decoder_mem) - decoder_inputs = paddle.layer.mixed( + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) gru_step = paddle.layer.gru_step( name='gru_decoder', @@ -353,11 +355,11 @@ output_mem=decoder_mem, size=decoder_size) - out = paddle.layer.mixed( + out = paddle.layer.fc( size=target_dict_dim, bias_attr=True, act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) + input=gru_step) return out ``` @@ -365,8 +367,8 @@ ```python decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_input2 = paddle.layer.StaticInput(input=encoded_proj) group_inputs = [group_input1, group_input2] ``` @@ -411,13 +413,12 @@ ```python if is_generating: # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. + # the encoded source sequence and the previous generated target word. # The encoded source sequence (encoder's output) must be specified by # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. + # Embedding of the previous generated word is automatically retrieved + # by GeneratedInputs initialized by a start mark . 
trg_embedding = paddle.layer.GeneratedInput( size=target_dict_dim, @@ -546,36 +547,31 @@ ```python if is_generating: - # get the dictionary - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list = [] - seq = [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - for i in xrange(gen_num): - print "\n*******************************************************\n" - print "src:", ' '.join( - [src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] + # load the dictionary + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) + + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(gen_data) * beam_size + + # -1 is the delimiter of generated sequences. + # the first element of each generated sequence is its length. + start_pos, end_pos = 1, 0 + for i, sample in enumerate(gen_data): + print(" ".join([src_dict[w] for w in sample[0][1:-1]])) + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") ``` The generating log is as follows: ```text - src: Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu - - prob = -19.019573: The will be rotated about the width of the seats , while large orders are at stake . - prob = -19.113066: The will be rotated about the width of the seats , while large commands are at stake . - prob = -19.512890: The will be rotated about the width of the seats , while large commands are at play .
+ Les se au sujet de la largeur des sièges alors que de grosses commandes sont en jeu + -19.0196 The will be rotated about the width of the seats , while large orders are at stake . + -19.1131 The will be rotated about the width of the seats , while large commands are at stake . + -19.5129 The will be rotated about the width of the seats , while large commands are at play . ``` ## Summary diff --git a/08.machine_translation/train.py b/08.machine_translation/train.py index a1394366..8f8af654 100644 --- a/08.machine_translation/train.py +++ b/08.machine_translation/train.py @@ -1,25 +1,31 @@ import sys +import gzip +import numpy as np import paddle.v2 as paddle -def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): +def save_model(parameters, save_path): + with gzip.open(save_path, 'w') as f: + parameters.to_tar(f) + + +def seq_to_seq_net(source_dict_dim, + target_dict_dim, + is_generating, + beam_size=3, + max_length=250): ### Network Architecture word_vector_dim = 512 # dimension of word vector - decoder_size = 512 # dimension of hidden unit in GRU Decoder network - encoder_size = 512 # dimension of hidden unit in GRU Encoder network - - beam_size = 3 - max_length = 250 + decoder_size = 512 # dimension of hidden unit of GRU decoder + encoder_size = 512 # dimension of hidden unit of GRU encoder #### Encoder src_word_id = paddle.layer.data( name='source_language_word', type=paddle.data_type.integer_value_sequence(source_dict_dim)) src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + input=src_word_id, size=word_vector_dim) src_forward = paddle.networks.simple_gru( input=src_embedding, size=encoder_size) src_backward = paddle.networks.simple_gru( @@ -27,16 +33,19 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) #### Decoder - encoded_proj = 
paddle.layer.mixed( + encoded_proj = paddle.layer.fc( + act=paddle.activation.Linear(), size=decoder_size, - input=paddle.layer.full_matrix_projection(encoded_vector)) + bias_attr=False, + input=encoded_vector) backward_first = paddle.layer.first_seq(input=src_backward) - decoder_boot = paddle.layer.mixed( + decoder_boot = paddle.layer.fc( size=decoder_size, act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(backward_first)) + bias_attr=False, + input=backward_first) def gru_decoder_with_attention(enc_vec, enc_proj, current_word): @@ -48,12 +57,13 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): encoded_proj=enc_proj, decoder_state=decoder_mem) - decoder_inputs = paddle.layer.mixed( + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) gru_step = paddle.layer.gru_step( name='gru_decoder', @@ -61,16 +71,16 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): output_mem=decoder_mem, size=decoder_size) - out = paddle.layer.mixed( + out = paddle.layer.fc( size=target_dict_dim, bias_attr=True, act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) + input=gru_step) return out - decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + decoder_group_name = 'decoder_group' + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_input2 = paddle.layer.StaticInput(input=encoded_proj) group_inputs = [group_input1, group_input2] if not is_generating: @@ -100,13 +110,12 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): return 
cost else: # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. + # the encoded source sequence and the previous generated target word. # The encoded source sequence (encoder's output) must be specified by # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. + # Embedding of the previous generated word is automatically retrieved + # by GeneratedInputs initialized by a start mark . trg_embedding = paddle.layer.GeneratedInput( size=target_dict_dim, @@ -136,32 +145,43 @@ def main(): # train the network if not is_generating: - cost = seqToseq_net(source_dict_dim, target_dict_dim) - parameters = paddle.parameters.create(cost) - # define optimize method and trainer optimizer = paddle.optimizer.Adam( learning_rate=5e-5, regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + + cost = seq_to_seq_net(source_dict_dim, target_dict_dim, is_generating) + parameters = paddle.parameters.create(cost) + trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) # define data reader wmt14_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size), buf_size=8192), - batch_size=5) + batch_size=4) # define event_handler callback def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, - event.metrics) + print("\nPass %d, Batch %d, Cost %f, %s" % + (event.pass_id, event.batch_id, event.cost, + event.metrics)) else: sys.stdout.write('.') sys.stdout.flush() + if not event.batch_id % 10: + save_path = 'params_pass_%05d_batch_%05d.tar.gz' % ( + event.pass_id, event.batch_id) + save_model(parameters, save_path) + + if isinstance(event, 
paddle.event.EndPass): # save parameters save_path = 'params_pass_%05d.tar.gz' % (event.pass_id) save_model(parameters, save_path) # start to train trainer.train( reader=wmt14_reader, event_handler=event_handler, num_passes=2) @@ -169,17 +189,20 @@ def event_handler(event): # generate a english sequence to french else: # use the first 3 samples for generation - gen_creator = paddle.dataset.wmt14.gen(dict_size) gen_data = [] gen_num = 3 - for item in gen_creator(): - gen_data.append((item[0], )) + for item in paddle.dataset.wmt14.gen(dict_size)(): + gen_data.append([item[0]]) if len(gen_data) == gen_num: break - beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating) - # get the pretrained model, whose bleu = 26.92 + beam_size = 3 + beam_gen = seq_to_seq_net(source_dict_dim, target_dict_dim, + is_generating, beam_size) + + # get the trained model, whose bleu = 26.92 parameters = paddle.dataset.wmt14.model() + # prob is the prediction probabilities, and id is the prediction word. beam_result = paddle.infer( output_layer=beam_gen, @@ -187,28 +210,25 @@ def event_handler(event): input=gen_data, field=['prob', 'id']) - # get the dictionary + # load the dictionary src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list = [] - seq = [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - beam_size = 3 - for i in xrange(gen_num): - print "\n*******************************************************\n" - print "src:", ' '.join( - [src_dict.get(w) for w in gen_data[i][0]]), "\n" + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(gen_data) * beam_size + + # -1 is the delimiter of generated sequences. + # the first element of each generated sequence is its length.
+ start_pos, end_pos = 1, 0 + for i, sample in enumerate(gen_data): + print( + " ".join([src_dict[w] for w in sample[0][1:-1]]) + ) # skip the start and ending mark when printing the source sentence for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") if __name__ == '__main__':