This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Refine Chapter 8 WMT #339

Merged · 3 commits · Jul 10, 2017
112 changes: 54 additions & 58 deletions 08.machine_translation/README.cn.md
@@ -185,16 +185,16 @@ is_generating = False
### Model Architecture
1. First, define some global variables.

```python
dict_size = 30000 # dictionary dimension
source_dict_dim = dict_size # source language dictionary dimension
target_dict_dim = dict_size # target language dictionary dimension
word_vector_dim = 512 # word vector dimension
encoder_size = 512 # hidden layer size of GRU in encoder
decoder_size = 512 # hidden layer size of GRU in decoder
beam_size = 3 # beam width
max_length = 250 # maximum length of generated sentences
```

2. Next, implement the encoder framework in three steps:

@@ -209,9 +209,7 @@ is_generating = False

```python
 src_embedding = paddle.layer.embedding(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    input=src_word_id, size=word_vector_dim)
```
- Encode the source language sequence with a bi-directional GRU, and concatenate the outputs of the two GRUs to obtain $\mathbf{h}$.

@@ -228,19 +226,22 @@ is_generating = False
- Pass the encoded source language sequence (see the last step of 2) through a feed-forward neural network to obtain its projection.

```python
-encoded_proj = paddle.layer.mixed(
-    size=decoder_size,
-    input=paddle.layer.full_matrix_projection(encoded_vector))
+encoded_proj = paddle.layer.fc(
+    act=paddle.activation.Linear(),
+    size=decoder_size,
+    bias_attr=False,
+    input=encoded_vector)
```

- Construct the initial state of the decoder RNN. The decoder predicts a target sequence over time but has no value at time step 0, so we initialize it with a non-linear mapping of the last state of the source sequence encoded in reverse order, i.e. $c_0=h_T$ (restated as an equation after the snippet below).

```python
backward_first = paddle.layer.first_seq(input=src_backward)
-decoder_boot = paddle.layer.mixed(
-    size=decoder_size,
-    act=paddle.activation.Tanh(),
-    input=paddle.layer.full_matrix_projection(backward_first))
+decoder_boot = paddle.layer.fc(
+    size=decoder_size,
+    act=paddle.activation.Tanh(),
+    bias_attr=False,
+    input=backward_first)
```
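
    In equation form, both versions of the snippet compute the same initialization (a restatement under assumed notation: $W_b$ is the layer's learned weight matrix, and $\overleftarrow{h}_1$ is the first output of the backward GRU, i.e. its state after reading the whole source sentence in reverse):

    $$z_0 = \tanh\left(W_b \overleftarrow{h}_1\right)$$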

- Define the RNN behavior at each time step of the decoding stage, i.e., use the current source language context vector $c_i$, the decoder hidden state $z_i$, and the $i$-th word $u_i$ of the target language to predict the probability $p_{i+1}$ of the $(i+1)$-th word.
@@ -260,12 +261,13 @@ is_generating = False
encoded_proj=enc_proj,
decoder_state=decoder_mem)

-decoder_inputs = paddle.layer.mixed(
+decoder_inputs = paddle.layer.fc(
+    act=paddle.activation.Linear(),
     size=decoder_size * 3,
-    input=[
-        paddle.layer.full_matrix_projection(input=context),
-        paddle.layer.full_matrix_projection(input=current_word)
-    ])
+    bias_attr=False,
+    input=[context, current_word],
+    layer_attr=paddle.attr.ExtraLayerAttribute(
+        error_clipping_threshold=100.0))

gru_step = paddle.layer.gru_step(
name='gru_decoder',
@@ -285,8 +287,8 @@ is_generating = False

```python
decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+group_input2 = paddle.layer.StaticInput(input=encoded_proj)
group_inputs = [group_input1, group_input2]
```

@@ -301,7 +303,7 @@ is_generating = False
if not is_generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
            name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
@@ -330,14 +332,13 @@ is_generating = False

```python
if is_generating:
-# In generation, the decoder predicts a next target word based on
-# the encoded source sequence and the last generated target word.
+# In generation, the decoder predicts a next target word based on
+# the encoded source sequence and the previous generated target word.

-# The encoded source sequence (encoder's output) must be specified by
-# StaticInput, which is a read-only memory.
-# Embedding of the last generated word is automatically gotten by
-# GeneratedInputs, which is initialized by a start mark, such as <s>,
-# and must be included in generation.
+# The encoded source sequence (encoder's output) must be specified by
+# StaticInput, which is a read-only memory.
+# Embedding of the previous generated word is automatically retrieved
+# by GeneratedInputs initialized by a start mark <s>.

trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
@@ -468,36 +469,31 @@ is_generating = False

```python
if is_generating:
-    # get the dictionary
+    # load the dictionary
     src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

-    # the delimited element of generated sequences is -1,
-    # the first element of each generated sequence is the sequence length
-    seq_list = []
-    seq = []
-    for w in beam_result[1]:
-        if w != -1:
-            seq.append(w)
-        else:
-            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-            seq = []

-    prob = beam_result[0]
-    for i in xrange(gen_num):
-        print "\n*******************************************************\n"
-        print "src:", ' '.join(
-            [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+    gen_sen_idx = np.where(beam_result[1] == -1)[0]
+    assert len(gen_sen_idx) == len(gen_data) * beam_size

+    # -1 is the delimiter of generated sequences.
+    # the first element of each generated sequence is its length.
+    start_pos, end_pos = 1, 0
+    for i, sample in enumerate(gen_data):
+        print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
         for j in xrange(beam_size):
-            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+            end_pos = gen_sen_idx[i * beam_size + j]
+            print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+            start_pos = end_pos + 2
+        print("\n")
```
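
To make the layout of `beam_result` concrete, here is a minimal self-contained sketch of the parsing logic above. It assumes the layout described in the comments: `beam_result[0]` holds one row of candidate scores per source sentence, and `beam_result[1]` is a flat id array in which each candidate is stored as `[length, w1, ..., wn, -1]`; the `toy_dict`, `probs`, and `ids` values are hypothetical.

```python
import numpy as np

# Hypothetical beam output for ONE source sentence with beam_size = 2.
toy_dict = {1: "<e>", 2: "hello", 3: "world"}
probs = np.array([[-1.5, -2.3]])      # one row of scores per source sentence
ids = np.array([3, 2, 3, 1, -1,       # candidate 0: "hello world <e>"
                3, 3, 2, 1, -1])      # candidate 1: "world hello <e>"

delim_idx = np.where(ids == -1)[0]    # positions of the -1 delimiters
start_pos = 1                         # skip the leading length element
for j, end_pos in enumerate(delim_idx):
    words = " ".join(toy_dict[w] for w in ids[start_pos:end_pos])
    print("%.4f\t%s" % (probs[0][j], words))
    start_pos = end_pos + 2           # skip the -1 and the next length element
```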

Once generation starts, the output log looks as follows:
```text
-src: <s> Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu <e>
-
-prob = -19.019573: The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
-prob = -19.113066: The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
-prob = -19.512890: The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
+Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+-19.0196 The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
+-19.1131 The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
+-19.5129 The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
```

## Summary
114 changes: 55 additions & 59 deletions 08.machine_translation/README.md
@@ -230,54 +230,55 @@ is_generating = False
decoder_size = 512 # hidden layer size of GRU in decoder
beam_size = 3 # expand width in beam search
max_length = 250 # a stop condition of sequence generation
```

2. Implement Encoder as follows:
- The input is a sequence of words represented by an integer word-index sequence, so we define a data layer of type `integer_value_sequence`. The value range of each element in the sequence is `[0, source_dict_dim)`.

```python
src_word_id = paddle.layer.data(
    name='source_language_word',
    type=paddle.data_type.integer_value_sequence(source_dict_dim))
```

- Map the one-hot vector (represented by word index) into a word vector $\mathbf{s}$ in a low-dimensional semantic space (a conceptual sketch follows the snippet below)

```python
-src_embedding = paddle.layer.embedding(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+src_embedding = paddle.layer.embedding(
+    input=src_word_id, size=word_vector_dim)
```
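
    As a conceptual aside, an embedding layer is just a lookup into a learned `vocab × dim` table: each word index selects one row. A plain-NumPy sketch (not the PaddlePaddle API; the index values are hypothetical):

    ```python
    import numpy as np

    vocab_size, dim = 30000, 512          # source_dict_dim, word_vector_dim
    emb_table = np.random.randn(vocab_size, dim).astype("float32")  # learned in practice

    word_ids = np.array([12, 7, 401])     # an integer word-index sequence
    src_embedding = emb_table[word_ids]   # shape (3, 512): one vector per word
    print(src_embedding.shape)
    ```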

- Use a bi-directional GRU to encode the source language sequence, and concatenate the encoding outputs from the two GRUs to get $\mathbf{h}$

```python
src_forward = paddle.networks.simple_gru(
    input=src_embedding, size=encoder_size)
src_backward = paddle.networks.simple_gru(
    input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
```

3. Implement Attention-based Decoder as follows:

- Get a projection of the encoding (cf. the last step of 2) of the source language sequence by passing it through a feed-forward neural network

```python
-encoded_proj = paddle.layer.mixed(
-    size=decoder_size,
-    input=paddle.layer.full_matrix_projection(encoded_vector))
+encoded_proj = paddle.layer.fc(
+    act=paddle.activation.Linear(),
+    size=decoder_size,
+    bias_attr=False,
+    input=encoded_vector)
```

- Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN, i.e. $c_0=h_T$

```python
backward_first = paddle.layer.first_seq(input=src_backward)
-decoder_boot = paddle.layer.mixed(
-    size=decoder_size,
-    act=paddle.activation.Tanh(),
-    input=paddle.layer.full_matrix_projection(backward_first))
+decoder_boot = paddle.layer.fc(
+    size=decoder_size,
+    act=paddle.activation.Tanh(),
+    bias_attr=False,
+    input=backward_first)
```

- Define the computation at each time step for the decoder RNN, i.e., use the current context vector $c_i$, the decoder hidden state $z_i$, and the $i$-th word $u_i$ in the target language to predict the probability $p_{i+1}$ of the $(i+1)$-th word.
@@ -298,33 +299,34 @@ is_generating = False
encoded_proj=enc_proj,
decoder_state=decoder_mem)

-decoder_inputs = paddle.layer.mixed(
+decoder_inputs = paddle.layer.fc(
+    act=paddle.activation.Linear(),
     size=decoder_size * 3,
-    input=[
-        paddle.layer.full_matrix_projection(input=context),
-        paddle.layer.full_matrix_projection(input=current_word)
-    ])
+    bias_attr=False,
+    input=[context, current_word],
+    layer_attr=paddle.attr.ExtraLayerAttribute(
+        error_clipping_threshold=100.0))

gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)

-out = paddle.layer.mixed(
+out = paddle.layer.fc(
     size=target_dict_dim,
     bias_attr=True,
     act=paddle.activation.Softmax(),
-    input=paddle.layer.full_matrix_projection(input=gru_step))
+    input=gru_step)
return out
```

4. Define the name for the decoder and the first two inputs of `gru_decoder_with_attention`. Note that `StaticInput` is used for both inputs (a sketch of how they are consumed follows the snippet below). Please refer to [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.

```python
decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+group_input2 = paddle.layer.StaticInput(input=encoded_proj)
group_inputs = [group_input1, group_input2]
```
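
   The surrounding lines are collapsed in this diff, so for context here is a minimal sketch of how such read-only inputs are typically consumed, assuming the `trg_embedding` layer defined later in this README; treat it as an illustration rather than part of the change:

   ```python
   # Sketch: the two StaticInputs plus the current target-word embedding feed
   # gru_decoder_with_attention at every time step of the recurrent group.
   group_inputs.append(trg_embedding)
   decoder = paddle.layer.recurrent_group(
       name=decoder_group_name,
       step=gru_decoder_with_attention,
       input=group_inputs)
   ```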

@@ -369,13 +371,12 @@ is_generating = False
```python
if is_generating:
# In generation, the decoder predicts a next target word based on
-# the encoded source sequence and the last generated target word.
+# the encoded source sequence and the previous generated target word.

# The encoded source sequence (encoder's output) must be specified by
# StaticInput, which is a read-only memory.
-# Embedding of the last generated word is automatically gotten by
-# GeneratedInputs, which is initialized by a start mark, such as <s>,
-# and must be included in generation.
+# Embedding of the previous generated word is automatically retrieved
+# by GeneratedInputs initialized by a start mark <s>.

trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
@@ -504,36 +505,31 @@ Note: Our configuration is based on Bahdanau et al. \[[4](#Reference)\] but with

```python
if is_generating:
-    # get the dictionary
-    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-
-    # the delimited element of generated sequences is -1,
-    # the first element of each generated sequence is the sequence length
-    seq_list = []
-    seq = []
-    for w in beam_result[1]:
-        if w != -1:
-            seq.append(w)
-        else:
-            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-            seq = []
-
-    prob = beam_result[0]
-    for i in xrange(gen_num):
-        print "\n*******************************************************\n"
-        print "src:", ' '.join(
-            [src_dict.get(w) for w in gen_data[i][0]]), "\n"
-        for j in xrange(beam_size):
-            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+    # load the dictionary
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+    gen_sen_idx = np.where(beam_result[1] == -1)[0]
+    assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+    # -1 is the delimiter of generated sequences.
+    # the first element of each generated sequence is its length.
+    start_pos, end_pos = 1, 0
+    for i, sample in enumerate(gen_data):
+        print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
+        for j in xrange(beam_size):
+            end_pos = gen_sen_idx[i * beam_size + j]
+            print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+            start_pos = end_pos + 2
+        print("\n")
```

The generation log is as follows:
```text
-src: <s> Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu <e>
-
-prob = -19.019573: The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
-prob = -19.113066: The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
-prob = -19.512890: The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
+Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+-19.0196 The <unk> will be rotated about the width of the seats , while large orders are at stake . <e>
+-19.1131 The <unk> will be rotated about the width of the seats , while large commands are at stake . <e>
+-19.5129 The <unk> will be rotated about the width of the seats , while large commands are at play . <e>
```

## Summary