diff --git a/fluid/neural_machine_translation/rnn_search/README.md b/fluid/neural_machine_translation/rnn_search/README.md
new file mode 100644
index 0000000000..6a20f8ce94
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/README.md
@@ -0,0 +1,13 @@
+The minimum PaddlePaddle version needed for the code sample in this directory is the latest develop branch. If you are on a version of PaddlePaddle earlier than this, [please update your installation](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html).
+
+### RNN Search
+
+Training
+```
+python train.py
+```
+
+---
+### TODO
+
+This project is still under active development.
diff --git a/fluid/neural_machine_translation/rnn_search/config.py b/fluid/neural_machine_translation/rnn_search/config.py
new file mode 100644
index 0000000000..dd2a81e371
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/config.py
@@ -0,0 +1,19 @@
+class TrainConfig(object):
+    use_gpu = False
+    infer_only = False
+    parallel = False
+    batch_size = 16
+    pass_num = 5
+    learning_rate = 0.0002
+    buf_size = 100000
+
+
+class ModelConfig(object):
+    embedding_dim = 512
+    encoder_size = 512
+    decoder_size = 512
+    source_dict_dim = 10000
+    target_dict_dim = 10000
+    is_generating = False
+    beam_size = 3
+    max_length = 250
diff --git a/fluid/neural_machine_translation/rnn_search/model.py b/fluid/neural_machine_translation/rnn_search/model.py
new file mode 100644
index 0000000000..bed95f7ee2
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/model.py
@@ -0,0 +1,141 @@
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.fluid as fluid
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def seq_to_seq_net(src_word_idx, trg_word_idx, label, embedding_dim,
+                   encoder_size, decoder_size, source_dict_dim, target_dict_dim,
+                   is_generating, beam_size, max_length):
+    """Construct a seq2seq network."""
+
+    def bi_lstm_encoder(input_seq, gate_size):
+        # Linear transformation part for input gate, output gate, forget gate
+        # and cell activation vectors need be done outside of dynamic_lstm.
+        # So the output size is 4 times of gate_size.
+        input_forward_proj = fluid.layers.fc(input=input_seq,
+                                             size=gate_size * 4,
+                                             act=None,
+                                             bias_attr=False)
+        forward, _ = fluid.layers.dynamic_lstm(
+            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+        input_reversed_proj = fluid.layers.fc(input=input_seq,
+                                              size=gate_size * 4,
+                                              act=None,
+                                              bias_attr=False)
+        reversed, _ = fluid.layers.dynamic_lstm(
+            input=input_reversed_proj,
+            size=gate_size * 4,
+            is_reverse=True,
+            use_peepholes=False)
+        return forward, reversed
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward, src_reversed = bi_lstm_encoder(
+        input_seq=src_embedding, gate_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward, src_reversed], axis=1)
+
+    encoded_proj = fluid.layers.fc(input=encoded_vector,
+                                   size=decoder_size,
+                                   bias_attr=False)
+
+    backward_first = fluid.layers.sequence_pool(
+        input=src_reversed, pool_type='first')
+
+    decoder_boot = fluid.layers.fc(input=backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+                                    decoder_boot, decoder_size):
+        def simple_attention(encoder_vec, encoder_proj, decoder_state):
+            decoder_state_proj = fluid.layers.fc(input=decoder_state,
+                                                 size=decoder_size,
+                                                 bias_attr=False)
+            decoder_state_expand = fluid.layers.sequence_expand(
+                x=decoder_state_proj, y=encoder_proj)
+            concated = fluid.layers.concat(
+                input=[encoder_proj, decoder_state_expand], axis=1)
+            attention_weights = fluid.layers.fc(input=concated,
+                                                size=1,
+                                                act='tanh',
+                                                bias_attr=False)
+            attention_weights = fluid.layers.sequence_softmax(attention_weights)
+            weigths_reshape = fluid.layers.reshape(
+                x=attention_weights, shape=[-1])
+            scaled = fluid.layers.elementwise_mul(
+                x=encoder_vec, y=weigths_reshape, axis=0)
+            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+            return context
+
+        rnn = fluid.layers.DynamicRNN()
+
+        cell_init = fluid.layers.fill_constant_batch_size_like(
+            input=decoder_boot,
+            value=0.0,
+            shape=[-1, decoder_size],
+            dtype='float32')
+        cell_init.stop_gradient = False
+
+        with rnn.block():
+            current_word = rnn.step_input(target_embedding)
+            encoder_vec = rnn.static_input(encoder_vec)
+            encoder_proj = rnn.static_input(encoder_proj)
+            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+            cell_mem = rnn.memory(init=cell_init)
+            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+            decoder_inputs = fluid.layers.concat(
+                input=[context, current_word], axis=1)
+            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+            rnn.update_memory(hidden_mem, h)
+            rnn.update_memory(cell_mem, c)
+            out = fluid.layers.fc(input=h,
+                                  size=target_dict_dim,
+                                  bias_attr=True,
+                                  act='softmax')
+            rnn.output(out)
+        return rnn()
+
+    if not is_generating:
+        trg_embedding = fluid.layers.embedding(
+            input=trg_word_idx,
+            size=[target_dict_dim, embedding_dim],
+            dtype='float32')
+
+        prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+                                                 encoded_proj, decoder_boot,
+                                                 decoder_size)
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        return avg_cost
diff --git a/fluid/neural_machine_translation/rnn_search/train.py b/fluid/neural_machine_translation/rnn_search/train.py
new file mode 100644
index 0000000000..9a581473ee
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/train.py
@@ -0,0 +1,172 @@
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+from model import seq_to_seq_net
+from config import TrainConfig as train_conf
+from config import ModelConfig as model_conf
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    lod_t = core.LoDTensor()
+    lod_t.set(flattened_data, place)
+    lod_t.set_lod([lod])
+    return lod_t, lod[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+    dims = lod_tensor.get_dims()
+    ndarray = np.zeros(shape=dims).astype('float32')
+    for i in range(np.prod(dims)):
+        ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+    return ndarray
+
+
+def train():
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+    trg_word_idx = fluid.layers.data(
+        name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+    label = fluid.layers.data(
+        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+    if train_conf.parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            src_word_idx_ = pd.read_input(src_word_idx)
+            trg_word_idx_ = pd.read_input(trg_word_idx)
+            label_ = pd.read_input(label)
+            avg_cost = seq_to_seq_net(
+                src_word_idx_,
+                trg_word_idx_,
+                label_,
+                embedding_dim=model_conf.embedding_dim,
+                encoder_size=model_conf.encoder_size,
+                decoder_size=model_conf.decoder_size,
+                source_dict_dim=model_conf.source_dict_dim,
+                target_dict_dim=model_conf.target_dict_dim,
+                is_generating=model_conf.is_generating,
+                beam_size=model_conf.beam_size,
+                max_length=model_conf.max_length)
+            pd.write_output(avg_cost)
+        avg_cost = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+    else:
+        avg_cost = seq_to_seq_net(
+            src_word_idx,
+            trg_word_idx,
+            label,
+            embedding_dim=model_conf.embedding_dim,
+            encoder_size=model_conf.encoder_size,
+            decoder_size=model_conf.decoder_size,
+            source_dict_dim=model_conf.source_dict_dim,
+            target_dict_dim=model_conf.target_dict_dim,
+            is_generating=model_conf.is_generating,
+            beam_size=model_conf.beam_size,
+            max_length=model_conf.max_length)
+
+    feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+    # clone from default main program
+    inference_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=train_conf.learning_rate)
+    optimizer.minimize(avg_cost)
+
+    train_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt16.train(model_conf.source_dict_dim,
+                                       model_conf.target_dict_dim),
+            buf_size=train_conf.buf_size),
+        batch_size=train_conf.batch_size)
+
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt16.test(model_conf.source_dict_dim,
+                                      model_conf.target_dict_dim),
+            buf_size=train_conf.buf_size),
+        batch_size=train_conf.batch_size)
+
+    place = core.CUDAPlace(0) if train_conf.use_gpu else core.CPUPlace()
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    def do_validation():
+        total_loss = 0.0
+        count = 0
+        for batch_id, data in enumerate(test_batch_generator()):
+            src_seq = to_lodtensor([x[0] for x in data], place)[0]
+            trg_seq = to_lodtensor([x[1] for x in data], place)[0]
+            lbl_seq = to_lodtensor([x[2] for x in data], place)[0]
+
+            fetch_outs = exe.run(inference_program,
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost],
+                                 return_numpy=False)
+
+            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+            count += 1
+
+        return total_loss / count
+
+    for pass_id in range(train_conf.pass_num):
+        pass_start_time = time.time()
+        words_seen = 0
+        for batch_id, data in enumerate(train_batch_generator()):
+            src_seq, word_num = to_lodtensor([x[0] for x in data], place)
+            words_seen += word_num
+            trg_seq, word_num = to_lodtensor([x[1] for x in data], place)
+            words_seen += word_num
+            lbl_seq, _ = to_lodtensor([x[2] for x in data], place)
+
+            fetch_outs = exe.run(framework.default_main_program(),
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost])
+
+            avg_cost_val = np.array(fetch_outs[0])
+            print('pass_id=%d, batch_id=%d, train_loss: %f' %
+                  (pass_id, batch_id, avg_cost_val))
+
+        pass_end_time = time.time()
+        test_loss = do_validation()
+        time_consumed = pass_end_time - pass_start_time
+        words_per_sec = words_seen / time_consumed
+        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+              (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+    pass
+
+
+if __name__ == '__main__':
+    if train_conf.infer_only:
+        infer()
+    else:
+        train()