diff --git a/sequence_tagging_for_ner/network_conf.py b/sequence_tagging_for_ner/network_conf.py
index a2a8f8bea3..7074ae96d8 100644
--- a/sequence_tagging_for_ner/network_conf.py
+++ b/sequence_tagging_for_ner/network_conf.py
@@ -4,31 +4,7 @@
 import paddle.v2.evaluator as evaluator
 
 
-def stacked_rnn(input_layer,
-                hidden_size,
-                hidden_para_attr,
-                rnn_para_attr,
-                stack_num=3,
-                reverse=False):
-    for i in range(stack_num):
-        hidden = paddle.layer.fc(
-            size=hidden_size,
-            act=paddle.activation.Tanh(),
-            bias_attr=paddle.attr.Param(initial_std=1.),
-            input=[input_layer] if not i else [hidden, rnn],
-            param_attr=[rnn_para_attr]
-            if not i else [hidden_para_attr, rnn_para_attr])
-
-        rnn = paddle.layer.recurrent(
-            input=hidden,
-            act=paddle.activation.Relu(),
-            bias_attr=paddle.attr.Param(initial_std=1.),
-            reverse=reverse,
-            param_attr=rnn_para_attr)
-    return hidden, rnn
-
-
-def ner_net(word_dict_len, label_dict_len, stack_num=3, is_train=True):
+def ner_net(word_dict_len, label_dict_len, stack_num=2, is_train=True):
     mark_dict_len = 2
     word_dim = 50
     mark_dim = 5
@@ -51,37 +27,55 @@ def ner_net(word_dict_len, label_dict_len, stack_num=3, is_train=True):
         size=mark_dim,
         param_attr=paddle.attr.Param(initial_std=math.sqrt(1. / word_dim)))
 
-    emb_layers = [word_embedding, mark_embedding]
-
-    word_caps_vector = paddle.layer.concat(input=emb_layers)
+    word_caps_vector = paddle.layer.concat(
+        input=[word_embedding, mark_embedding])
 
     mix_hidden_lr = 1e-3
     rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
     hidden_para_attr = paddle.attr.Param(
         initial_std=1 / math.sqrt(hidden_dim), learning_rate=mix_hidden_lr)
 
-    forward_hidden, rnn_forward = stacked_rnn(word_caps_vector, hidden_dim,
-                                              hidden_para_attr, rnn_para_attr)
-    backward_hidden, rnn_backward = stacked_rnn(
-        word_caps_vector,
-        hidden_dim,
-        hidden_para_attr,
-        rnn_para_attr,
-        reverse=True)
-
-    fea = paddle.layer.fc(
+    # the first rnn layer shares the input-to-hidden mappings.
+    hidden = paddle.layer.fc(
+        name="__hidden00__",
+        size=hidden_dim,
+        act=paddle.activation.Tanh(),
+        bias_attr=paddle.attr.Param(initial_std=1.),
+        input=word_caps_vector,
+        param_attr=hidden_para_attr)
+
+    fea = []
+    for direction in ["fwd", "bwd"]:
+        for i in range(stack_num):
+            if i:
+                hidden = paddle.layer.fc(
+                    name="__hidden%02d_%s__" % (i, direction),
+                    size=hidden_dim,
+                    act=paddle.activation.STanh(),
+                    bias_attr=paddle.attr.Param(initial_std=1.),
+                    input=[hidden, rnn],
+                    param_attr=[hidden_para_attr, rnn_para_attr])
+
+            rnn = paddle.layer.recurrent(
+                name="__rnn%02d_%s__" % (i, direction),
+                input=hidden,
+                act=paddle.activation.Relu(),
+                bias_attr=paddle.attr.Param(initial_std=1.),
+                reverse=i % 2 if direction == "fwd" else not i % 2,
+                param_attr=rnn_para_attr)
+        fea += [hidden, rnn]
+
+    rnn_fea = paddle.layer.fc(
         size=hidden_dim,
         bias_attr=paddle.attr.Param(initial_std=1.),
         act=paddle.activation.STanh(),
-        input=[forward_hidden, rnn_forward, backward_hidden, rnn_backward],
-        param_attr=[
-            hidden_para_attr, rnn_para_attr, hidden_para_attr, rnn_para_attr
-        ])
+        input=fea,
+        param_attr=[hidden_para_attr, rnn_para_attr] * 2)
 
     emission = paddle.layer.fc(
         size=label_dict_len,
         bias_attr=False,
-        input=fea,
+        input=rnn_fea,
         param_attr=rnn_para_attr)
 
     if is_train:
diff --git a/sequence_tagging_for_ner/train.py b/sequence_tagging_for_ner/train.py
index aa7f69087d..11a7e5890a 100644
--- a/sequence_tagging_for_ner/train.py
+++ b/sequence_tagging_for_ner/train.py
@@ -5,6 +5,8 @@
 from utils import *
 from network_conf import *
 
+from paddle.v2.layer import parse_network
+
 
 def main(train_data_file,
         test_data_file,
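Note on the restructured stacking (illustration only, not part of the diff): the inlined loop replaces the removed stacked_rnn helper, both directions now share the first __hidden00__ projection, and the reverse expression alternates the recurrence direction between consecutive layers of a stack, with the "bwd" stack starting in the opposite direction from the "fwd" stack. Only the topmost hidden/rnn pair of each stack is appended to fea, which is why rnn_fea mixes exactly four inputs with [hidden_para_attr, rnn_para_attr] * 2. A minimal plain-Python sketch of how the loop unrolls for the new default stack_num=2 (no Paddle calls, layer construction replaced by name strings):

# Illustrative sketch only: mirrors the loop added in network_conf.py to show
# which layers are created and in which direction each recurrence runs.
stack_num = 2  # new default in ner_net

fea = []
for direction in ["fwd", "bwd"]:
    for i in range(stack_num):
        # same expression as in the diff: direction alternates within a stack
        reverse = bool(i % 2) if direction == "fwd" else not i % 2
        hidden = "__hidden%02d_%s__" % (i, direction) if i else "__hidden00__"
        rnn = "__rnn%02d_%s__" % (i, direction)
        print("%s -> %s (reverse=%s)" % (hidden, rnn, reverse))
    # only the topmost hidden/rnn pair of each stack feeds rnn_fea
    fea += [hidden, rnn]

# Prints:
# __hidden00__ -> __rnn00_fwd__ (reverse=False)
# __hidden01_fwd__ -> __rnn01_fwd__ (reverse=True)
# __hidden00__ -> __rnn00_bwd__ (reverse=True)
# __hidden01_bwd__ -> __rnn01_bwd__ (reverse=False)
# fea == ["__hidden01_fwd__", "__rnn01_fwd__", "__hidden01_bwd__", "__rnn01_bwd__"]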