From 44ae44da49f206af56d02816aff8e9b2920d0bf8 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Mon, 14 Aug 2017 09:01:22 +0800
Subject: [PATCH 1/9] add configuration helpers.

---
 python/paddle/trainer/config_parser.py | 16 ++
 .../paddle/trainer_config_helpers/layers.py | 34 ++-
 .../tests/configs/file_list.sh | 2 +-
 .../test_cross_entropy_over_beam.protostr | 208 ++++++++++++++++++
 .../configs/test_cross_entropy_over_beam.py | 39 ++++
 5 files changed, 295 insertions(+), 4 deletions(-)
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index da99e5bd53458..a24299787bfd6 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1602,6 +1602,21 @@ def __init__(self, name, inputs, softmax_selfnorm_alpha=0.1, **xargs):
         self.config.softmax_selfnorm_alpha = softmax_selfnorm_alpha
 
 
+@config_layer('cross_entropy_over_beam')
+class CrossEntropyOverBeamLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        config_assert(len(inputs) % 3 == 0, "Error input numbers.")
+        super(CrossEntropyOverBeamLayer, self).__init__(
+            name, 'cross_entropy_over_beam', 0, inputs, **xargs)
+        input_num = len(inputs) / 3
+        for i in range(input_num):
+            input_layer = self.get_input_layer(i * 2)
+            config_assert(
+                input_layer.size == 1, "Inputs for this layer are made up of "
+                "several pairs and the first one in a pair is scores for "
+                "all the candidates, so its size should be equal to 1.")
+
+
 @config_layer('fc')
 class FCLayer(LayerBase):
     layer_type = 'fc'
@@ -2249,6 +2264,7 @@ def init(cls, name, inputs, device=None, coeff=1.):
 
 
 define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
+define_cost('CrossEntropyOverBeamCostLayer', 'cross_entropy_over_beam')
 define_cost('RankingCost', 'rank-cost')
 define_cost('AucValidation', 'auc-validation')
 define_cost('PnpairValidation', 'pnpair-validation')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 1bc55c8696015..2b01b6ad4d790 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import functools
 import collections
 import inspect
@@ -104,6 +103,7 @@
     'nce_layer',
     'cross_entropy_with_selfnorm',
     'cross_entropy',
+    'cross_entropy_over_beam',
     'multi_binary_label_cross_entropy',
     'sum_cost',
     'rank_cost',
@@ -219,6 +219,7 @@ class LayerType(object):
     HUBER = 'huber'
     CROSS_ENTROPY = 'multi-class-cross-entropy'
     CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
+    CROSS_ENTROPY_OVER_BEAM = 'cross_entropy_over_beam'
     SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
     MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
     SUM_COST = 'sum_cost'
@@ -4028,8 +4029,12 @@ def __cost_input__(input, label, weight=None):
     """
     inputs and parents for cost layers.
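    `input` and `label` may each be a single LayerOutput or a list of them;
    both are normalized into lists below, so that one cost layer can take
    multiple score/label inputs (as cross_entropy_over_beam does).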
""" - ipts = [Input(input.name), Input(label.name)] - parents = [input, label] + if isinstance(input, LayerOutput): + input = [input] + if isinstance(label, LayerOutput): + label = [label] + ipts = [Input(ipt.name) for ipt in (input + label)] + parents = [ipt for ipt in (input + label)] if weight is not None: assert weight.size == 1 ipts.append(Input(weight.name)) @@ -5692,6 +5697,29 @@ def multi_binary_label_cross_entropy(input, size=1) +@wrap_name_default() +@layer_support() +def cross_entropy_over_beam(input, label, name=None, coeff=1.0, weight=None): + """ + TODO(caoying) add comments. + """ + + assert len(input) / 2 == len(label), "Error input numbers." + for i in range(0, len(input), 2): + assert (input[i].size == 1), ( + "Inputs for this layer are made up of " + "several pairs and the first one in a pair is scores for " + "all the candidates, so its size should be equal to 1.") + + ipts, parents = __cost_input__(input, label, weight) + Layer( + name=name, + type=LayerType.CROSS_ENTROPY_OVER_BEAM, + inputs=ipts, + coeff=coeff) + return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1) + + @wrap_name_default() @layer_support() def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index a61beb871ad06..130e6332a7cf5 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -8,6 +8,6 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer -test_kmax_seq_socre_layer test_seq_select_layers) +test_kmax_seq_socre_layer test_seq_select_layers test_cross_entropy_over_beam) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr new file mode 100644 index 0000000000000..e44478ec2ba1f --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr @@ -0,0 +1,208 @@ +type: "nn" +layers { + name: "sentence_states" + type: "data" + size: 32 + active_type: "" +} +layers { + name: "sentence_scores" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__kmax_sequence_score_layer_0__" + type: "kmax_seq_score" + active_type: "" + inputs { + input_layer_name: "sentence_scores" + } + beam_size: 5 +} +layers { + name: "__sub_nested_seq_layer_0__" + type: "sub_nested_seq" + size: 32 + active_type: "" + inputs { + input_layer_name: "sentence_states" + } + inputs { + input_layer_name: "__kmax_sequence_score_layer_0__" + } +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 1 + active_type: "" + inputs { + input_layer_name: "__sub_nested_seq_layer_0__" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "__kmax_sequence_score_layer_1__" + type: "kmax_seq_score" + active_type: "" + inputs { + input_layer_name: "sentence_scores" + } + beam_size: 5 +} +layers { + name: "__seq_slice_layer_0__" + type: "seq_slice" + size: 32 + active_type: "" + inputs { + 
input_layer_name: "__sub_nested_seq_layer_0__" + } + inputs { + input_layer_name: "__kmax_sequence_score_layer_1__" + } + select_first: true +} +layers { + name: "__fc_layer_1__" + type: "fc" + size: 1 + active_type: "" + inputs { + input_layer_name: "__seq_slice_layer_0__" + input_parameter_name: "___fc_layer_1__.w0" + } + bias_parameter_name: "___fc_layer_1__.wbias" +} +layers { + name: "__kmax_sequence_score_layer_2__" + type: "kmax_seq_score" + active_type: "" + inputs { + input_layer_name: "__fc_layer_1__" + } + beam_size: 5 +} +layers { + name: "sentences_ids" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "start_ids" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "end_ids" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__cross_entropy_over_beam_0__" + type: "cross_entropy_over_beam" + active_type: "" + inputs { + input_layer_name: "sentence_scores" + } + inputs { + input_layer_name: "__kmax_sequence_score_layer_0__" + } + inputs { + input_layer_name: "__fc_layer_0__" + } + inputs { + input_layer_name: "__kmax_sequence_score_layer_1__" + } + inputs { + input_layer_name: "__fc_layer_1__" + } + inputs { + input_layer_name: "__kmax_sequence_score_layer_2__" + } + inputs { + input_layer_name: "sentences_ids" + } + inputs { + input_layer_name: "start_ids" + } + inputs { + input_layer_name: "end_ids" + } + coeff: 1.0 +} +parameters { + name: "___fc_layer_0__.w0" + size: 32 + initial_mean: 0.0 + initial_std: 0.176776695297 + dims: 32 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_1__.w0" + size: 32 + initial_mean: 0.0 + initial_std: 0.176776695297 + dims: 32 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_1__.wbias" + size: 1 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "sentence_scores" +input_layer_names: "sentence_states" +input_layer_names: "sentences_ids" +input_layer_names: "start_ids" +input_layer_names: "end_ids" +output_layer_names: "__cross_entropy_over_beam_0__" +sub_models { + name: "root" + layer_names: "sentence_states" + layer_names: "sentence_scores" + layer_names: "__kmax_sequence_score_layer_0__" + layer_names: "__sub_nested_seq_layer_0__" + layer_names: "__fc_layer_0__" + layer_names: "__kmax_sequence_score_layer_1__" + layer_names: "__seq_slice_layer_0__" + layer_names: "__fc_layer_1__" + layer_names: "__kmax_sequence_score_layer_2__" + layer_names: "sentences_ids" + layer_names: "start_ids" + layer_names: "end_ids" + layer_names: "__cross_entropy_over_beam_0__" + input_layer_names: "sentence_scores" + input_layer_names: "sentence_states" + input_layer_names: "sentences_ids" + input_layer_names: "start_ids" + input_layer_names: "end_ids" + output_layer_names: "__cross_entropy_over_beam_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py new file mode 100644 index 0000000000000..edc2d32fca1c9 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +#coding=utf-8 + +from paddle.trainer_config_helpers import * +beam_size = 5 + +# the first 
beam expansion. +sentence_states = data_layer(name="sentence_states", size=32) +sentence_scores = data_layer(name="sentence_scores", size=1) +topk_sentence_ids = kmax_sequence_score_layer( + input=sentence_scores, beam_size=beam_size) + +# the second beam expansion. +topk_sen = sub_nested_seq_layer( + input=sentence_states, selected_indices=topk_sentence_ids) +start_pos_scores = fc_layer(input=topk_sen, size=1, act=LinearActivation()) +topk_start_pos_ids = kmax_sequence_score_layer( + input=sentence_scores, beam_size=beam_size) + +# the final beam expansion. +topk_start_spans = seq_slice_layer( + input=topk_sen, starts=topk_start_pos_ids, ends=None) +end_pos_scores = fc_layer( + input=topk_start_spans, size=1, act=LinearActivation()) +topk_end_pos_ids = kmax_sequence_score_layer( + input=end_pos_scores, beam_size=beam_size) + +# define the cost +sentence_idx = data_layer(name="sentences_ids", size=1) +start_idx = data_layer(name="start_ids", size=1) +end_idx = data_layer(name="end_ids", size=1) +cost = cross_entropy_over_beam( + input=[ + sentence_scores, topk_sentence_ids, start_pos_scores, + topk_start_pos_ids, end_pos_scores, topk_end_pos_ids + ], + label=[sentence_idx, start_idx, end_idx]) + +outputs(cost) From 05e8a26b4bb093f9dccb9aeb533a5851aaed09b8 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 14 Aug 2017 10:33:28 +0800 Subject: [PATCH 2/9] add unittest. --- .../gserver/layers/CrossEntropyOverBeam.cpp | 35 +++++++ paddle/gserver/layers/CrossEntropyOverBeam.h | 31 ++++++ paddle/gserver/tests/CMakeLists.txt | 6 ++ paddle/gserver/tests/LayerGradUtil.cpp | 25 +++-- paddle/gserver/tests/LayerGradUtil.h | 18 ++++ .../tests/test_CrossEntropyOverBeamGrad.cpp | 94 +++++++++++++++++++ 6 files changed, 201 insertions(+), 8 deletions(-) create mode 100644 paddle/gserver/layers/CrossEntropyOverBeam.cpp create mode 100644 paddle/gserver/layers/CrossEntropyOverBeam.h create mode 100644 paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp new file mode 100644 index 0000000000000..8b6223ec6a826 --- /dev/null +++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "CrossEntropyOverBeam.h" + +namespace paddle { + +REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam); + +bool CrossEntropyOverBeam::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + setNeedSequenceInfo(false); + + return true; +} + +void CrossEntropyOverBeam::forward(PassType passType) {} + +void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {} + +} // namespace paddle diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h new file mode 100644 index 0000000000000..3106f9858b751 --- /dev/null +++ b/paddle/gserver/layers/CrossEntropyOverBeam.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "CrossEntropyOverBeam.h" +#include "Layer.h" + +namespace paddle { + +class CrossEntropyOverBeam : public Layer { +public: + explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {} + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index c2a2993620492..24df7e7220b0d 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -34,6 +34,12 @@ add_unittest_without_exec(test_CRFLayerGrad add_test(NAME test_CRFLayerGrad COMMAND test_CRFLayerGrad) +################ test_CrossEntropyOverBeam #################### +add_unittest_without_exec(test_CrossEntropyOverBeam + test_CrossEntropyOverBeamGrad.cpp + LayerGradUtil.cpp) +add_test(NAME test_CrossEntropyOverBeam + COMMAND test_CrossEntropyOverBeam) add_unittest_without_exec(test_ActivationGrad test_ActivationGrad.cpp diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index fd9cfa1dc7a90..a38880e14cdfc 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -388,14 +388,23 @@ void initDataLayer(TestConfig testConf, data.grad->zeroMem(); break; case INPUT_SELF_DEFINE_DATA: { - size_t height = testConf.inputDefs[i].selfDefinedData->getHeight(); - size_t width = testConf.inputDefs[i].selfDefinedData->getWidth(); - CHECK_GT(static_cast(height), 0); - CHECK_GT(static_cast(width), 0); - data.value = Matrix::create(height, width, false, useGpu); - data.grad = Matrix::create(height, width, false, useGpu); - data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData); - data.grad->zeroMem(); + if (testConf.inputDefs[i].ids.size()) { + data.ids = IVector::create(testConf.inputDefs[i].ids.size(), useGpu); + data.ids->copyFrom(testConf.inputDefs[i].ids.data(), + testConf.inputDefs[i].ids.size()); + } else if (testConf.inputDefs[i].selfDefinedData) { + size_t height = 
testConf.inputDefs[i].selfDefinedData->getHeight(); + size_t width = testConf.inputDefs[i].selfDefinedData->getWidth(); + CHECK_GT(static_cast(height), 0); + CHECK_GT(static_cast(width), 0); + data.value = Matrix::create(height, width, false, useGpu); + data.grad = Matrix::create(height, width, false, useGpu); + data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData); + data.grad->zeroMem(); + } else { + LOG(FATAL) << "No self-defined data are given."; + return; + } const std::vector& labelSeqStartPositions = testConf.inputDefs[i].labelSeqStartPositions; diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 5debedf5ef6a3..a35edd2b5e21a 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -68,6 +68,7 @@ struct InputDef { std::vector labelInitValue; std::vector labelSeqStartPositions; std::vector labelSubSeqStartPositions; + std::vector ids; MatrixPtr selfDefinedData; InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { @@ -95,6 +96,23 @@ struct InputDef { isStatic = false; } + InputDef(InputType type, + string nameIn, + std::vector ids, + std::vector selfDefinedSeqStartPos = {}, + std::vector selfDefinedSubSeqStartPos = {}) + : labelSeqStartPositions(selfDefinedSeqStartPos), + labelSubSeqStartPositions(selfDefinedSubSeqStartPos), + ids(ids) { + selfDefinedData = nullptr; + inputType = type; + name = nameIn; + dim = 0; + sparse = {""}; + paraSize = 0; + isStatic = false; + } + InputDef(InputType type, string nameIn, size_t dimIn, diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp new file mode 100644 index 0000000000000..54daba3656ef4 --- /dev/null +++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include "ModelConfig.pb.h" +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/trainer/Trainer.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +struct SingleBeamExpansion { + vector seqStartPos; + vector subSeqStartPos; + + vector candidateScores; + // TODO(caoying): store this into Argument.ids + vector selectedIndices; + vector groundTruth; +}; + +void genRandomBeamExpansion(size_t expansionCount, + vector& beamExpansions) { + beamExpansions.clear(); +} + +void testCrossEntropyOverBeam() { + const size_t expansionCount = 3; + vector beams; + genRandomBeamExpansion(expansionCount, beams); + + for (size_t i = 0; i < beams.size(); ++i) { + const SingleBeamExpansion& beam = beams[i]; + // create scores for all the candidates + MatrixPtr candidateScorePtr = + Matrix::create(beam.candidateScores.size(), 1, false, false); + candidateScorePtr->copyFrom(candidateScores.data(), candidateScores.size()); + + ostringstream paramName; + paramName << "candidate_scores_" << i; + beam.subSeqStartPos.size() + ? config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + ostr.str(), + candidateScorePtr, + beam.seqStartPos, + beam.subSeqStartPos}) + : config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + ostr.str(), + candidateScorePtr, + beam.seqStartPos}); + // create indices for the selected candidates + + // create the ground truth + } +} + +TestConfig config; +config.layerConfig.set_type("cross_entropy_over_beam"); + +// testLayerGrad( +// config, "cross_entropy_over_beam", seqNum, false, useGpu, false); +} + +TEST(Layer, CrossEntropyOverBeam) { + for (bool useGpu : {false, true}) testCrossEntropyOverBeam(useGpu); +} + +int main(int argc, char** argv) { + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From a037b099f7f4bf8370e882f397bd4c691b0e0986 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 14 Aug 2017 15:49:48 +0800 Subject: [PATCH 3/9] finish unittest. --- .../gserver/layers/CrossEntropyOverBeam.cpp | 1 + .../tests/test_CrossEntropyOverBeamGrad.cpp | 218 +++++++++++++++--- 2 files changed, 191 insertions(+), 28 deletions(-) diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp index 8b6223ec6a826..88d80aa83af5c 100644 --- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp +++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp @@ -22,6 +22,7 @@ bool CrossEntropyOverBeam::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { /* Initialize the basic parent class */ Layer::init(layerMap, parameterMap); + CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number."; setNeedSequenceInfo(false); diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp index e9ecebcfe5204..a5f06c15dc480 100644 --- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp +++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include @@ -27,6 +28,10 @@ using namespace paddle; // NOLINT DECLARE_int32(gpu_id); DECLARE_bool(thread_local_rand_use_global_seed); +const size_t MAX_SEQ_NUM = 10; +const size_t MAX_SEQ_LEN = 27; +const size_t MAX_BEAM_SIZE = 10; + struct SingleBeamExpansion { vector seqStartPos; vector subSeqStartPos; @@ -34,37 +39,195 @@ struct SingleBeamExpansion { // TODO(caoying): store this into Argument.ids vector selectedIndices; + vector groundTruth; - vector labelSeqStartPos; + vector inBeam; + vector rowIdxInBeam; }; -void genCandidateScores(bool hasSubSeq, - vector& scores, +void genRand(real* numbers, size_t n) { + default_random_engine generator; + uniform_real_distribution distribution(0.0, 1.0); + for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator); +} + +vector randSampling(real range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0.); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + sort(begin(num), end(num)); + return num; +} + +void genCandidateScores(bool hasSubseq, + size_t beamSize, + SingleBeamExpansion& prevBeam, + SingleBeamExpansion& curBeam) { + vector& seqStartPos = curBeam.seqStartPos; + seqStartPos.resize(1, 0); + vector& subSeqStartPos = curBeam.subSeqStartPos; + subSeqStartPos.resize(1, 0); + + srand((size_t)(time(NULL))); + // srand(1); + if (prevBeam.selectedIndices.size()) { + if (prevBeam.subSeqStartPos.size() > 1) { + int seqIdx = 1; + // samples in previous beam are nested sequences. + for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) { + for (size_t j = 0; j < beamSize; ++j) { + if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break; + for (size_t k = 0; k < beamSize; ++k) + subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) + + subSeqStartPos.back()); + } + if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) { + seqStartPos.push_back(subSeqStartPos.back()); + seqIdx++; + } + } + } else { + // samples in previous beam are sequences. + for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) { + if (i && i % beamSize == 0) { + seqStartPos.push_back(subSeqStartPos.back()); + if (i == prevBeam.selectedIndices.size()) break; + } + if (prevBeam.selectedIndices[i] == -1.) continue; + subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + } + } + } else { + // the first beam expansion + int seqNum = 1 + (rand() % MAX_SEQ_NUM); + for (int i = 0; i < seqNum; ++i) { + if (hasSubseq) { + for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j) + subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + seqStartPos.push_back(subSeqStartPos.back()); + } else { + seqStartPos.push_back(seqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + } + } + } + + size_t totalSeqNum = hasSubseq ? subSeqStartPos.back() : seqStartPos.back(); + curBeam.candidateScores.resize(totalSeqNum, 0.); + genRand(curBeam.candidateScores.data(), totalSeqNum); +} + +void genSelectedIndices(size_t beamSize, vector& seqStartPos, - vector& subSeqStartPos) {} - -void genSelectedIndicesAndGroundtruth(size_t beamSize, - vector& seqStartPos, - vector& selectedIndices) {} - -SingleBeamExpansion genOneBeam(size_t beamSize, bool hasSubSeq) { - SingleBeamExpansion beam; - genCandidateScores( - hasSubSeq, beam.candidateScores, beam.seqStartPos, beam.subSeqStartPos); - genSelectedIndicesAndGroundtruth( - beamSize, - hasSubSeq ? 
beam.subSeqStartPos : beam.seqStartPos, - beam.selectedIndices); - return beam; + vector& selectedIndices) { + size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1); + selectedIndices.resize(selectedIdsCount, -1.); + + for (size_t i = 0; i < seqStartPos.size() - 1; ++i) { + int seqLen = seqStartPos[i + 1] - seqStartPos[i]; + int n = min(seqLen, static_cast(beamSize)); + vector ids = randSampling(seqLen, n); + memcpy(selectedIndices.data() + i * beamSize, + ids.data(), + sizeof(real) * ids.size()); + } +} + +void genGroundTruth(vector& beamExpansions, + size_t beamSize) { + size_t seqNum = beamExpansions[1].seqStartPos.size() - 1; + for (size_t i = 2; i < beamExpansions.size(); ++i) + CHECK_EQ(seqNum, beamExpansions[i - 1].seqStartPos.size() - 1); + + // srand(1); + srand((size_t)(time(NULL))); + + // initialize the first beam. + SingleBeamExpansion& beam = beamExpansions[1]; + beam.groundTruth.resize(seqNum, 0); + beam.inBeam.resize(seqNum, 0); + beam.rowIdxInBeam.resize(seqNum, -1); + + auto begPos = beam.selectedIndices.begin(); + for (size_t i = 0; i < seqNum; ++i) { + int seqLen = beam.seqStartPos[i + 1] - beam.seqStartPos[i]; + int label = rand() % seqLen; + auto endPos = begPos + beamSize; + beam.groundTruth[i] = label; + if (find(begPos, endPos, real(label)) != endPos) beam.inBeam[i] = 1; + begPos = endPos; + beam.rowIdxInBeam[i] = i; + } + + // iterate over each beam expansions + for (size_t i = 2; i < beamExpansions.size(); ++i) { + SingleBeamExpansion& curBeam = beamExpansions[i]; + SingleBeamExpansion& prevBeam = beamExpansions[i - 1]; + + curBeam.groundTruth.resize(seqNum, 0); + curBeam.inBeam.resize(seqNum, 0); + curBeam.rowIdxInBeam.resize(seqNum, -1); + + // iterate over each sequence + for (size_t j = 0; j < seqNum; ++j) { + if (prevBeam.inBeam[j]) { + // gold sequence falls in the beam in previous search. + + auto begPos = prevBeam.selectedIndices.begin(); + auto endPos = begPos + prevBeam.rowIdxInBeam[j] * beamSize; + size_t totalExpansion = + prevBeam.rowIdxInBeam[j] * beamSize - count(begPos, endPos, -1.); + curBeam.rowIdxInBeam[j] = totalExpansion + prevBeam.groundTruth[j]; + + CHECK_LE(curBeam.rowIdxInBeam[j] + 1, + curBeam.subSeqStartPos.size() - 1); + int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]]; + int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1]; + CHECK_GT(size_t(end), size_t(start)); + int label = rand() % (end - start); + + curBeam.groundTruth[j] = label; + auto findBeg = curBeam.selectedIndices.begin() + + curBeam.rowIdxInBeam[j] * beamSize; + auto findEnd = findBeg + beamSize; + if (find(findBeg, findEnd, real(label)) != findEnd) + curBeam.inBeam[j] = 1; + } else { + // in previous search, gold sequence has fallen off the beam, + // the beam search stops, here use -1 as a dummy label. + // It will not used in calculation the cost. + beamExpansions[i].groundTruth[j] = -1; + } + } + } +} + +void genOneBeam(size_t beamSize, + bool hasSubseq, + SingleBeamExpansion& prevBeam, + SingleBeamExpansion& curBeam) { + genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam); + genSelectedIndices(beamSize, + hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos, + curBeam.selectedIndices); } void genRandomBeamExpansion(size_t expansionCount, size_t beamSize, vector& beamExpansions) { beamExpansions.clear(); - for (size_t i = 0; i < expansionCount; ++i) { - beamExpansions.emplace_back(genOneBeam(beamSize, i)); - } + beamExpansions.resize(expansionCount + 1); + + // beamExpansions[0] is reserved. 
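+  // expansions are indexed from 1 so that expansion i can always read its
+  // predecessor at i - 1; the default-constructed beamExpansions[0] plays
+  // the role of the empty state before the first search step.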
+ for (size_t i = 1; i <= expansionCount; ++i) + genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]); + genGroundTruth(beamExpansions, beamSize); } void testCrossEntropyOverBeam(bool useGpu) { @@ -72,12 +235,12 @@ void testCrossEntropyOverBeam(bool useGpu) { config.layerConfig.set_type("cross_entropy_over_beam"); const size_t expansionCount = 3; - const size_t beamSize = 3; + const size_t beamSize = MAX_BEAM_SIZE; vector beams; genRandomBeamExpansion(expansionCount, beamSize, beams); size_t seqNum = 0; - for (size_t i = 0; i < beams.size(); ++i) { + for (size_t i = 1; i < beams.size(); ++i) { const SingleBeamExpansion& beam = beams[i]; // create scores for all the candidates MatrixPtr candidateScorePtr = @@ -88,7 +251,7 @@ void testCrossEntropyOverBeam(bool useGpu) { ostringstream paramName; paramName << "candidate_scores_" << i; - if (beam.subSeqStartPos.size()) { + if (beam.subSeqStartPos.size() > 1) { seqNum = beam.subSeqStartPos.size() - 1; config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, paramName.str(), @@ -118,10 +281,9 @@ void testCrossEntropyOverBeam(bool useGpu) { // create the ground truth paramName.clear(); paramName << "label_" << i; - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - paramName.str(), - beam.groundTruth, - beam.labelSeqStartPos}); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth}); + config.layerConfig.add_inputs(); } testLayerGrad( From 8f4ca2d12fffe38d5adff0ad74db6ba1bdc0d223 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Wed, 16 Aug 2017 15:34:02 +0800 Subject: [PATCH 4/9] add implementations. --- .../gserver/layers/CrossEntropyOverBeam.cpp | 344 +++++++++++++++++- paddle/gserver/layers/CrossEntropyOverBeam.h | 98 +++++ .../tests/test_CrossEntropyOverBeamGrad.cpp | 166 ++++++--- 3 files changed, 549 insertions(+), 59 deletions(-) diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp index 88d80aa83af5c..09258fb305990 100644 --- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp +++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp @@ -16,6 +16,168 @@ limitations under the License. */ namespace paddle { +void CostForOneSequence::calValidExpandStep() { + validExpansionCount_ = 0; + goldAsExtraPath_ = true; + + for (size_t i = 0; i < beams_->expansionCount; ++i) { + real gold = static_cast(beams_->gold[i]); + if (i) { + real* start = beams_->candidateIds[i - 1]->getData(); + goldRowIds_[i] = std::count_if( + start, + start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1], + [](const real& val) { return val != -1.; }); + } else + goldRowIds_[i] = 0; + + real* start = + beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_; + real* findEnd = std::find(start, start + beamSize_, gold); + validExpansionCount_++; + + if (start + beamSize_ == findEnd) return; + goldColIds_[i] = findEnd - start; + } + + if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false; +} + +size_t CostForOneSequence::initLastExpansion() { + int beamId = validExpansionCount_ - 1; + const MatrixPtr candidates = beams_->candidateIds[beamId]; + size_t height = candidates->getHeight(); + + /* initialization the last expansion. */ + size_t pathCount = std::count_if(candidates->getData(), + candidates->getData() + height * beamSize_, + [](const real& val) { return val != -1; }); + /* + * if the gold sequence falls off the beam during search, + * add the gold sequence as the last path into all expanded paths. 
+ */ + if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++; + + pathRowIdsInEachBeam_.clear(); + pathRowIdsInEachBeam_.resize(validExpansionCount_, + std::vector(pathCount, 0)); + parentIdsInBeam_.clear(); + parentIdsInBeam_.resize(pathCount, 0); + + if (goldAsExtraPath_) { + /* add gold sequence into the total expansion. */ + pathRowIdsInEachBeam_[beamId].back() = + beams_->gold[beamId] + + getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]); + parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1]; + } else { + size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId]; + goldIdsInFinalExpansion_ = + std::count_if(candidates->getData(), + candidates->getData() + goldOffset, + [](const real& val) { return val != -1.; }); + } + + /* + * TODO(caoying): fix this, store the indices of selected candidate + * paths into Argument.ids + */ + real* ids = candidates->getData(); + size_t curIdx = 0; + for (size_t i = 0; i < height; ++i) { + int basePos = getSeqStartPos(beamId, i); + for (size_t j = 0; j < beamSize_; ++j) { + int id = ids[i * beamSize_ + j]; + if (id == -1) continue; + pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos; + parentIdsInBeam_[curIdx++] = i; + } + } + return pathCount; +} + +void CostForOneSequence::constructTotalExpansion() { + /* + * construct the entire expanded beam by begining with the last search + * in which gold falls off the beam. + */ + size_t totalPathCount = initLastExpansion(); + + for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) { + const MatrixPtr candidates = beams_->candidateIds[beamId]; + real* ids = candidates->getData(); + + int lastParentIdInBeam = -1; + int basePos = -1; + for (size_t i = 0; + i < (goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount); + ++i) { + int id = ids[parentIdsInBeam_[i]]; + int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot; + if (parentIdsInBeam_[i] != lastParentIdInBeam) + basePos = getSeqStartPos(beamId, parentRowId); + + pathRowIdsInEachBeam_[beamId][i] = id + basePos; + lastParentIdInBeam = parentIdsInBeam_[i]; + parentIdsInBeam_[i] = parentRowId; + + if (goldAsExtraPath_) + pathRowIdsInEachBeam_[beamId][totalPathCount - 1] = + beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]); + } + } +} + +real CostForOneSequence::globallyNormalizedScore() { + expandedPathScores_.resize(validExpansionCount_); + + Matrix::resizeOrCreate( + softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false); + softmaxOut_->zero(); + MatrixPtr tmp = Matrix::create( + softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false); + + for (size_t i = 0; i < validExpansionCount_; ++i) { + Matrix::resizeOrCreate(expandedPathScores_[i], + pathRowIdsInEachBeam_[i].size(), + 1, + false, + false); + IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(), + pathRowIdsInEachBeam_[i].size(), + false); + expandedPathScores_[i]->selectRows(*(beams_->scores[i]), *rowIds); + tmp->add(*expandedPathScores_[i]); + } + + softmaxOut_->softmax(*softmaxOut_); + return -std::log(softmaxOut_->getData()[goldIdsInFinalExpansion_]); +} + +real CostForOneSequence::forward() { + calValidExpandStep(); + constructTotalExpansion(); + return globallyNormalizedScore(); +} + +void CostForOneSequence::backward() { + softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.; + MatrixPtr tmp = Matrix::create( + softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false); + + for (size_t i = 0; i < validExpansionCount_; ++i) { + IVectorPtr rowIds = 
IVector::create(pathRowIdsInEachBeam_[i].data(), + pathRowIdsInEachBeam_[i].size(), + false); + /* + beams_->scoreGrad[i] has been intialized outside this class, this + class only keeps a pointer pointing to the original input gradients, + so here does not need to allocate or initalize the memory. + */ + tmp->addToRows(*beams_->scoreGrad[i], *rowIds); + } +} + REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam); bool CrossEntropyOverBeam::init(const LayerMap& layerMap, @@ -24,13 +186,189 @@ bool CrossEntropyOverBeam::init(const LayerMap& layerMap, Layer::init(layerMap, parameterMap); CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number."; - setNeedSequenceInfo(false); + beamExpanCount_ = inputLayers_.size() / 3; + + candidateScores_.resize(beamExpanCount_); + candidateScoreGrad_.resize(beamExpanCount_); + candidateInBeam_.resize(beamExpanCount_); + goldSequence_.resize(beamExpanCount_); + gradToInputs_.resize(beamExpanCount_); + + setNeedSequenceInfo(false); return true; } -void CrossEntropyOverBeam::forward(PassType passType) {} +void CrossEntropyOverBeam::checkInputs() { + batchSize_ = 0; + for (size_t i = 0; i < beamExpanCount_; ++i) { + const Argument& scores = getInput(i * 3); + const Argument& selCandidates = getInput(i * 3 + 1); + const Argument& goldSeq = getInput(i * 3 + 2); + + if (i) { + CHECK(scores.hasSubseq()) << "Beam expansion expect the first one, " + "should be a nested sequence"; + CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_); + CHECK_EQ(scores.getNumSequences(), batchSize_); + CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize()); + } else { + CHECK(scores.hasSeq()) << "The first beam expansion should be a sequence"; + batchSize_ = scores.getNumSequences(); + beamSize_ = getInputValue(i * 3 + 1)->getWidth(); + CHECK_EQ(batchSize_, selCandidates.getBatchSize()); + } + CHECK_EQ(1U, scores.value->getWidth()); + CHECK_EQ(batchSize_, goldSeq.getBatchSize()); + } +} + +void CrossEntropyOverBeam::copyInputsToCpu() { + auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) { + if (dynamic_cast(src.get())) { + Matrix::resizeOrCreate( + trg, src->getHeight(), src->getWidth(), false, false); + trg->copyFrom(*src); + } else { + trg = std::move(src); + } + }; + + auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) { + if (dynamic_cast(src.get())) { + IVector::resizeOrCreate(trg, src->getSize(), false); + trg->copyFrom(*src); + } else { + trg = std::move(src); + } + }; + + beamSplitPos_.clear(); + beamSplitPos_.resize(batchSize_, std::vector(beamExpanCount_, 0)); + for (size_t i = 0; i < beamExpanCount_; ++i) { + copyValue(getInputValue(i * 3), candidateScores_[i]); + copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]); + copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]); + + if (i) { + ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions; + const int* seqStarts = seqInfo->getMutableData(false); + ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions; + const int* subSeqStarts = subSeqInfo->getMutableData(false); + + size_t seqId = 1; + for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1; + ++subSeqId) { + CHECK_LT(seqId, seqInfo->getSize()); + if (subSeqStarts[subSeqId] == seqStarts[seqId]) { + beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i]; + seqId++; + } + beamSplitPos_[seqId - 1][i]++; + } + } else { + for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1; + } + } +} + +void CrossEntropyOverBeam::splitBatchBeams() { + beamCosts_.resize(batchSize_); + 
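+  // one cost computer and one per-sequence BeamExpansion bundle for every
+  // sequence in the batch; BeamExpansion's int constructor sizes its
+  // members to beamExpanCount_.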
beamPerSeq_.resize(batchSize_, beamExpanCount_); + + for (size_t i = 0; i < beamExpanCount_; ++i) { + int* seqStarts = + getInput(i * 3).sequenceStartPositions->getMutableData(false); + + int* subSeqStarts = nullptr; + int maxLen = 0; + if (i) { + subSeqStarts = + getInput(i * 3).subSequenceStartPositions->getMutableData(false); + maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1; + } else + maxLen = getInput(i).sequenceStartPositions->getSize() - 1; + + for (size_t j = 0; j < batchSize_; ++j) { + beamPerSeq_[j].scores[i] = + Matrix::create(candidateScores_[i]->getData() + seqStarts[j], + seqStarts[j + 1] - seqStarts[j], + 1, + false, + false); + beamPerSeq_[j].scoreGrad[i] = + Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j], + seqStarts[j + 1] - seqStarts[j], + 1, + false, + false); + + int offset = j ? beamSplitPos_[j - 1][i] : 0; + int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0); + CHECK_GE(maxLen, offset + height); + beamPerSeq_[j].seqInfo[i] = IVector::create( + (i ? subSeqStarts : seqStarts) + offset, height + 1, false); -void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {} + beamPerSeq_[j].candidateIds[i] = + Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_, + height, + beamSize_, + false, + false); + beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j]; + } + } +} + +void CrossEntropyOverBeam::resizeOutput() { + Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false); + output_.value->zero(); + + for (size_t i = 0; i < beamExpanCount_; ++i) { + MatrixPtr inGrad = getInputGrad(i * 3); + if (dynamic_cast(inGrad.get())) { + Matrix::resizeOrCreate(candidateScoreGrad_[i], + inGrad->getHeight(), + inGrad->getWidth(), + false, + false); + } else + candidateScoreGrad_[i] = std::move(inGrad); + candidateScoreGrad_[i]->zero(); + } +} + +void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) { + for (size_t i = 0; i < beamExpanCount_; ++i) { + if (dynamic_cast(getInputGrad(i * 3).get())) + getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]); + + if (i == copyCount - 1) break; + } +} + +void CrossEntropyOverBeam::forward(PassType passType) { + Layer::forward(passType); + + checkInputs(); + copyInputsToCpu(); + + resizeOutput(); + splitBatchBeams(); + + MatrixPtr outputValue = getOutputValue(); + for (size_t i = 0; i < batchSize_; ++i) { + beamCosts_[i].setData( + std::move(std::make_shared(beamPerSeq_[i])), beamSize_); + outputValue->getData()[i] = beamCosts_[i].forward(); + } +} + +void CrossEntropyOverBeam::backward(const UpdateCallback& callback) { + for (size_t i = 0; i < batchSize_; ++i) { + beamCosts_[i].backward(); + copyGradToGpu(beamCosts_[i].getValidExpansionCount()); + } +} } // namespace paddle diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h index 3106f9858b751..96a5df7dfbe46 100644 --- a/paddle/gserver/layers/CrossEntropyOverBeam.h +++ b/paddle/gserver/layers/CrossEntropyOverBeam.h @@ -19,6 +19,79 @@ limitations under the License. 
*/ namespace paddle { +struct BeamExpansion { + // store the entire beam expansion for a single sequence + std::vector scores; + std::vector seqInfo; + + std::vector candidateIds; + std::vector gold; + + std::vector scoreGrad; + + size_t expansionCount; + + BeamExpansion(int n) { + expansionCount = n; + scores.resize(expansionCount); + seqInfo.resize(expansionCount); + candidateIds.resize(expansionCount); + scoreGrad.resize(expansionCount); + + gold.resize(expansionCount); + }; +}; +typedef std::shared_ptr BeamExpansionPtr; + +class CostForOneSequence { +public: + CostForOneSequence() + : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {} + void setData(const BeamExpansionPtr bPtr, size_t beamSize) { + beams_ = bPtr; + beamSize_ = beamSize; + + expandedPathScores_.clear(); + expandedPathScores_.resize(beams_->expansionCount); + + goldRowIds_.clear(); + goldRowIds_.resize(beams_->expansionCount, 0); + goldColIds_.clear(); + goldColIds_.resize(beams_->expansionCount, -1); + } + size_t getValidExpansionCount() { return validExpansionCount_; } + + real forward(); + void backward(); + +private: + void calValidExpandStep(); + void constructTotalExpansion(); + size_t initLastExpansion(); + real globallyNormalizedScore(); + + int getSeqStartPos(size_t beamId, size_t rowId) { + CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId); + int* starts = beams_->seqInfo[beamId]->getData(); + return starts[rowId] - starts[0]; + }; + + size_t beamSize_; + size_t validExpansionCount_; + bool goldAsExtraPath_; + std::vector goldRowIds_; + std::vector goldColIds_; + + BeamExpansionPtr beams_; + std::vector> pathRowIdsInEachBeam_; + std::vector parentIdsInBeam_; + size_t goldIdsInFinalExpansion_; + + std::vector expandedPathScores_; + + MatrixPtr softmaxOut_; +}; + class CrossEntropyOverBeam : public Layer { public: explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {} @@ -26,6 +99,31 @@ class CrossEntropyOverBeam : public Layer { const ParameterMap& parameterMap) override; void forward(PassType passType) override; void backward(const UpdateCallback& callback) override; + +private: + void checkInputs(); + void copyInputsToCpu(); + void resizeOutput(); + void copyGradToGpu(size_t copyCount); + void splitBatchBeams(); + + size_t beamExpanCount_; + size_t batchSize_; + size_t beamSize_; + + // Currently, this layer only works on CPU, if its inputs is on GPU, + // copy them to CPU memory. + std::vector candidateScores_; + std::vector candidateScoreGrad_; + std::vector candidateInBeam_; + std::vector gradToInputs_; + std::vector goldSequence_; + std::vector> beamSplitPos_; + + // split entire bath of beams into beam per sequnence. + std::vector beamPerSeq_; + // beamCosts_ is used to propagate error in one sequence. 
+ std::vector beamCosts_; }; } // namespace paddle diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp index a5f06c15dc480..506a4281df4f0 100644 --- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp +++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp @@ -28,9 +28,17 @@ using namespace paddle; // NOLINT DECLARE_int32(gpu_id); DECLARE_bool(thread_local_rand_use_global_seed); -const size_t MAX_SEQ_NUM = 10; -const size_t MAX_SEQ_LEN = 27; -const size_t MAX_BEAM_SIZE = 10; +// const size_t MAX_SEQ_NUM = 5; +// const size_t MAX_SEQ_LEN = 10; +// const size_t MAX_BEAM_SIZE = 3; + +const size_t MAX_SEQ_NUM = 23; +const size_t MAX_SEQ_LEN = 50; +const size_t MAX_BEAM_SIZE = 27; + +// const size_t SEED = 1503391792; +// const size_t SEED = 1; +const size_t SEED = (size_t)(time(NULL)); struct SingleBeamExpansion { vector seqStartPos; @@ -43,11 +51,30 @@ struct SingleBeamExpansion { vector groundTruth; vector inBeam; vector rowIdxInBeam; + vector colIdxInBeam; + + void resetGroundTruth(size_t n) { + groundTruth.clear(); + groundTruth.resize(n, -1); + + inBeam.clear(); + inBeam.resize(n, 0); + + rowIdxInBeam.clear(); + rowIdxInBeam.resize(n, -1); + + colIdxInBeam.clear(); + colIdxInBeam.resize(n, -1); + } }; +inline float randFloat() { + return static_cast(rand()) / static_cast(RAND_MAX); +} + void genRand(real* numbers, size_t n) { default_random_engine generator; - uniform_real_distribution distribution(0.0, 1.0); + uniform_real_distribution distribution(0.0, 1.0); for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator); } @@ -72,8 +99,7 @@ void genCandidateScores(bool hasSubseq, vector& subSeqStartPos = curBeam.subSeqStartPos; subSeqStartPos.resize(1, 0); - srand((size_t)(time(NULL))); - // srand(1); + srand(SEED); if (prevBeam.selectedIndices.size()) { if (prevBeam.subSeqStartPos.size() > 1) { int seqIdx = 1; @@ -81,9 +107,8 @@ void genCandidateScores(bool hasSubseq, for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) { for (size_t j = 0; j < beamSize; ++j) { if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break; - for (size_t k = 0; k < beamSize; ++k) - subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) + - subSeqStartPos.back()); + subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) + + subSeqStartPos.back()); } if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) { seqStartPos.push_back(subSeqStartPos.back()); @@ -91,7 +116,6 @@ void genCandidateScores(bool hasSubseq, } } } else { - // samples in previous beam are sequences. for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) { if (i && i % beamSize == 0) { seqStartPos.push_back(subSeqStartPos.back()); @@ -141,27 +165,41 @@ void genSelectedIndices(size_t beamSize, void genGroundTruth(vector& beamExpansions, size_t beamSize) { - size_t seqNum = beamExpansions[1].seqStartPos.size() - 1; + SingleBeamExpansion& beam = beamExpansions[1]; + size_t seqNum = beam.seqStartPos.size() - 1; for (size_t i = 2; i < beamExpansions.size(); ++i) - CHECK_EQ(seqNum, beamExpansions[i - 1].seqStartPos.size() - 1); + CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1); - // srand(1); - srand((size_t)(time(NULL))); + srand(SEED); // initialize the first beam. 
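  // each sequence draws one gold label; with probability 0.5 the label is
  // taken from the selected candidates so that it stays in the beam.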
- SingleBeamExpansion& beam = beamExpansions[1]; - beam.groundTruth.resize(seqNum, 0); - beam.inBeam.resize(seqNum, 0); - beam.rowIdxInBeam.resize(seqNum, -1); - - auto begPos = beam.selectedIndices.begin(); + beam.resetGroundTruth(seqNum); for (size_t i = 0; i < seqNum; ++i) { - int seqLen = beam.seqStartPos[i + 1] - beam.seqStartPos[i]; - int label = rand() % seqLen; - auto endPos = begPos + beamSize; - beam.groundTruth[i] = label; - if (find(begPos, endPos, real(label)) != endPos) beam.inBeam[i] = 1; - begPos = endPos; + if (randFloat() > 0.5) { + // force the randomly generated label falls in the beam by chance 0.5. + // otherwise, when sequence length is relatively long and beam size is + // relatively small, the gold sequences falls off the beam at in + // the first search. + real* begPos = beam.selectedIndices.data() + i * beamSize; + beam.colIdxInBeam[i] = + rand() % count_if(begPos, begPos + beamSize, [](const real& val) { + return val != -1.; + }); + beam.groundTruth[i] = + beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]]; + beam.inBeam[i] = 1; + } else { + int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]); + beam.groundTruth[i] = label; + + real* begPos = beam.selectedIndices.data() + i * beamSize; + real* endPos = begPos + beamSize; + real* lblPos = find(begPos, endPos, real(label)); + if (lblPos != endPos) { + beam.inBeam[i] = 1; + beam.colIdxInBeam[i] = lblPos - begPos; + } + } beam.rowIdxInBeam[i] = i; } @@ -169,22 +207,33 @@ void genGroundTruth(vector& beamExpansions, for (size_t i = 2; i < beamExpansions.size(); ++i) { SingleBeamExpansion& curBeam = beamExpansions[i]; SingleBeamExpansion& prevBeam = beamExpansions[i - 1]; - - curBeam.groundTruth.resize(seqNum, 0); - curBeam.inBeam.resize(seqNum, 0); - curBeam.rowIdxInBeam.resize(seqNum, -1); + curBeam.resetGroundTruth(seqNum); // iterate over each sequence for (size_t j = 0; j < seqNum; ++j) { - if (prevBeam.inBeam[j]) { - // gold sequence falls in the beam in previous search. - - auto begPos = prevBeam.selectedIndices.begin(); - auto endPos = begPos + prevBeam.rowIdxInBeam[j] * beamSize; - size_t totalExpansion = - prevBeam.rowIdxInBeam[j] * beamSize - count(begPos, endPos, -1.); - curBeam.rowIdxInBeam[j] = totalExpansion + prevBeam.groundTruth[j]; - + if (!prevBeam.inBeam[j]) continue; + + // gold sequence falls in the beam in previous search. + real* begPos = prevBeam.selectedIndices.data(); + int offset = + prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j]; + curBeam.rowIdxInBeam[j] = count_if( + begPos, begPos + offset, [](const real& val) { return val != -1.; }); + + if (randFloat() > 0.5) { + // force the randomly generated label falls in the beam by chance 0.5. + // otherwise, when sequence length is relatively long and beam size is + // relatively small, the gold sequences falls off the beam at in + // the first search. 
+ real* start = + curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; + int n = rand() % count_if(start, start + beamSize, [](const real& val) { + return val != -1.; + }); + curBeam.colIdxInBeam[j] = n; + curBeam.groundTruth[j] = *(start + n); + curBeam.inBeam[j] = 1; + } else { CHECK_LE(curBeam.rowIdxInBeam[j] + 1, curBeam.subSeqStartPos.size() - 1); int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]]; @@ -193,16 +242,14 @@ void genGroundTruth(vector& beamExpansions, int label = rand() % (end - start); curBeam.groundTruth[j] = label; - auto findBeg = curBeam.selectedIndices.begin() + - curBeam.rowIdxInBeam[j] * beamSize; - auto findEnd = findBeg + beamSize; - if (find(findBeg, findEnd, real(label)) != findEnd) + real* findBeg = + curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; + real* lblPos = + find(findBeg, findBeg + beamSize, static_cast(label)); + if (lblPos != (findBeg + beamSize)) { curBeam.inBeam[j] = 1; - } else { - // in previous search, gold sequence has fallen off the beam, - // the beam search stops, here use -1 as a dummy label. - // It will not used in calculation the cost. - beamExpansions[i].groundTruth[j] = -1; + curBeam.colIdxInBeam[j] = lblPos - findBeg; + } } } } @@ -230,15 +277,12 @@ void genRandomBeamExpansion(size_t expansionCount, genGroundTruth(beamExpansions, beamSize); } -void testCrossEntropyOverBeam(bool useGpu) { +void testCrossEntropyOverBeam(bool useGpu, + size_t beamSize, + vector& beams) { TestConfig config; config.layerConfig.set_type("cross_entropy_over_beam"); - const size_t expansionCount = 3; - const size_t beamSize = MAX_BEAM_SIZE; - vector beams; - genRandomBeamExpansion(expansionCount, beamSize, beams); - size_t seqNum = 0; for (size_t i = 1; i < beams.size(); ++i) { const SingleBeamExpansion& beam = beams[i]; @@ -291,7 +335,17 @@ void testCrossEntropyOverBeam(bool useGpu) { } TEST(Layer, CrossEntropyOverBeam) { - for (bool useGpu : {false, true}) testCrossEntropyOverBeam(useGpu); + LOG(INFO) << "SEED = " << SEED; + const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE; + LOG(INFO) << "beamSize = " << beamSize; + + // TODO(caoying): test with more beam expansions. + const size_t expansionCount = 3; + vector beams; + genRandomBeamExpansion(expansionCount, beamSize, beams); + + for (bool useGpu : {false, true}) + testCrossEntropyOverBeam(useGpu, beamSize, beams); } int main(int argc, char** argv) { @@ -299,7 +353,7 @@ int main(int argc, char** argv) { hl_start(); hl_init(FLAGS_gpu_id); FLAGS_thread_local_rand_use_global_seed = true; - srand(1); + srand(SEED); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } From 5e59ca7ccc8232b2028cfc8b4cffe19ffc73ba18 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Wed, 23 Aug 2017 10:40:46 +0800 Subject: [PATCH 5/9] fix config helper. 
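Validate the score input of each triple with the right index (the inputs
of this layer come in triples, not pairs), group the three inputs of one
beam expansion into a BeamInput helper on the Python side, and polish the
comments in the layer and its unittest.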
---
 .../gserver/layers/CrossEntropyOverBeam.cpp | 10 ++
 paddle/gserver/layers/CrossEntropyOverBeam.h | 16 ++-
 .../tests/test_CrossEntropyOverBeamGrad.cpp | 22 ++-
 python/paddle/trainer/config_parser.py | 12 +-
 .../paddle/trainer_config_helpers/layers.py | 129 +++++++++++++++---
 .../test_cross_entropy_over_beam.protostr | 17 ++-
 .../configs/test_cross_entropy_over_beam.py | 18 ++-
 7 files changed, 162 insertions(+), 62 deletions(-)

diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
index 09258fb305990..f7736f0ce905f 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -161,7 +161,17 @@ real CostForOneSequence::forward() {
 }
 
 void CostForOneSequence::backward() {
+  /*
+   * when a softmax layer is the output layer and is combined with
+   * cross-entropy as the cost, the derivative with regard to softmax's
+   * input is simply:
+   *
+   *   grad_i = softmax_out_i - target_i,
+   *
+   * and here a hard label is used.
+   */
   softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
+
   MatrixPtr tmp = Matrix::create(
       softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
 
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
index 96a5df7dfbe46..5d0cffee3c159 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -19,8 +19,8 @@ limitations under the License. */
 
 namespace paddle {
 
+/* This struct stores the beams in all search steps for a single sequence. */
 struct BeamExpansion {
-  // store the entire beam expansion for a single sequence
   std::vector scores;
   std::vector seqInfo;
 
@@ -111,8 +111,11 @@ class CrossEntropyOverBeam : public Layer {
   size_t batchSize_;
   size_t beamSize_;
 
-  // Currently, this layer only works on CPU, if its inputs is on GPU,
-  // copy them to CPU memory.
+  /*
+   * the process of constructing beams is not friendly to GPU; currently this
+   * layer only runs on CPU. If any of its inputs is in GPU memory, it is
+   * first copied to CPU memory.
+   */
   std::vector candidateScores_;
   std::vector candidateScoreGrad_;
   std::vector candidateInBeam_;
@@ -120,9 +123,12 @@ class CrossEntropyOverBeam : public Layer {
   std::vector goldSequence_;
   std::vector> beamSplitPos_;
 
-  // split entire bath of beams into beam per sequnence.
+  /*
+   * split the entire batch of beams into per-sequence beams and store the
+   * result in this member.
+   */
   std::vector beamPerSeq_;
-  // beamCosts_ is used to propagate error in one sequence.
+  /* beamCosts_ is used to propagate error in one sequence.
+   */
   std::vector<CostForOneSequence> beamCosts_;
 };

diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
index 506a4281df4f0..538d18cdc3d26 100644
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -28,16 +28,10 @@ using namespace paddle;  // NOLINT
 DECLARE_int32(gpu_id);
 DECLARE_bool(thread_local_rand_use_global_seed);
 
-// const size_t MAX_SEQ_NUM = 5;
-// const size_t MAX_SEQ_LEN = 10;
-// const size_t MAX_BEAM_SIZE = 3;
 
 const size_t MAX_SEQ_NUM = 23;
 const size_t MAX_SEQ_LEN = 50;
 const size_t MAX_BEAM_SIZE = 27;
 
-// const size_t SEED = 1503391792;
-// const size_t SEED = 1;
 const size_t SEED = (size_t)(time(NULL));
 
 struct SingleBeamExpansion {
@@ -176,10 +170,12 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
   beam.resetGroundTruth(seqNum);
   for (size_t i = 0; i < seqNum; ++i) {
     if (randFloat() > 0.5) {
-      // force the randomly generated label falls in the beam by chance 0.5.
-      // otherwise, when sequence length is relatively long and beam size is
-      // relatively small, the gold sequences falls off the beam at in
-      // the first search.
+      /*
+       * Force the randomly generated label to fall in the beam with a
+       * chance of 0.5; otherwise, when the sequence is relatively long and
+       * the beam size is relatively small, the gold sequence falls off the
+       * beam in the very first search.
+       */
       real* begPos = beam.selectedIndices.data() + i * beamSize;
       beam.colIdxInBeam[i] =
           rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
@@ -222,9 +218,7 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
 
       if (randFloat() > 0.5) {
         // force the randomly generated label falls in the beam by chance 0.5.
-        // otherwise, when sequence length is relatively long and beam size is
-        // relatively small, the gold sequences falls off the beam at in
-        // the first search.
+
         real* start =
             curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
         int n = rand() % count_if(start, start + beamSize, [](const real& val) {
@@ -339,7 +333,7 @@ TEST(Layer, CrossEntropyOverBeam) {
   const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
   LOG(INFO) << "beamSize = " << beamSize;
 
-  // TODO(caoying): test with more beam expansions.
+  // TODO(caoying): test with random beam expansions.
   const size_t expansionCount = 3;
   vector<SingleBeamExpansion> beams;
   genRandomBeamExpansion(expansionCount, beamSize, beams);

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 7707ece819c9e..579713546f15e 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1605,16 +1605,16 @@ def __init__(self, name, inputs, softmax_selfnorm_alpha=0.1, **xargs):
 @config_layer('cross_entropy_over_beam')
 class CrossEntropyOverBeamLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
-        config_assert(len(inputs) % 3 == 0, "Error input numbers.")
+        config_assert(len(inputs) % 3 == 0, "The number of inputs must be a multiple of 3.")
         super(CrossEntropyOverBeamLayer, self).__init__(
             name, 'cross_entropy_over_beam', 0, inputs, **xargs)
         input_num = len(inputs) / 3
         for i in range(input_num):
-            input_layer = self.get_input_layer(i * 2)
-            config_assert(
-                input_layer.size == 1, "Inputs for this layer are made up of "
-                "several pairs and the first one in a pair is scores for "
-                "all the candidates, so its size should be equal to 1.")
+            input_layer = self.get_input_layer(i * 3)
+            config_assert(input_layer.size == 1, (
+                "Inputs for this layer are made up of several triples, "
+                "in which the first one is the scores over all candidate "
+                "paths, so its size should be equal to 1."))
 
 
 @config_layer('fc')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b027f84b5d576..053c92d005f7a 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -103,6 +103,7 @@
     'nce_layer',
     'cross_entropy_with_selfnorm',
     'cross_entropy',
+    'BeamInput',
     'cross_entropy_over_beam',
     'multi_binary_label_cross_entropy',
     'sum_cost',
@@ -5681,10 +5682,10 @@ def multi_binary_label_cross_entropy(input,
 
     if input.activation is None or \
             not isinstance(input.activation, SigmoidActivation):
-        logger.log(
-            logging.WARN,
-            "%s is not recommend for multi_binary_label_cross_entropy's activation, "
-            "maybe the sigmoid is better" % repr(input.activation))
+        logger.log(logging.WARN,
+                   ("%s is not a recommended activation for "
+                    "multi_binary_label_cross_entropy, sigmoid is better") %
+                   repr(input.activation))
 
     Layer(
         name=name,
@@ -5699,26 +5700,110 @@ def multi_binary_label_cross_entropy(input,
         size=1)
 
 
+class BeamInput(object):
+    """
+    Define the input for the cross_entropy_over_beam layer.
+
+    A beam is made up of a triple: the first one is the scores over all
+    candidates; the second one is the indices of the top-k selected
+    candidates; the third one is the index of the ground truth, which is
+    also called the gold.
+    """
+
+    def __init__(self, candidate_scores, selected_candidates, gold):
+        assert isinstance(candidate_scores, LayerOutput)
+        self.candidate_scores = candidate_scores
+        assert candidate_scores.size == 1
+
+        assert isinstance(selected_candidates, LayerOutput)
+        self.selected_candidates = selected_candidates
+
+        assert isinstance(gold, LayerOutput)
+        self.gold = gold
+
+
 @wrap_name_default()
 @layer_support()
-def cross_entropy_over_beam(input, label, name=None, coeff=1.0, weight=None):
-    """
-    TODO(caoying) add comments.
+def cross_entropy_over_beam(input, name=None):
     """
+    This layer is used in learning-to-search models, which solve complex
+    joint prediction problems based on learning to search through a
+    problem-defined search space.
 
-    assert len(input) / 2 == len(label), "Error input numbers."
-    for i in range(0, len(input), 2):
-        assert (input[i].size == 1), (
-            "Inputs for this layer are made up of "
-            "several pairs and the first one in a pair is scores for "
-            "all the candidates, so its size should be equal to 1.")
+    Specifically, the learning-to-search process for this layer begins with
+    searching a target sequence from a nested sequence. In the first search
+    step, the top beam-size sequences with the highest scores, the indices
+    of these top-k sequences in the original nested sequence, and the
+    ground truth (also called the gold) together form a triple that makes
+    up the first beam.
 
-    ipts, parents = __cost_input__(input, label, weight)
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY_OVER_BEAM,
-        inputs=ipts,
-        coeff=coeff)
+    Then, several special positions, for example, the start and end
+    positions that define meaningful segments, are searched. In these
+    searches, the top-k positions with the highest scores are selected, and
+    the sub-sequences from the selected start positions to the ends of the
+    sequences (or to a fixed position) are taken as the input of the next
+    search.
+
+    We call the possible top-k results returned by one search a beam. The
+    search process can be repeated for a pre-defined number of turns, which
+    leads to several beam expansions.
+
+    Finally, the layer cross_entropy_over_beam takes all the beam
+    expansions, which contain the candidate targets found along the
+    multi-step search, and calculates the cross entropy over the expanded
+    beams, with all the candidates in the beam serving as the normalization
+    factor.
+
+    Note that if the gold falls off the beam at search step t, the cost is
+    calculated over the beam at step t.
+
+    This cost layer always works together with kmax_sequence_score_layer,
+    sub_nested_seq_layer, and sequence_slice_layer to trim the input to
+    form a sub-search space.
+
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = cross_entropy_over_beam(input=[
+           BeamInput(
+               candidate_scores=beam1_candidates,
+               selected_candidates=beam1_topk,
+               gold=gold1),
+           BeamInput(
+               candidate_scores=beam2_candidates,
+               selected_candidates=beam2_topk,
+               gold=gold2),
+       ])
+
+
+    :param input: input beams for this layer.
+    :type input: BeamInput or a list of BeamInput
+    :param name: the name of this layer. It is optional.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    if isinstance(input, BeamInput):
+        input = [input]
+    else:
+        assert isinstance(input, list), (
+            'input for cross_entropy_over_beam should be a python list '
+            'of BeamInput objects.')
+        for ipt in input:
+            assert isinstance(ipt, BeamInput), (
+                'input for cross_entropy_over_beam '
+                'should be a BeamInput object.')
+
+    ipts = []
+    parents = []
+    for beam in input:
+        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
+        ipts += [
+            beam.candidate_scores.name, beam.selected_candidates.name,
+            beam.gold.name
+        ]
+
+    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
     return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
 
 
@@ -6247,11 +6332,11 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
 @wrap_bias_attr_default()
 def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
     """
-    A layer applies a linear transformation to each element in each row of
-    the input matrix. For each element, the layer first re-scale it and then
+    A layer applies a linear transformation to each element in each row of
+    the input matrix. For each element, the layer first re-scales it and then
     adds a bias to it.
-    This layer is very like the SlopeInterceptLayer, except the scale and
+    This layer is very similar to the SlopeInterceptLayer, except that the scale and
     bias are trainable.
 
     .. math::

diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
index e44478ec2ba1f..c43fc48e22204 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
@@ -114,27 +114,26 @@ layers {
     input_layer_name: "__kmax_sequence_score_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_0__"
+    input_layer_name: "sentences_ids"
   }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_1__"
+    input_layer_name: "__fc_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_1__"
+    input_layer_name: "__kmax_sequence_score_layer_1__"
   }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_2__"
+    input_layer_name: "start_ids"
   }
   inputs {
-    input_layer_name: "sentences_ids"
+    input_layer_name: "__fc_layer_1__"
   }
   inputs {
-    input_layer_name: "start_ids"
+    input_layer_name: "__kmax_sequence_score_layer_2__"
   }
   inputs {
     input_layer_name: "end_ids"
   }
-  coeff: 1.0
 }
 parameters {
   name: "___fc_layer_0__.w0"
@@ -177,8 +176,8 @@ parameters {
   initial_smart: false
 }
 input_layer_names: "sentence_scores"
-input_layer_names: "sentence_states"
 input_layer_names: "sentences_ids"
+input_layer_names: "sentence_states"
 input_layer_names: "start_ids"
 input_layer_names: "end_ids"
 output_layer_names: "__cross_entropy_over_beam_0__"
@@ -198,8 +197,8 @@ sub_models {
   layer_names: "end_ids"
   layer_names: "__cross_entropy_over_beam_0__"
   input_layer_names: "sentence_scores"
-  input_layer_names: "sentence_states"
   input_layer_names: "sentences_ids"
+  input_layer_names: "sentence_states"
   input_layer_names: "start_ids"
   input_layer_names: "end_ids"
   output_layer_names: "__cross_entropy_over_beam_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
index edc2d32fca1c9..240e703dc904e 100644
--- .../gserver/layers/CrossEntropyOverBeam.cpp | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp index f7736f0ce905f..b7c2a44626595 100644 --- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp +++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp @@ -53,8 +53,8 @@ size_t CostForOneSequence::initLastExpansion() { candidates->getData() + height * beamSize_, [](const real& val) { return val != -1; }); /* - * if the gold sequence falls off the beam during search, - * add the gold sequence as the last path into all expanded paths. + * if the gold sequence falls off the beam during search, add the gold + * sequence as the last path into the all expanded candidates. */ if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++; @@ -133,7 +133,7 @@ real CostForOneSequence::globallyNormalizedScore() { Matrix::resizeOrCreate( softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false); - softmaxOut_->zero(); + softmaxOut_->zeroMem(); MatrixPtr tmp = Matrix::create( softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false); @@ -143,6 +143,8 @@ real CostForOneSequence::globallyNormalizedScore() { 1, false, false); + expandedPathScores_[i]->zeroMem(); + IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(), pathRowIdsInEachBeam_[i].size(), false); @@ -217,13 +219,16 @@ void CrossEntropyOverBeam::checkInputs() { const Argument& goldSeq = getInput(i * 3 + 2); if (i) { - CHECK(scores.hasSubseq()) << "Beam expansion expect the first one, " - "should be a nested sequence"; + CHECK(scores.hasSubseq()) << "input " << i << " " + << inputLayers_[i * 3]->getName() + << " should be a nested sequence"; CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_); CHECK_EQ(scores.getNumSequences(), batchSize_); CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize()); } else { - CHECK(scores.hasSeq()) << "The first beam expansion should be a sequence"; + CHECK(scores.hasSeq()) << "input " << i << " " + << inputLayers_[i]->getName() + << " should be a sequence"; batchSize_ = scores.getNumSequences(); beamSize_ = getInputValue(i * 3 + 1)->getWidth(); CHECK_EQ(batchSize_, selCandidates.getBatchSize()); @@ -332,7 +337,7 @@ void CrossEntropyOverBeam::splitBatchBeams() { void CrossEntropyOverBeam::resizeOutput() { Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false); - output_.value->zero(); + output_.value->zeroMem(); for (size_t i = 0; i < beamExpanCount_; ++i) { MatrixPtr inGrad = getInputGrad(i * 3); @@ -344,7 +349,7 @@ void CrossEntropyOverBeam::resizeOutput() { false); } else candidateScoreGrad_[i] = std::move(inGrad); - candidateScoreGrad_[i]->zero(); + candidateScoreGrad_[i]->zeroMem(); } } From 7035bb63e91a2dcf1f91df5e440d2c3e45bdd2e8 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 25 Aug 2017 20:44:04 +0800 Subject: [PATCH 7/9] fix a bug. 
---
 paddle/parameter/Argument.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 2b945de18a4cd..b0e9e740c84e6 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -677,6 +677,7 @@ void Argument::reorganizeSeqInfo(
     const ICpuGpuVectorPtr subSeqStartPos,
     std::vector<std::vector<int>>& reorganizedSeqInfo) {
   CHECK(seqStartPos);
+  reorganizedSeqInfo.clear();
 
   int seqNum = seqStartPos->getSize() - 1;
   int* seqStarts = seqStartPos->getMutableData(false);

From 09e903eb9417745952ced6db532594fd4a759d74 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Tue, 29 Aug 2017 13:44:51 +0800
Subject: [PATCH 8/9] fix v2 infer interface.
---
 paddle/gserver/layers/CrossEntropyOverBeam.cpp | 1 -
 python/paddle/v2/inference.py                  | 7 +++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
index 500cd6ff8ccc6..bffcc30154370 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -39,7 +39,6 @@ void CostForOneSequence::calValidExpandStep() {
     if (start + beamSize_ == findEnd) return;
     goldColIds_[i] = findEnd - start;
   }
-
   if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false;
 }
 
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 4dcc3ab57e7e6..8acea6155c588 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -70,7 +70,7 @@ def iter_infer_field(self, field, **kwargs):
                 item = [each_result[each_field] for each_field in field]
                 yield item
 
-    def infer(self, input, field='value', **kwargs):
+    def infer(self, input, field='value', flatten_result=True, **kwargs):
         """
         Infer a data by model.
         :param input: input data batch. Should be python iterable object.
@@ -83,7 +83,10 @@ def infer(self, input, field='value', **kwargs):
         retv = [[] for i in xrange(len(result))]
         for i, item in enumerate(result):
             retv[i].append(item)
-        retv = [numpy.concatenate(out) for out in retv]
+
+        if flatten_result:
+            retv = [numpy.concatenate(out) for out in retv]
+
         if len(retv) == 1:
             return retv[0]
         else:

From 36f0aa7390e3044b8e26d1787f99ed5edaf27ed0 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Thu, 31 Aug 2017 13:06:22 +0800
Subject: [PATCH 9/9] fix code style to pass CI.
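Style fixes flagged by the CI linter, all visible in the diff below: add
braces around single-statement else branches, mark the one-argument
BeamExpansion constructor explicit so that an integer is no longer
implicitly converted to a BeamExpansion (the resize call in
splitBatchBeams now constructs the fill value explicitly), and drop the
stray semicolons after function bodies.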
---
 paddle/gserver/layers/CrossEntropyOverBeam.cpp | 11 +++++++----
 paddle/gserver/layers/CrossEntropyOverBeam.h   |  6 +++---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
index bffcc30154370..4acc077035b17 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -28,8 +28,9 @@ void CostForOneSequence::calValidExpandStep() {
           start,
           start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1],
           [](const real& val) { return val != -1.; });
-    } else
+    } else {
       goldRowIds_[i] = 0;
+    }
 
     real* start =
         beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_;
@@ -288,7 +289,7 @@ void CrossEntropyOverBeam::copyInputsToCpu() {
 
 void CrossEntropyOverBeam::splitBatchBeams() {
   beamCosts_.resize(batchSize_);
-  beamPerSeq_.resize(batchSize_, beamExpanCount_);
+  beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_));
 
   for (size_t i = 0; i < beamExpanCount_; ++i) {
     int* seqStarts =
@@ -300,8 +301,9 @@ void CrossEntropyOverBeam::splitBatchBeams() {
       subSeqStarts =
           getInput(i * 3).subSequenceStartPositions->getMutableData(false);
       maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
-    } else
+    } else {
       maxLen = getInput(i).sequenceStartPositions->getSize() - 1;
+    }
 
     for (size_t j = 0; j < batchSize_; ++j) {
       beamPerSeq_[j].scores[i] =
@@ -348,8 +350,9 @@ void CrossEntropyOverBeam::resizeOutput() {
                              inGrad->getWidth(),
                              false,
                              false);
-    } else
+    } else {
       candidateScoreGrad_[i] = std::move(inGrad);
+    }
     candidateScoreGrad_[i]->zeroMem();
   }
 }
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
index 5d0cffee3c159..5643556f43370 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -31,7 +31,7 @@ struct BeamExpansion {
 
   size_t expansionCount;
 
-  BeamExpansion(int n) {
+  explicit BeamExpansion(int n) {
     expansionCount = n;
     scores.resize(expansionCount);
     seqInfo.resize(expansionCount);
@@ -39,7 +39,7 @@ struct BeamExpansion {
     scoreGrad.resize(expansionCount);
 
     gold.resize(expansionCount);
-  };
+  }
 };
 typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;

@@ -74,7 +74,7 @@ class CostForOneSequence {
     CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
     int* starts = beams_->seqInfo[beamId]->getData();
     return starts[rowId] - starts[0];
-  };
+  }
 
   size_t beamSize_;
   size_t validExpansionCount_;