diff --git a/.travis.yml b/.travis.yml index 5b14f8e61e614..047ca6ffe79bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -56,7 +56,7 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit + - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit requests==2.9.2 LinkChecker script: - paddle/scripts/travis/main.sh notifications: diff --git a/cmake/util.cmake b/cmake/util.cmake index 38366373c6dbc..8a71b23c62d9f 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -96,6 +96,7 @@ function(link_paddle_exe TARGET_NAME) target_circle_link_libraries(${TARGET_NAME} ARCHIVE_START paddle_gserver + paddle_function ${METRIC_LIBS} ARCHIVE_END paddle_pserver @@ -106,6 +107,7 @@ function(link_paddle_exe TARGET_NAME) paddle_parameter paddle_proto paddle_cuda + paddle_test_main ${METRIC_LIBS} ${PROTOBUF_LIBRARY} ${LIBGLOG_LIBRARY} diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py index 042cd4e7a9e25..360c57ea6283c 100644 --- a/demo/semantic_role_labeling/dataprovider.py +++ b/demo/semantic_role_labeling/dataprovider.py @@ -43,7 +43,7 @@ def get_batch_size(yeild_data): init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, - can_over_batch_size=False, + can_over_batch_size=True, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_name): with open(file_name, 'r') as fdata: diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 7633bf4d576ee..57725c0d85997 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ 
b/doc/getstarted/build_and_install/docker_install_en.rst @@ -39,12 +39,20 @@ The general development workflow with Docker and Bazel is as follows: code. This image contains all the development tools and dependencies of PaddlePaddle. - .. code-block:: bash cd paddle docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile . + Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such a case, we can specify an apt-source mirror server that is geographically nearer to us. In the following example, we specified an apt-source server that responds fast in China. You can specify the Ubuntu mirror with :code:`--build-arg UBUNTU_MIRROR` like the example below. + + .. code-block:: bash + + docker build \ + --build-arg UBUNTU_MIRROR="http://mirrors.163.com" \ + -t paddle:dev \ + -f paddle/scripts/docker/Dockerfile . + 3. Run the image as a container and mounting local source code directory into the container. This allows us to change the code on diff --git a/doc/howto/deep_model/rnn/rnn_cn.md b/doc/howto/deep_model/rnn/rnn_cn.md new file mode 100644 index 0000000000000..5ec05b2cab9ba --- /dev/null +++ b/doc/howto/deep_model/rnn/rnn_cn.md @@ -0,0 +1,226 @@ +RNN 配置 +================= + +本教程将指导你如何在 PaddlePaddle 中配置循环神经网络(RNN)。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何: + +- 准备用来学习循环神经网络的序列数据。 +- 配置循环神经网络架构。 +- 使用学习完成的循环神经网络模型生成序列。 + +我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence 模型的代码可以在`demo/seqToseq`找到。 + +准备序列数据 +--------------------- + +PaddlePaddle 不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。 它们都是序列,它们的大小是`src_dict`,`trg_dict`和`trg_dict`: + +``` sourceCode +settings.input_types = [ + integer_value_sequence(len(settings.src_dict)), + integer_value_sequence(len(settings.trg_dict)), + integer_value_sequence(len(settings.trg_dict))] +``` + +在`process`函数中,每个`yield`函数将返回三个整数列表。每个整数列表被视为一个整数序列: + +``` sourceCode +yield src_ids, trg_ids, trg_ids_next +``` + 
+有关如何编写数据提供程序的更多细节描述,请参考 [PyDataProvider2](../../ui/data_provider/index.html)。完整的数据提供文件在 `demo/seqToseq/dataprovider.py`。 + +配置循环神经网络架构 +----------------------------------------------- + +### 简单门控循环神经网络(Gated Recurrent Neural Network) + +循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。 + +![image](../../../tutorials/sentiment_analysis/bi_lstm.jpg) + +一般来说,循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t* = 1 执行以下操作。 + +*x**t* + 1 = *f**x*(*x**t*),*y**t* = *f**y*(*x**t*) + +其中 *f**x*(.) 称为**单步函数**(即单时间步执行的函数,step function),而 *f**y*(.) 称为**输出函数**。在 vanilla 循环神经网络中,单步函数和输出函数都非常简单。然而,PaddlePaddle 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to sequence 模型演示如何配置复杂的循环神经网络模型。在本节中,我们将使用简单的 vanilla 循环神经网络作为使用`recurrent_group`配置简单循环神经网络的例子。 注意,如果你只需要使用简单的RNN,GRU或LSTM,那么推荐使用`grumemory`和`lstmemory`,因为它们的计算效率比`recurrent_group`更高。 + +对于 vanilla RNN,在每个时间步长,**单步函数**为: + +*x**t* + 1 = *W**x**x**t* + *W**i**I**t* + *b* + +其中 *x**t* 是RNN状态,并且 *I**t* 是输入,*W**x* 和 *W**i* 分别是RNN状态和输入的变换矩阵。*b* 是偏差。它的**输出函数**只需要*x**t*作为输出。 + +`recurrent_group`是构建循环神经网络的最重要的工具。 它定义了**单步函数**,**输出函数**和循环神经网络的输入。注意,这个函数的`step`参数需要实现`step function`(单步函数)和`output function`(输出函数): + + +``` sourceCode +def simple_rnn(input, + size=None, + name=None, + reverse=False, + rnn_bias_attr=None, + act=None, + rnn_layer_attr=None): + def __rnn_step__(ipt): + out_mem = memory(name=name, size=size) + rnn_out = mixed_layer(input = [full_matrix_projection(ipt), + full_matrix_projection(out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) + return rnn_out + return recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) +``` + +PaddlePaddle 使用“Memory”(记忆模块)实现单步函数。**Memory**是在PaddlePaddle中构造循环神经网络时最重要的概念。 Memory是在单步函数中循环使用的状态,例如*x**t* + 1 = *f**x*(*x**t*)。 一个Memory包含**输出**和**输入**。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有**boot layer(引导层)**,其输出被用作Memory的初始值。 在我们的例子中,门控循环单元的输出被用作输出Memory。请注意,`rnn_out`层的名称与`out_mem`的名称相同。这意味着`rnn_out` 
(*x**t* + 1)的输出被用作`out_mem`Memory的**输出**。 + +Memory也可以是序列。在这种情况下,在每个时间步中,我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 其他高级功能包括定义多个Memory,以及使用子序列来定义分级循环神经网络架构。 + +我们在函数的结尾返回`rnn_out`。 这意味着 `rnn_out` 层的输出被用作门控循环神经网络的**输出**函数。 + +### Sequence to Sequence Model with Attention + +我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。 + +![image](../../../tutorials/text_generation/encoder-decoder-attention-model.png) + +在这个模型中,源序列 *S* = {*s*1, …, *s**T*} 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 *H**S* = {*H*1, …, *H**T*} 被称为 *编码向量*。解码器是门控循环神经网络。当解读每一个*y**t*时, 这个门控循环神经网络生成一系列权重 *W**S**t* = {*W*1*t*, …, *W**T**t*}, 用于计算编码向量的加权和。加权和用来生成*y**t*。 + +模型的编码器部分如下所示。它叫做`grumemory`来表示门控循环神经网络。如果网络架构简单,那么推荐使用循环神经网络的方法,因为它比 `recurrent_group` 更快。我们已经实现了大多数常用的循环神经网络架构,可以参考 [Layers](../../ui/api/trainer_config_helpers/layers_index.html) 了解更多细节。 + +我们还将编码向量投射到 `decoder_size` 维空间。这通过获得反向循环网络的第一个实例,并将其投射到 `decoder_size` 维空间完成: + +``` sourceCode +# 定义源语句的数据层 +src_word_id = data_layer(name='source_language_word', size=source_dict_dim) +# 计算每个词的词向量 +src_embedding = embedding_layer( + input=src_word_id, + size=word_vector_dim, + param_attr=ParamAttr(name='_source_language_embedding')) +# 应用前向循环神经网络 +src_forward = grumemory(input=src_embedding, size=encoder_size) +# 应用反向递归神经网络(reverse=True表示反向循环神经网络) +src_backward = grumemory(input=src_embedding, + size=encoder_size, + reverse=True) +# 将循环神经网络的前向和反向部分混合在一起 +encoded_vector = concat_layer(input=[src_forward, src_backward]) + +# 投射编码向量到 decoder_size +encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], + size = decoder_size) + +# 计算反向RNN的第一个实例 +backward_first = first_seq(input=src_backward) + +# 投射反向RNN的第一个实例到 decoder size +decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) +``` + +解码器使用 `recurrent_group` 来定义循环神经网络。单步函数和输出函数在 `gru_decoder_with_attention` 中定义: + +``` sourceCode +group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + 
StaticInput(input=encoded_proj,is_seq=True)] +trg_embedding = embedding_layer( + input=data_layer(name='target_language_word', + size=target_dict_dim), + size=word_vector_dim, + param_attr=ParamAttr(name='_target_language_embedding')) +group_inputs.append(trg_embedding) + +# 对于配备有注意力机制的解码器,在训练中, +# 目标向量(groudtruth)是数据输入, +# 而源序列的编码向量可以被无边界的memory访问 +# StaticInput 意味着不同时间步的输入都是相同的值, +# 否则它以一个序列输入,不同时间步的输入是不同的。 +# 所有输入序列应该有相同的长度。 +decoder = recurrent_group(name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) +``` + +单步函数的实现如下所示。首先,它定义解码网络的**Memory**。然后定义 attention,门控循环单元单步函数和输出函数: + +``` sourceCode +def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + # 定义解码器的Memory + # Memory的输出定义在 gru_step 内 + # 注意 gru_step 应该与它的Memory名字相同 + decoder_mem = memory(name='gru_decoder', + size=decoder_size, + boot_layer=decoder_boot) + # 计算 attention 加权编码向量 + context = simple_attention(encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + # 混合当前词向量和attention加权编码向量 + decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), + full_matrix_projection(current_word)], + size = decoder_size * 3) + # 定义门控循环单元循环神经网络单步函数 + gru_step = gru_step_layer(name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + # 定义输出函数 + out = mixed_layer(input=[full_matrix_projection(input=gru_step)], + size=target_dict_dim, + bias_attr=True, + act=SoftmaxActivation()) + return out +``` + +生成序列 +----------------- + +训练模型后,我们可以使用它来生成序列。通常的做法是使用**beam search** 生成序列。以下代码片段定义 beam search 算法。注意,`beam_search` 函数假设 `step` 的输出函数返回的是下一个时刻输出词的 softmax 归一化概率向量。我们对模型进行了以下更改。 + +- 使用 `GeneratedInput` 来表示 trg\_embedding。 `GeneratedInput` 将上一时间步所生成的词的向量来作为当前时间步的输入。 +- 使用 `beam_search` 函数。这个函数需要设置: + - `bos_id`: 开始标记。每个句子都以开始标记开头。 + - `eos_id`: 结束标记。每个句子都以结束标记结尾。 + - `beam_size`: beam search 算法中的beam大小。 + - `max_length`: 生成序列的最大长度。 +- 使用 `seqtext_printer_evaluator` 根据索引矩阵和字典打印文本。这个函数需要设置: + - `id_input`: 
数据的整数ID,用于标识生成的文件中的相应输出。 + - `dict_file`: 用于将词ID转换为词的字典文件。 + - `result_file`: 生成结果文件的路径。 + +代码如下: + +``` sourceCode +group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] +# 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。 +# 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。 +# 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 <s>。 +trg_embedding = GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) +group_inputs.append(trg_embedding) +beam_gen = beam_search(name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) + +seqtext_printer_evaluator(input=beam_gen, + id_input=data_layer(name="sent_id", size=1), + dict_file=trg_dict_path, + result_file=gen_trans_file) +outputs(beam_gen) +``` + +注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 [Semantic Role Labeling Demo](../../demo/semantic_role_labeling/index.html) 了解更多详细信息。 + +完整的配置文件在`demo/seqToseq/seqToseq_net.py`。 diff --git a/doc/howto/deep_model/rnn_config_cn.rst b/doc/howto/deep_model/rnn_config_cn.rst new file mode 100644 index 0000000000000..e6d8c1133a5e8 --- /dev/null +++ b/doc/howto/deep_model/rnn_config_cn.rst @@ -0,0 +1,287 @@ +RNN 配置 +======== + +本教程将指导你如何在 PaddlePaddle +中配置循环神经网络(RNN)。PaddlePaddle +高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何: + +- 准备用来学习循环神经网络的序列数据。 +- 配置循环神经网络架构。 +- 使用学习完成的循环神经网络模型生成序列。 + +我们将使用 vanilla 循环神经网络和 sequence to sequence +模型来指导你完成这些步骤。sequence to sequence +模型的代码可以在\ ``demo/seqToseq``\ 找到。 + +准备序列数据 +------------ + +PaddlePaddle +不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。 +它们都是序列,它们的大小是\ ``src_dict``\ ,\ ``trg_dict``\ 和\ ``trg_dict``\ : + +.. 
code:: sourcecode + + settings.input_types = [ + integer_value_sequence(len(settings.src_dict)), + integer_value_sequence(len(settings.trg_dict)), + integer_value_sequence(len(settings.trg_dict))] + +在\ ``process``\ 函数中,每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列: + +.. code:: sourcecode + + yield src_ids, trg_ids, trg_ids_next + +有关如何编写数据提供程序的更多细节描述,请参考 +`PyDataProvider2 <../../ui/data_provider/index.html>`__\ 。完整的数据提供文件在 +``demo/seqToseq/dataprovider.py``\ 。 + +配置循环神经网络架构 +-------------------- + +简单门控循环神经网络(Gated Recurrent Neural Network) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。 + +.. figure:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg + :alt: image + + image + +一般来说,循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t* += 1 执行以下操作。 + +*x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ ),\ *y*\ \ *t*\  = *f*\ \ *y*\ (*x*\ \ *t*\ ) + +其中 *f*\ \ *x*\ (.) 称为\ **单步函数**\ (即单时间步执行的函数,step +function),而 *f*\ \ *y*\ (.) 称为\ **输出函数**\ 。在 vanilla +循环神经网络中,单步函数和输出函数都非常简单。然而,PaddlePaddle +可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to +sequence +模型演示如何配置复杂的循环神经网络模型。在本节中,我们将使用简单的 +vanilla +循环神经网络作为使用\ ``recurrent_group``\ 配置简单循环神经网络的例子。 +注意,如果你只需要使用简单的RNN,GRU或LSTM,那么推荐使用\ ``grumemory``\ 和\ ``lstmemory``\ ,因为它们的计算效率比\ ``recurrent_group``\ 更高。 + +对于 vanilla RNN,在每个时间步长,\ **单步函数**\ 为: + +*x*\ \ *t* + 1 = *W*\ \ *x*\ \ *x*\ \ *t*\  + *W*\ \ *i*\ \ *I*\ \ *t*\  + *b* + +其中 *x*\ \ *t*\ 是RNN状态,并且 *I*\ \ *t*\ 是输入,\ *W*\ \ *x*\ 和 +*W*\ \ *i*\ 分别是RNN状态和输入的变换矩阵。\ *b* +是偏差。它的\ **输出函数**\ 只需要\ *x*\ \ *t*\ 作为输出。 + +``recurrent_group``\ 是构建循环神经网络的最重要的工具。 +它定义了\ **单步函数**\ ,\ **输出函数**\ 和循环神经网络的输入。注意,这个函数的\ ``step``\ 参数需要实现\ ``step function``\ (单步函数)和\ ``output function``\ (输出函数): + +.. 
code:: sourcecode + + def simple_rnn(input, + size=None, + name=None, + reverse=False, + rnn_bias_attr=None, + act=None, + rnn_layer_attr=None): + def __rnn_step__(ipt): + out_mem = memory(name=name, size=size) + rnn_out = mixed_layer(input = [full_matrix_projection(ipt), + full_matrix_projection(out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) + return rnn_out + return recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) + +PaddlePaddle +使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。 +Memory是在单步函数中循环使用的状态,例如\ *x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ )。 +一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot +layer(引导层)**\ ,其输出被用作Memory的初始值。 +在我们的例子中,门控循环单元的输出被用作输出Memory。请注意,\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out`` +(*x*\ \ *t* + 1)的输出被用作\ ``out_mem``\ Memory的\ **输出**\ 。 + +Memory也可以是序列。在这种情况下,在每个时间步中,我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 +其他高级功能包括定义多个Memory,以及使用子序列来定义分级循环神经网络架构。 + +我们在函数的结尾返回\ ``rnn_out``\ 。 这意味着 ``rnn_out`` +层的输出被用作门控循环神经网络的\ **输出**\ 函数。 + +Sequence to Sequence Model with Attention +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +我们将使用 sequence to sequence model with attention +作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。 + +.. 
figure:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png + :alt: image + + image + +在这个模型中,源序列 *S* = {*s*\ 1, …, \ *s*\ \ *T*\ } +用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 +*H*\ \ *S*\  = {*H*\ 1, …, \ *H*\ \ *T*\ } 被称为 +*编码向量*\ 。解码器是门控循环神经网络。当解读每一个\ *y*\ \ *t*\ 时, +这个门控循环神经网络生成一系列权重 +*W*\ \ *S*\ \ *t*\  = {*W*\ 1\ *t*\ , …, \ *W*\ \ *T*\ \ *t*\ }, +用于计算编码向量的加权和。加权和用来生成\ *y*\ \ *t*\ 。 + +模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单,那么推荐使用循环神经网络的方法,因为它比 +``recurrent_group`` +更快。我们已经实现了大多数常用的循环神经网络架构,可以参考 +`Layers <../../ui/api/trainer_config_helpers/layers_index.html>`__ +了解更多细节。 + +我们还将编码向量投射到 ``decoder_size`` +维空间。这通过获得反向循环网络的第一个实例,并将其投射到 +``decoder_size`` 维空间完成: + +.. code:: sourcecode + + # 定义源语句的数据层 + src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + # 计算每个词的词向量 + src_embedding = embedding_layer( + input=src_word_id, + size=word_vector_dim, + param_attr=ParamAttr(name='_source_language_embedding')) + # 应用前向循环神经网络 + src_forward = grumemory(input=src_embedding, size=encoder_size) + # 应用反向递归神经网络(reverse=True表示反向循环神经网络) + src_backward = grumemory(input=src_embedding, + size=encoder_size, + reverse=True) + # 将循环神经网络的前向和反向部分混合在一起 + encoded_vector = concat_layer(input=[src_forward, src_backward]) + + # 投射编码向量到 decoder_size + encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], + size = decoder_size) + + # 计算反向RNN的第一个实例 + backward_first = first_seq(input=src_backward) + + # 投射反向RNN的第一个实例到 decoder size + decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + +解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在 +``gru_decoder_with_attention`` 中定义: + +.. 
code:: sourcecode + + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] + trg_embedding = embedding_layer( + input=data_layer(name='target_language_word', + size=target_dict_dim), + size=word_vector_dim, + param_attr=ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # 对于配备有注意力机制的解码器,在训练中, + # 目标向量(groudtruth)是数据输入, + # 而源序列的编码向量可以被无边界的memory访问 + # StaticInput 意味着不同时间步的输入都是相同的值, + # 否则它以一个序列输入,不同时间步的输入是不同的。 + # 所有输入序列应该有相同的长度。 + decoder = recurrent_group(name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + +单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义 +attention,门控循环单元单步函数和输出函数: + +.. code:: sourcecode + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + # 定义解码器的Memory + # Memory的输出定义在 gru_step 内 + # 注意 gru_step 应该与它的Memory名字相同 + decoder_mem = memory(name='gru_decoder', + size=decoder_size, + boot_layer=decoder_boot) + # 计算 attention 加权编码向量 + context = simple_attention(encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + # 混合当前词向量和attention加权编码向量 + decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), + full_matrix_projection(current_word)], + size = decoder_size * 3) + # 定义门控循环单元循环神经网络单步函数 + gru_step = gru_step_layer(name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + # 定义输出函数 + out = mixed_layer(input=[full_matrix_projection(input=gru_step)], + size=target_dict_dim, + bias_attr=True, + act=SoftmaxActivation()) + return out + +生成序列 +-------- + +训练模型后,我们可以使用它来生成序列。通常的做法是使用\ **beam search** +生成序列。以下代码片段定义 beam search 算法。注意,\ ``beam_search`` +函数假设 ``step`` 的输出函数返回的是下一个时刻输出词的 softmax +归一化概率向量。我们对模型进行了以下更改。 + +- 使用 ``GeneratedInput`` 来表示 trg\_embedding。 ``GeneratedInput`` + 将上一时间步所生成的词的向量来作为当前时间步的输入。 +- 使用 ``beam_search`` 函数。这个函数需要设置: + + - ``bos_id``: 开始标记。每个句子都以开始标记开头。 + - ``eos_id``: 结束标记。每个句子都以结束标记结尾。 + - ``beam_size``: beam search 
算法中的beam大小。 + - ``max_length``: 生成序列的最大长度。 + +- 使用 ``seqtext_printer_evaluator`` + 根据索引矩阵和字典打印文本。这个函数需要设置: + + - ``id_input``: 数据的整数ID,用于标识生成的文件中的相应输出。 + - ``dict_file``: 用于将词ID转换为词的字典文件。 + - ``result_file``: 生成结果文件的路径。 + +代码如下: + +.. code:: sourcecode + + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] + # 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。 + # 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。 + # 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 <s>。 + trg_embedding = GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) + group_inputs.append(trg_embedding) + beam_gen = beam_search(name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) + + seqtext_printer_evaluator(input=beam_gen, + id_input=data_layer(name="sent_id", size=1), + dict_file=trg_dict_path, + result_file=gen_trans_file) + outputs(beam_gen) + +注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 +`Semantic Role Labeling +Demo <../../demo/semantic_role_labeling/index.html>`__ +了解更多详细信息。 + +完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。 diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index fb3af8ea92fee..2daea052b01ad 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(cuda) +add_subdirectory(function) add_subdirectory(utils) add_subdirectory(math) add_subdirectory(parameter) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index a7f17e186bf6b..da6dad10cd807 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -48,6 +48,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp WORKING_DIRECTORY ${PROJ_ROOT}/paddle DEPENDS python_swig_sources paddle_parameter + paddle_function paddle_math paddle_utils paddle_gserver diff --git a/paddle/api/paddle_ld_flags.py 
b/paddle/api/paddle_ld_flags.py index 51d7dfee58b78..7c8206e3fe097 100644 --- a/paddle/api/paddle_ld_flags.py +++ b/paddle/api/paddle_ld_flags.py @@ -30,8 +30,8 @@ whole_end = "" LIB_DIRS = [ - "math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", - "trainer" + "math", 'function', 'utils', 'parameter', "gserver", "api", "cuda", + "pserver", "trainer" ] PARENT_LIB_DIRS = ['proto'] @@ -75,6 +75,7 @@ def libs_str(self): libs = [ whole_start, "-lpaddle_gserver", + "-lpaddle_function", whole_end, "-lpaddle_pserver", "-lpaddle_trainer_lib", diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 06ee3b3654b57..c5787630abbe1 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -240,62 +240,6 @@ extern void hl_avgpool_backward(const int frameCnt, real* backGrad, const int outStride); -/** - * @brief Cross-map-respose normalize forward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] in input data. - * @param[in] scale buffer. - * @param[out] out output data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] sizeX size. - * @param[in] alpha scale. - * @param[in] beta scale. - * - */ -extern void hl_CMRNorm_forward(size_t frameCnt, - const real* in, - real* scale, - real* out, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta); - -/** - * @brief Cross-map-respose normalize backward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] inV input data. - * @param[in] scale buffer. - * @param[out] outV output value. - * @param[out] outDiff output grad. - * @param[out] inDiff input grad. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] sizeX size. - * @param[in] alpha scale. - * @param[in] beta scale. 
- * - */ -extern void hl_CMRNorm_backward(size_t frameCnt, - const real* inV, - const real* scale, - const real* outV, - const real* outDiff, - real* inDiff, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta); - /** * @brief Bilinear interpolation forward. * diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 52c978735279e..039551c6cc695 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -117,30 +117,6 @@ inline void hl_avgpool_backward(const int frameCnt, real* backGrad, const int outStride) {} -inline void hl_CMRNorm_forward(size_t frameCnt, - const real* in, - real* scale, - real* out, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta) {} - -inline void hl_CMRNorm_backward(size_t frameCnt, - const real* inV, - const real* scale, - const real* outV, - const real* outDiff, - real* inDiff, - size_t channels, - size_t height, - size_t width, - size_t sizeX, - real alpha, - real beta) {} - inline void hl_bilinear_forward(const real* inData, const size_t inImgH, const size_t inImgW, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 0992286f360fb..b94f4d8fe4a25 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -381,164 +381,6 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad, CHECK_SYNC("hl_avgpool_backward failed"); } -__global__ void KeCMRNormFillScale(size_t nthreads, const real* in, - real* scale, size_t channels, - size_t height, size_t width, size_t size, - real alpha) { - size_t index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < nthreads) { - // find out the local offset - size_t w = index % width; - size_t h = (index / width) % height; - size_t n = index / width / height; - size_t offset = (n * channels * height + h) * width + w; - size_t step = height * width; - in += offset; - scale += 
offset; - size_t head = 0; - size_t pre_pad = (size - 1) / 2; - size_t post_pad = size - pre_pad - 1; - real accum_scale = 0; - // fill the scale at [n, :, h, w] - // accumulate values - while (head < post_pad) { - accum_scale += in[head * step] * in[head * step]; - ++head; - } - // until we reach size, nothing needs to be subtracted - while (head < size) { - accum_scale += in[head * step] * in[head * step]; - scale[(head - post_pad) * step] = 1. + accum_scale * alpha; - ++head; - } - // both add and subtract - while (head < channels) { - accum_scale += in[head * step] * in[head * step]; - accum_scale -= in[(head - size) * step] * in[(head - size) * step]; - scale[(head - post_pad) * step] = 1. + accum_scale * alpha; - ++head; - } - // subtract only - while (head < channels + post_pad) { - accum_scale -= in[(head - size) * step] * in[(head - size) * step]; - scale[(head - post_pad) * step] = 1. + accum_scale * alpha; - ++head; - } - } -} - - __global__ void KeCMRNormOutput(size_t nthreads, const real* in, - const real* scale, real negative_beta, - real* out) { - size_t index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < nthreads) { - out[index] = in[index] * pow(scale[index], negative_beta); - } -} - -void hl_CMRNorm_forward(size_t frameCnt, const real* in, real* scale, - real* out, size_t channels, - size_t height, size_t width, size_t sizeX, - real alpha, real beta) { - size_t threadsNum = frameCnt * height * width; - size_t blocksX = (threadsNum + 1024 - 1) / 1024; - size_t blocksY = 1; - dim3 threads(1024, 1); - dim3 grid(blocksX, blocksY); - - KeCMRNormFillScale<<>> - (threadsNum, in, scale, channels, height, width, sizeX, alpha); - - threadsNum = frameCnt * height * width *channels; - blocksX = (threadsNum + 1024 -1) / 1024; - dim3 threads2(1024, 1); - dim3 grid2(blocksX, blocksY); - KeCMRNormOutput<<>> - (threadsNum, in, scale, beta, out); - CHECK_SYNC("hl_CMRNorm_forward"); -} - -__global__ void KeCMRNormDiff(size_t nthreads, const real* 
bottom_data, - const real* top_data, const real* scale, - const real* top_diff, size_t channels, - size_t height, size_t width, size_t size, - real negative_beta, real cache_ratio, - real* bottom_diff ) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < nthreads) { - // find out the local offset - size_t w = index % width; - size_t h = (index / width) % height; - size_t n = index / width / height; - size_t offset = (n * channels * height + h) * width + w; - size_t step = height * width; - bottom_data += offset; - top_data += offset; - scale += offset; - top_diff += offset; - bottom_diff += offset; - int head = 0; - int pre_pad = size - (size + 1) / 2; - int post_pad = size - pre_pad - 1; - real accum_ratio = 0; - // accumulate values - while (head < post_pad) { - accum_ratio += top_diff[head * step] * - top_data[head * step] / scale[head * step]; - ++head; - } - // until we reach size, nothing needs to be subtracted - while (head < size) { - accum_ratio += top_diff[head * step] * - top_data[head * step] / scale[head * step]; - bottom_diff[(head - post_pad) * step] += - top_diff[(head - post_pad) * step] * - pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } - // both add and subtract - while (head < channels) { - accum_ratio += top_diff[head * step] * top_data[head * step] / - scale[head * step]; - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - bottom_diff[(head - post_pad) * step] += - top_diff[(head - post_pad) * step] * - pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } - // subtract only - while (head < channels + post_pad) { - accum_ratio -= top_diff[(head - size) * step] * - top_data[(head - size) * step] / scale[(head - size) * step]; - bottom_diff[(head - post_pad) * step] += - top_diff[(head - 
post_pad) * step] * - pow(scale[(head - post_pad) * step], negative_beta) - cache_ratio * - bottom_data[(head - post_pad) * step] * accum_ratio; - ++head; - } - } -} - -void hl_CMRNorm_backward(size_t frameCnt, const real* inV, - const real* scale, - const real* outV, const real* outDiff, - real *inDiff, size_t channels, - size_t height, size_t width, size_t sizeX, - real alpha, real beta) { - size_t threadsNum = frameCnt * height * width; - size_t blocksX = (threadsNum + 1024 - 1) / 1024; - size_t blocksY = 1; - dim3 threads(1024, 1); - dim3 grid(blocksX, blocksY); - KeCMRNormDiff <<>> - (threadsNum, inV, outV, scale, outDiff, channels, - height, width, sizeX, alpha, beta, inDiff); - CHECK_SYNC("hl_CMRNorm_backward"); -} - __global__ void KeBilinearInterpFw(const real* in, const size_t inImgH, const size_t inImgW, diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt new file mode 100644 index 0000000000000..0697842bbef62 --- /dev/null +++ b/paddle/function/CMakeLists.txt @@ -0,0 +1,27 @@ +file(GLOB h_files . *_op.h) +file(GLOB cpp_files . *_op.cpp) + +list(APPEND h_files Function.h) +list(APPEND cpp_files Function.cpp) + +if(WITH_GPU) + file(GLOB cu_files . *_op_gpu.cu) + cuda_compile(cu_objs ${cu_files}) +endif() + +add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) + +add_library(paddle_test_main STATIC TestMain.cpp) + +if(WITH_GPU) + # TODO: + # file(GLOB test_files . *_op_test.cpp) + # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files}) + add_simple_unittest(cross_map_normal_op_test) +endif() + +add_style_check_target(paddle_function ${h_files}) +add_style_check_target(paddle_function ${cpp_files}) +if(WITH_GPU) + add_style_check_target(paddle_function ${cu_files}) +endif() diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp new file mode 100644 index 0000000000000..02880e5ea1acb --- /dev/null +++ b/paddle/function/Function.cpp @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" + +namespace paddle { + +template <> +size_t FuncConfig::get(const std::string& key) const { + auto it = valueMap_.find(key); + CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; + return it->second.s; +} + +template <> +real FuncConfig::get(const std::string& key) const { + auto it = valueMap_.find(key); + CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'"; + return it->second.r; +} + +template <> +FuncConfig& FuncConfig::set(const std::string& key, size_t v) { + CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key; + valueMap_[key].s = v; + return *this; +} + +template <> +FuncConfig& FuncConfig::set(const std::string& key, real v) { + CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key; + valueMap_[key].r = v; + return *this; +} + +ClassRegistrar FunctionBase::funcRegistrar_; + +} // namespace paddle diff --git a/paddle/function/Function.h b/paddle/function/Function.h new file mode 100644 index 0000000000000..095584c0b19f7 --- /dev/null +++ b/paddle/function/Function.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/math/Matrix.h" +#include "paddle/utils/ClassRegistrar.h" + +namespace paddle { + +enum DeviceType { + DEVICE_TYPE_UNSPECIFIED = 0, + DEVICE_TYPE_CPU = 1, + DEVICE_TYPE_GPU = 2, +}; + +template +struct MatrixT; + +template <> +struct MatrixT { + using type = CpuMatrix; +}; + +template <> +struct MatrixT { + using type = GpuMatrix; +}; + +typedef std::vector Dims; + +class Tensor { +public: + Tensor(real* data, const Dims& dim) : buf_(data), dims_(dim) {} + + real* getData() const { return buf_; } + + real* buf_; + Dims dims_; +}; + +typedef std::vector Arguments; + +class FuncConfig { +public: + union value { + size_t s; + real r; + }; + + template + T get(const std::string& key) const; + + template + FuncConfig& set(const std::string& key, T v); + +protected: + std::map valueMap_; +}; + +class FunctionBase { +public: + virtual ~FunctionBase() {} + + virtual void init(const FuncConfig& config) {} + + virtual void calc(const Arguments& inputs, + const Arguments& outputs, + const Arguments& inouts) {} + + static ClassRegistrar funcRegistrar_; +}; + +#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName + +#define REGISTER_TYPED_FUNC(typeName, deviceName, className) \ + static InitFunction __reg_type_##typeName##deviceName([]() { \ + FunctionBase::funcRegistrar_ \ + .registerClass>( \ + FUNC_NAME(typeName, deviceName)); \ + }) + +} // namespace paddle diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h new file mode 100644 index 0000000000000..a8c5e412bd12d --- /dev/null +++ 
b/paddle/function/FunctionTest.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include "paddle/math/Vector.h" +#include "paddle/math/tests/TensorCheck.h" + +namespace paddle { + +class FunctionCompare { +public: + FunctionCompare(const std::string& name, const FuncConfig& config) + : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")), + gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) { + cpu->init(config); + gpu->init(config); + } + + void cmpWithArg(const Arguments& inputs, + const Arguments& outputs, + const Arguments& inouts) { + // init cpu and gpu arguments + auto initArgs = [=]( + Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) { + for (auto arg : inArgs) { + size_t size = sizeof(real); + for (auto dim : arg.dims_) { + size *= dim; + } + cpuMemory.emplace_back(std::make_shared(size)); + gpuMemory.emplace_back(std::make_shared(size)); + cpuArgs.emplace_back( + Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_)); + gpuArgs.emplace_back( + Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_)); + + // will use an api to refactor this code. 
+ CpuVector cpuVector(size / sizeof(real), + (real*)cpuArgs.back().getData()); + GpuVector gpuVector(size / sizeof(real), + (real*)gpuArgs.back().getData()); + cpuVector.uniform(0.001, 1); + gpuVector.copyFrom(cpuVector); + } + }; + initArgs(cpuInputs, gpuInputs, inputs); + initArgs(cpuOutputs, gpuOutputs, outputs); + initArgs(cpuInouts, gpuInouts, inouts); + + // function calculate + cpu->calc(cpuInputs, cpuOutputs, cpuInouts); + gpu->calc(gpuInputs, gpuOutputs, gpuInouts); + + // check outputs and inouts + auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) { + for (size_t i = 0; i < cpuArgs.size(); i++) { + auto cpu = cpuArgs[i]; + auto gpu = gpuArgs[i]; + size_t size = 1; + for (auto dim : cpu.dims_) { + size *= dim; + } + CpuVector cpuVector(size, (real*)cpu.getData()); + GpuVector gpuVector(size, (real*)gpu.getData()); + + autotest::TensorCheckErr(cpuVector, gpuVector); + } + }; + checkArgs(cpuOutputs, gpuOutputs); + checkArgs(cpuInouts, gpuInouts); + } + +protected: + std::shared_ptr cpu; + std::shared_ptr gpu; + std::vector cpuMemory; + std::vector gpuMemory; + Arguments cpuInputs; + Arguments cpuOutputs; + Arguments cpuInouts; + Arguments gpuInputs; + Arguments gpuOutputs; + Arguments gpuInouts; +}; + +} // namespace paddle + +using paddle::FunctionCompare; +using paddle::FuncConfig; +using paddle::Dims; +using paddle::Tensor; diff --git a/paddle/function/TestMain.cpp b/paddle/function/TestMain.cpp new file mode 100644 index 0000000000000..3e14532d1878f --- /dev/null +++ b/paddle/function/TestMain.cpp @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/utils/Util.h" + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/function/cross_map_normal_op.cpp b/paddle/function/cross_map_normal_op.cpp new file mode 100644 index 0000000000000..a9c7693830542 --- /dev/null +++ b/paddle/function/cross_map_normal_op.cpp @@ -0,0 +1,227 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "cross_map_normal_op.h" +#include "paddle/math/Vector.h" + +namespace paddle { + +template <> +void CrossMapNormal(real* outputs, + real* denoms, + const real* inputs, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow) { + size_t oneImage = height * width; + size_t oneSample = channels * oneImage; + + CpuVector outputsV(numSamples * oneSample, outputs); + CpuVector inputsV(numSamples * oneSample, const_cast(inputs)); + CpuVector denomsV(numSamples * oneSample, denoms); + + // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow) + // x represents inputs + // f(x) represents outputs + // denoms save the intermediate result for backward + denomsV = denomsV.constant(1.0); + const int start = -((int)size - 1) / 2; + const int end = (int)size + start; + for (size_t i = 0; i < numSamples; i++) { + real* oneDenom = denoms + i * oneSample; + real* oneInput = const_cast(inputs) + i * oneSample; + for (int c = 0; c < (int)channels; c++) { + CpuVector denom(oneImage, oneDenom + c * oneImage); + for (int s = start; s < end; s++) { + if (c + s >= 0 && c + s < (int)channels) { + CpuVector input(oneImage, oneInput + (c + s) * oneImage); + denom += input.square() * scale; + } + } + } + } + + outputsV = inputsV * denomsV.pow(-pow); +} + +template <> +void CrossMapNormalGrad(real* inputsGrad, + const real* inputsValue, + const real* outputsValue, + const real* outputsGrad, + const real* denoms, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow) { + size_t oneSample = channels * height * width; + std::function oneImage = [=](real* data, + size_t offset) { + return CpuVector(height * width, data + offset); + }; + + const int start = -((int)size) / 2; + const int end = (int)size + start; + const real ratio = -(real)2 * scale * pow; + for (size_t i = 0; i < numSamples; i++) { + size_t sOffset = i * oneSample; + real* oneInputGrad = inputsGrad + sOffset; + 
real* oneInputValue = const_cast(inputsValue) + sOffset; + real* oneDenom = const_cast(denoms) + sOffset; + real* oneOutputGrad = const_cast(outputsGrad) + sOffset; + real* oneOutputValue = const_cast(outputsValue) + sOffset; + + for (int c = 0; c < (int)channels; c++) { + size_t cOffset = c * height * width; + CpuVector inputGrad = oneImage(oneInputGrad, cOffset); + CpuVector inputValue = oneImage(oneInputValue, cOffset); + CpuVector denom = oneImage(oneDenom, cOffset); + CpuVector outputGrad = oneImage(oneOutputGrad, cOffset); + + inputGrad = inputGrad + denom.pow(-pow) * outputGrad; + for (int s = start; s < end; s++) { + if (c + s >= 0 && c + s < (int)channels) { + size_t offset = (c + s) * height * width; + CpuVector output = oneImage(oneOutputValue, offset); + CpuVector outputGrad = oneImage(oneOutputGrad, offset); + CpuVector denom = oneImage(oneDenom, offset); + + inputGrad += ((outputGrad * output * ratio) / denom) * inputValue; + } + } + } + } +} + +/** + * \param inputs[0] input value. + * \param outputs[0] output value. + * \param outputs[1] denoms. 
+ */ +template +class CrossMapNormalFunc : public FunctionBase { +public: + void init(const FuncConfig& config) override { + size_ = config.get("size"); + scale_ = config.get("scale"); + pow_ = config.get("pow"); + } + + void calc(const Arguments& inputs, + const Arguments& outputs, + const Arguments& inouts) override { + CHECK_EQ(1, inputs.size()); + CHECK_EQ(2, outputs.size()); + CHECK_EQ(0, inouts.size()); + + CHECK_EQ(inputs[0].dims_.size(), 4); + for (size_t i = 0; i < inputs[0].dims_.size(); i++) { + CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]); + CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]); + } + + size_t samples = inputs[0].dims_[0]; + size_t channels = inputs[0].dims_[1]; + size_t height = inputs[0].dims_[2]; + size_t width = inputs[0].dims_[3]; + + CrossMapNormal(outputs[0].getData(), + outputs[1].getData(), + inputs[0].getData(), + samples, + channels, + height, + width, + size_, + scale_, + pow_); + } + +private: + size_t size_; + real scale_; + real pow_; +}; + +/** + * \param inputs[0] input value. + * \param inputs[1] output value. + * \param inputs[2] output grad. + * \param inputs[3] denoms. + * \param outputs[0] input grad. 
+ */ +template +class CrossMapNormalGradFunc : public FunctionBase { +public: + void init(const FuncConfig& config) override { + size_ = config.get("size"); + scale_ = config.get("scale"); + pow_ = config.get("pow"); + } + + void calc(const Arguments& inputs, + const Arguments& outputs, + const Arguments& inouts) override { + CHECK_EQ(4, inputs.size()); + CHECK_EQ(1, outputs.size()); + CHECK_EQ(0, inouts.size()); + + CHECK_EQ(inputs[0].dims_.size(), 4); + for (size_t i = 0; i < inputs[0].dims_.size(); i++) { + CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]); + CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]); + CHECK_EQ(inputs[0].dims_[i], inputs[3].dims_[i]); + CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]); + } + + size_t samples = inputs[0].dims_[0]; + size_t channels = inputs[0].dims_[1]; + size_t height = inputs[0].dims_[2]; + size_t width = inputs[0].dims_[3]; + + CrossMapNormalGrad(outputs[0].getData(), + inputs[0].getData(), + inputs[1].getData(), + inputs[2].getData(), + inputs[3].getData(), + samples, + channels, + height, + width, + size_, + scale_, + pow_); + } + +private: + size_t size_; + real scale_; + real pow_; +}; + +REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc); +REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc); +REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/cross_map_normal_op.h b/paddle/function/cross_map_normal_op.h new file mode 100644 index 0000000000000..b1e401ad0a2f5 --- /dev/null +++ b/paddle/function/cross_map_normal_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Function.h" + +namespace paddle { + +/** + * \brief Cross map respose normalize forward. + * The data structure of image data is NCHW. + * + * \param[out] outputs output data. + * \param[in] denoms denoms buffer. + * \param[in] inputs input data. + * \param[in] numSamples batch size of input image. + * \param[in] channels number of channel. + * \param[in] height image height. + * \param[in] width image width. + * \param[in] size size. + * \param[in] scale scale. + * \param[in] pow scale. + * + */ +template +void CrossMapNormal(real* outputs, + real* denoms, + const real* inputs, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow); + +/** + * \brief Cross map respose normalize backward. + * The data structure of image data is NCHW. + * + * \param[out] inputsGrad input grad. + * \param[in] inputsValue input value. + * \param[out] outputsValue output value. + * \param[out] outputsGrad output grad. + * \param[in] denoms denoms buffer. + * \param[in] numSamples batch size of input image. + * \param[in] channels number of channel. + * \param[in] height image height. + * \param[in] width image width. + * \param[in] size size. + * \param[in] scale scale. + * \param[in] pow scale. 
+ * + */ +template +void CrossMapNormalGrad(real* inputsGrad, + const real* inputsValue, + const real* outputsValue, + const real* outputsGrad, + const real* denoms, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow); + +} // namespace paddle diff --git a/paddle/function/cross_map_normal_op_gpu.cu b/paddle/function/cross_map_normal_op_gpu.cu new file mode 100644 index 0000000000000..aae4f461b6f57 --- /dev/null +++ b/paddle/function/cross_map_normal_op_gpu.cu @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "hl_base.h" +#include "cross_map_normal_op.h" + +namespace paddle { + +__global__ void KeCMRNormFillScale(size_t imageSize, const real* in, + real* scale, size_t channels, + size_t height, size_t width, size_t size, + real alpha) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < imageSize) { + const int w = idx % width; + const int h = (idx / width) % height; + const int n = idx / width / height; + const int offset = (n * channels * height + h) * width + w; + + in += offset; + scale += offset; + const int step = height * width; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + + real accum = 0; + int index = 0; + while (index < channels + post_pad) { + if (index < channels) { + accum += in[index * step] * in[index * step]; + } + if (index >= size) { + accum -= in[(index - size) * step] * in[(index - size) * step]; + } + if (index >= post_pad) { + scale[(index - post_pad) * step] = 1. + accum * alpha; + } + ++index; + } + } +} + +__global__ void KeCMRNormOutput(size_t inputSize, const real* in, + const real* scale, real negative_beta, + real* out) { + const int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < inputSize) { + out[index] = in[index] * pow(scale[index], negative_beta); + } +} + +template <> +void CrossMapNormal(real* outputs, + real* denoms, + const real* inputs, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow) { + size_t imageSize = numSamples * height * width; + int blockSize = 1024; + int gridSize = (imageSize + 1024 - 1) / 1024; + KeCMRNormFillScale<<>> + (imageSize, inputs, denoms, channels, height, width, size, scale); + + size_t inputSize = numSamples * height * width *channels; + blockSize = 1024; + gridSize = (inputSize + 1024 - 1) / 1024; + KeCMRNormOutput<<>> + (inputSize, inputs, denoms, -pow, outputs); + + CHECK_SYNC("CrossMapNormal"); +} + +__global__ void KeCMRNormDiff(size_t imageSize, const 
real* bottom_data, + const real* top_data, const real* scale, + const real* top_diff, size_t channels, + size_t height, size_t width, size_t size, + real negative_beta, real cache_ratio, + real* bottom_diff ) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < imageSize) { + const int w = idx % width; + const int h = (idx / width) % height; + const int n = idx / width / height; + const int offset = (n * channels * height + h) * width + w; + bottom_data += offset; + top_data += offset; + scale += offset; + top_diff += offset; + bottom_diff += offset; + + const int step = height * width; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + + int index = 0; + real accum = 0; + while (index < channels + post_pad) { + if (index < channels) { + accum += top_diff[index * step] * top_data[index * step] / + scale[index * step]; + } + if (index >= size) { + accum -= top_diff[(index - size) * step] * + top_data[(index - size) * step] / scale[(index - size) * step]; + } + if (index >= post_pad) { + bottom_diff[(index - post_pad) * step] += + top_diff[(index - post_pad) * step] * + pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio * + bottom_data[(index - post_pad) * step] * accum; + } + ++index; + } + } +} + +template <> +void CrossMapNormalGrad(real* inputsGrad, + const real* inputsValue, + const real* outputsValue, + const real* outputsGrad, + const real* denoms, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow) { + size_t imageSize = numSamples * height * width; + + int blockSize = 1024; + int gridSize = (imageSize + 1024 - 1) / 1024; + KeCMRNormDiff <<>> + (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels, + height, width, size, -pow, 2.0f * pow * scale, inputsGrad); + CHECK_SYNC("CrossMapNormalGrad"); +} + +} // namespace paddle diff --git a/paddle/function/cross_map_normal_op_test.cpp 
b/paddle/function/cross_map_normal_op_test.cpp new file mode 100644 index 0000000000000..22692691bdb64 --- /dev/null +++ b/paddle/function/cross_map_normal_op_test.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "FunctionTest.h" + +TEST(CrossMapNormal, real) { + for (size_t numSamples : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t imgSizeH : {5, 33, 100}) { + for (size_t imgSizeW : {5, 32, 96}) { + for (size_t size : {1, 2, 3, 5, 7}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW + << " size=" << size; + + FunctionCompare compare("CrossMapNormal", + FuncConfig() + .set("size", size) + .set("scale", (real)1.5) + .set("pow", (real)0.5)); + Dims dims{numSamples, channels, imgSizeH, imgSizeW}; + compare.cmpWithArg({Tensor(nullptr, dims)}, + {Tensor(nullptr, dims), Tensor(nullptr, dims)}, + {}); + } + } + } + } + } +} + +TEST(CrossMapNormalGrad, real) { + for (size_t numSamples : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t imgSizeH : {5, 33, 100}) { + for (size_t imgSizeW : {5, 32, 96}) { + for (size_t size : {1, 2, 3, 5, 7}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW + << " size=" << size; + + FunctionCompare compare("CrossMapNormalGrad", + FuncConfig() + .set("size", size) + 
.set("scale", (real)1.5) + .set("pow", (real)0.5)); + Dims dims{numSamples, channels, imgSizeH, imgSizeW}; + compare.cmpWithArg({Tensor(nullptr, dims), + Tensor(nullptr, dims), + Tensor(nullptr, dims), + Tensor(nullptr, dims)}, + {Tensor(nullptr, dims)}, + {}); + } + } + } + } + } +} diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index a066f80c221ee..4f92150ec84d6 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -27,16 +27,12 @@ if(NOT WITH_GPU) list(REMOVE_ITEM GSERVER_HEADER layers/CudnnConvLayer.h layers/CudnnPoolLayer.h - layers/CudnnBatchNormLayer.h - layers/NormProjectionLayer.h - layers/NormLayer.h) + layers/CudnnBatchNormLayer.h) list(REMOVE_ITEM GSERVER_SOURCES layers/CudnnConvLayer.cpp layers/CudnnPoolLayer.cpp - layers/CudnnBatchNormLayer.cpp - layers/NormProjectionLayer.cpp - layers/NormLayer.cpp) + layers/CudnnBatchNormLayer.cpp) compile_cu_as_cpp(layers/LstmCompute.cu) compile_cu_as_cpp(layers/GruCompute.cu) endif() diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp index 2f9928191170a..ae7508e2bb117 100644 --- a/paddle/gserver/evaluators/Evaluator.cpp +++ b/paddle/gserver/evaluators/Evaluator.cpp @@ -78,7 +78,7 @@ class ClassificationErrorEvaluator : public Evaluator { useGpu(arguments[0].deviceId)); errorMat->zeroMem(); if (label != nullptr) { - errorMat->classificationError(output, label); + errorMat->classificationError(*output, *label); } else if (dynamic_cast(multiBinaryLabel.get()) || dynamic_cast(multiBinaryLabel.get())) { errorMat->classificationErrorMulti( diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp index 7ac56e3a2ab2a..51c0ae5cc9523 100644 --- a/paddle/gserver/layers/ContextProjection.cpp +++ b/paddle/gserver/layers/ContextProjection.cpp @@ -90,8 +90,8 @@ void ContextProjection::forward() { REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str()); bool isPadding = 
config_.trainable_padding(); out_->value->contextProjectionForward( - in_->value, - state_ ? state_ : isPadding ? weight_->getW() : nullptr, + *(in_->value), + state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr, *startPositions, config_.context_length(), config_.context_start(), @@ -128,8 +128,8 @@ void ContextProjection::backward(const UpdateCallback& callback) { bool isPadding = config_.trainable_padding(); if (!out_->grad->useGpu()) { out_->grad->contextProjectionBackward( - in_->grad, - isPadding ? weight_->getWGrad() : nullptr, + in_->grad.get(), + isPadding ? weight_->getWGrad().get() : nullptr, *startPositions, config_.context_length(), config_.context_start(), @@ -137,7 +137,7 @@ void ContextProjection::backward(const UpdateCallback& callback) { isPadding); } else { if (in_->grad) { - out_->grad->contextProjectionBackwardData(in_->grad, + out_->grad->contextProjectionBackwardData(*(in_->grad), *startPositions, config_.context_length(), config_.context_start()); @@ -145,7 +145,7 @@ void ContextProjection::backward(const UpdateCallback& callback) { if (isPadding && weight_->getWGrad()) { out_->grad->contextProjectionBackwardWeight( - weight_->getWGrad(), + *(weight_->getWGrad()), *startPositions, config_.context_length(), config_.context_start(), diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp index 3f4d77a2fe069..ed57f2af3c645 100644 --- a/paddle/gserver/layers/ConvexCombinationLayer.cpp +++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp @@ -113,7 +113,7 @@ void ConvexCombinationLayer::forward(PassType passType) { tmpRow0->setData(inV0->getData() + i * weightDim); tmpRow1->setData(outV->getData() + i * dataDim); - tmpRow1->mul(tmpRow0, tmpMtx0, 1, 0); + tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0); } } @@ -136,7 +136,7 @@ void ConvexCombinationLayer::backward(const UpdateCallback& callback) { tmpRow1->setData(outG->getData() + i * dataDim); tmpMtx0->setData(inV1->getData() + i * 
weightDim * dataDim); - tmpRow0->mul(tmpRow1, tmpMtx0->getTranspose(), 1, 1); + tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1); } } @@ -146,7 +146,7 @@ void ConvexCombinationLayer::backward(const UpdateCallback& callback) { tmpRow1->setData(outG->getData() + i * dataDim); tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim); - tmpMtx0->mul(tmpRow0->getTranspose(), tmpRow1, 1, 1); + tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1); } } } diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp index 25948747fe93e..9ddccc202705c 100644 --- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp +++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp @@ -150,7 +150,7 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image, Matrix::create(wgtData, subM, subK, false, useGpu_); // mark transpose MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_); MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_); - C->mul(A, B, 1, 1); + C->mul(*A, *B, 1, 1); A->clear(); B->clear(); @@ -185,7 +185,7 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out, MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_); MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_); MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_); - C->mul(A, B); // mul + C->mul(*A, *B); // mul // clear the temporary matrix A->clear(); @@ -252,7 +252,7 @@ void ExpandConvBaseLayer::bpropWeights(MatrixPtr image, MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_); MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_); MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_); - C->mul(B, A, 1, 1); + C->mul(*B, *A, 1, 1); A->clear(); B->clear(); diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/gserver/layers/FullMatrixProjection.cpp index 9e72a33a3c6f4..b8b6f403d6a02 100644 --- 
a/paddle/gserver/layers/FullMatrixProjection.cpp +++ b/paddle/gserver/layers/FullMatrixProjection.cpp @@ -28,7 +28,7 @@ FullMatrixProjection::FullMatrixProjection(const ProjectionConfig& config, void FullMatrixProjection::forward() { REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); - out_->value->mul(in_->value, weight_->getW(), 1, 1); + out_->value->mul(*(in_->value), *(weight_->getW()), 1, 1); } void FullMatrixProjection::backward(const UpdateCallback& callback) { @@ -37,7 +37,8 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) { /* Calculate the W-gradient for the current layer */ if (weight_->getWGrad()) { REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); - weight_->getWGrad()->mul(in_->value->getTranspose(), out_->grad, 1, 1); + weight_->getWGrad()->mul( + *(in_->value->getTranspose()), *(out_->grad), 1, 1); } // If callback does not change value, backward propagation error @@ -47,7 +48,7 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) { /* Calculate the input layers error */ if (in_->grad) { REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - in_->grad->mul(out_->grad, weight_->getW()->getTranspose(), 1, 1); + in_->grad->mul(*(out_->grad), *(weight_->getW()->getTranspose()), 1, 1); } hl_set_sync_flag(syncFlag); diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp index 89afe33c36697..d8a667ff8dc02 100644 --- a/paddle/gserver/layers/FullyConnectedLayer.cpp +++ b/paddle/gserver/layers/FullyConnectedLayer.cpp @@ -84,8 +84,8 @@ void FullyConnectedLayer::forward(PassType passType) { auto input = getInput(i); CHECK(input.value) << "The input of 'fc' layer must be matrix"; REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); - i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0) - : outV->mul(input.value, weights_[i]->getW(), 1, 1); + i == 0 ? 
outV->mul(*input.value, *weights_[i]->getW(), 1, 0) + : outV->mul(*input.value, *weights_[i]->getW(), 1, 1); } /* add the bias-vector */ @@ -123,7 +123,7 @@ void FullyConnectedLayer::backward(const UpdateCallback& callback) { MatrixPtr oGrad = getOutputGrad(); { REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); - weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1); + weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1); } } @@ -136,7 +136,7 @@ void FullyConnectedLayer::backward(const UpdateCallback& callback) { if (NULL != preGrad) { MatrixPtr weights_T = weights_[i]->getW()->getTranspose(); REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - preGrad->mul(getOutputGrad(), weights_T, 1, 1); + preGrad->mul(*getOutputGrad(), *weights_T, 1, 1); } hl_set_sync_flag(syncFlag); diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h index 172e558b82945..6dfd48fb96618 100644 --- a/paddle/gserver/layers/Layer.h +++ b/paddle/gserver/layers/Layer.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include "ModelConfig.pb.h" +#include "paddle/function/Function.h" #include "paddle/math/CpuSparseMatrix.h" #include "paddle/parameter/Parameter.h" #include "paddle/utils/ClassRegistrar.h" @@ -100,6 +101,11 @@ class Layer { /// Mark input grad in(true) or out(false) of backward function. std::vector markInBackward_; + /// Layer forward function + std::vector> forward_; + /// Layer backward function + std::vector> backward_; + public: /** * Wait until all input value ready. @@ -126,6 +132,26 @@ class Layer { virtual void markAllInputGrad(); protected: + /** + * Create layer function. Function is called in forward or backward. 
+ * \param function, Layer::forward_ or Layer::backward_ + * \param name, function name + * \param config, initialization configuration for the function + */ + void createFunction(std::vector>& function, + const std::string& name, + const FuncConfig& config) { + if (useGpu_) { + function.emplace_back( + FunctionBase::funcRegistrar_.createByType(name + "-GPU")); + } else { + function.emplace_back( + FunctionBase::funcRegistrar_.createByType(name + "-CPU")); + } + auto& func = function.back(); + func->init(config); + } + /** * Notify specified layer the output grad ready. * Called in the backward function. diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp index af550c7a01548..b7f748f3bb8a4 100644 --- a/paddle/gserver/layers/LinearChainCRF.cpp +++ b/paddle/gserver/layers/LinearChainCRF.cpp @@ -59,7 +59,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) { matX->rowMax(*maxX_); expX_->assign(*matX); // subtract max to avoid overflow or underflow - expX_->mul(maxX_, ones_, (real)-1, (real)1); + expX_->mul(*maxX_, *ones_, (real)-1, (real)1); expX_->exp2(); real* a = a_->getData(); diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp index 2543d1b49a801..01cc5fec8b970 100644 --- a/paddle/gserver/layers/LstmLayer.cpp +++ b/paddle/gserver/layers/LstmLayer.cpp @@ -316,7 +316,7 @@ void LstmLayer::forwardSequence(int batchSize, } if (prevOutput_) { frameGate->setData(lstmValue.gateValue); - frameGate->mul(prevOutput_, weight_->getW(), 1, 1); + frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1); } } AsyncGpuBlock asyncGpuBlock; @@ -338,7 +338,7 @@ void LstmLayer::forwardSequence(int batchSize, frameOutput->setData(lstmValue.outputValue); nextFrame(reversed_, getSize()); frameGate->setData(lstmValue.gateValue); - frameGate->mul(frameOutput, weight_->getW(), 1, 1); + frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); } } if (n != numSequences - 1) { @@ -348,7 +348,7 @@ void 
LstmLayer::forwardSequence(int batchSize, if (!reversed_) { if (!prevState_) lstmValue.prevStateValue = nullptr; if (prevOutput_) { - frameGate->mul(frameOutput, weight_->getW(), 1, 1); + frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); } } else { lstmValue.prevStateValue = nullptr; @@ -470,7 +470,7 @@ void LstmLayer::backwardSequence(int batchSize, frameGate->setData(lstmGrad.gateGrad); nextFrame(reversed_, getSize()); frameOutput->setData(lstmGrad.outputGrad); - frameOutput->mul(frameGate, weightT, 1, 1); + frameOutput->mul(*frameGate, *weightT, 1, 1); } else { nextFrame(reversed_, getSize()); } @@ -479,14 +479,14 @@ void LstmLayer::backwardSequence(int batchSize, if (weight_->getWGrad()) { if (!reversed_) { weight_->getWGrad()->mul( - output_.value->subMatrix(start, length - 1)->getTranspose(), - gate_.grad->subMatrix(start + 1, length - 1), + *output_.value->subMatrix(start, length - 1)->getTranspose(), + *gate_.grad->subMatrix(start + 1, length - 1), 1, 1); } else { weight_->getWGrad()->mul( - output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - gate_.grad->subMatrix(start, length - 1), + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *gate_.grad->subMatrix(start, length - 1), 1, 1); } @@ -541,7 +541,7 @@ void LstmLayer::forwardBatch(int batchSize, if (n != 0) { MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize); - gateValue->mul(batch1, weight_->getW(), 1, 1); + gateValue->mul(*batch1, *weight_->getW(), 1, 1); } else if (prevOutput_) { Matrix::resizeOrCreate(prevBatchOutput2_, gateValue->getHeight(), @@ -549,7 +549,7 @@ void LstmLayer::forwardBatch(int batchSize, false, useGpu_); batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_); - gateValue->mul(prevBatchOutput2_, weight_->getW(), 1, 1); + gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1); batchValue_->prevOutput2Batch(*prevState_, *totalState_->subMatrix(0, numSequences)); @@ -672,16 +672,16 @@ void LstmLayer::backwardBatch(int 
batchSize, if (n != 0) { MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize); - tmp->mul(gateGrad, weightT, 1, 1); + tmp->mul(*gateGrad, *weightT, 1, 1); } if (n != 0 && weight_->getWGrad()) { /* backward weight */ MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize); - weight_->getWGrad()->mul(outputValue->getTranspose(), gateGrad, 1, 1); + weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1); } else if (prevOutput_ && weight_->getWGrad()) { weight_->getWGrad()->mul( - prevBatchOutput2_->getTranspose(), gateGrad, 1, 1); + *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1); } } } diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp index 1243c12889542..fb41af5631954 100644 --- a/paddle/gserver/layers/MDLstmLayer.cpp +++ b/paddle/gserver/layers/MDLstmLayer.cpp @@ -547,7 +547,7 @@ void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) { if (coordIter.getPrePos(delays_, i, prePos)) { int preOffset = coordIter.offset(prePos); frameGate_[start + offset].value->mul( - frameOutput_[start + preOffset].value, weight_->getW(), 1.0, 1.0); + *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0); } } forwardGate2OutputSequence(start, coordIter); @@ -747,11 +747,11 @@ void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) { if (coordIter.getPrePos(delays_, i, prePos)) { int preOffset = coordIter.offset(prePos); frameOutput_[start + preOffset].grad->mul( - frameGate_[start + offset].grad, weightT, 1.0, 1.0); + *frameGate_[start + offset].grad, *weightT, 1.0, 1.0); if (weight_->getWGrad()) { weight_->getWGrad()->mul( - frameOutput_[start + preOffset].value->getTranspose(), - frameGate_[start + offset].grad, + *frameOutput_[start + preOffset].value->getTranspose(), + *frameGate_[start + offset].grad, 1.0, 1.0); } diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h index 86255b231b1ee..011bab8fdedab 100644 --- 
a/paddle/gserver/layers/NormLayer.h +++ b/paddle/gserver/layers/NormLayer.h @@ -50,7 +50,7 @@ class NormLayer : public Layer { class ResponseNormLayer : public NormLayer { protected: size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_; - float scale_, pow_; + real scale_, pow_; MatrixPtr denoms_; public: diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index 934fc31e0acf9..262d757c67e10 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -45,6 +45,15 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap, /* the size of inputs for norm-layer is 1 */ CHECK_EQ(config_.inputs_size(), 1); + createFunction( + forward_, + "CrossMapNormal", + FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_)); + createFunction( + backward_, + "CrossMapNormalGrad", + FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_)); + return true; } @@ -54,7 +63,7 @@ void CMRProjectionNormLayer::forward(PassType passType) { /* malloc memory for the output_ if necessary */ /* note: one sample correspond to one row */ MatrixPtr input = inputLayers_[0]->getOutputValue(); - int batchSize = input->getHeight(); + size_t batchSize = input->getHeight(); int size = getSize(); resetOutput(batchSize, size); @@ -62,10 +71,11 @@ void CMRProjectionNormLayer::forward(PassType passType) { Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_); - denoms_->zeroMem(); - - outV->crossMapNormalFwd( - *input, imgSizeH_, imgSizeW_, *denoms_, channels_, size_, scale_, pow_); + dims_ = {batchSize, channels_, imgSizeH_, imgSizeW_}; + forward_[0]->calc( + {Tensor(input->getData(), dims_)}, + {Tensor(outV->getData(), dims_), Tensor(denoms_->getData(), dims_)}, + {}); } void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { @@ -80,15 +90,11 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { 
MatrixPtr localOutV = getOutputValue(); MatrixPtr preOutV = inputLayers_[0]->getOutputValue(); - preOutGrad->crossMapNormalBwd(*localGrad, - *denoms_, - *preOutV, - *localOutV, - channels_, - imgSizeH_, - imgSizeW_, - size_, - scale_, - pow_); + backward_[0]->calc({Tensor(preOutV->getData(), dims_), + Tensor(localOutV->getData(), dims_), + Tensor(localGrad->getData(), dims_), + Tensor(denoms_->getData(), dims_)}, + {Tensor(preOutGrad->getData(), dims_)}, + {}); } } // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h index 4f7b638334afe..6b2c5dde0d74d 100644 --- a/paddle/gserver/layers/NormProjectionLayer.h +++ b/paddle/gserver/layers/NormProjectionLayer.h @@ -39,5 +39,8 @@ class CMRProjectionNormLayer : public ResponseNormLayer { bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); void forward(PassType passType); void backward(const UpdateCallback& callback = nullptr); + +protected: + Dims dims_; }; } // namespace paddle diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp index cf9a008318e9d..b606e4436567e 100644 --- a/paddle/gserver/layers/OuterProdLayer.cpp +++ b/paddle/gserver/layers/OuterProdLayer.cpp @@ -96,7 +96,7 @@ void OuterProdLayer::forward(PassType passType) { tmpRow0->setData(inV0->getData() + i * dim0); tmpRow1->setData(inV1->getData() + i * dim1); - tmpMtx0->mul(tmpRow0->getTranspose(), tmpRow1); + tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1); } } } @@ -121,7 +121,7 @@ void OuterProdLayer::backward(const UpdateCallback& callback) { tmpRow0->setData(inG0->getData() + i * dim0); tmpRow1->setData(inV1->getData() + i * dim1); - tmpRow0->mul(tmpRow1, tmpMtx0->getTranspose(), 1, 1); + tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1); } } @@ -131,7 +131,7 @@ void OuterProdLayer::backward(const UpdateCallback& callback) { tmpRow0->setData(inV0->getData() + i * dim0); tmpRow1->setData(inG1->getData() + i * dim1); - 
tmpRow1->mul(tmpRow0, tmpMtx0, 1, 1); + tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1); } } } diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp new file mode 100644 index 0000000000000..36ace7597cd66 --- /dev/null +++ b/paddle/gserver/layers/PriorBox.cpp @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/math/BaseMatrix.h" +#include "paddle/math/Matrix.h" + +namespace paddle { +/** + * @brief A layer for generating priorbox locations and variances. + * - Input: Two and only two input layer are accepted. The input layer must be + * be a data output layer and a convolution output layer. + * - Output: The priorbox locations and variances of the input data. + * Reference: + * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, + * Cheng-Yang Fu, Alexander C. Berg. 
SSD: Single Shot MultiBox Detector + */ + +class PriorBoxLayer : public Layer { +public: + explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {} + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback) {} + +protected: + int numPriors_; + std::vector minSize_; + std::vector maxSize_; + std::vector aspectRatio_; + std::vector variance_; + MatrixPtr buffer_; +}; + +bool PriorBoxLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + auto pbConf = config_.inputs(0).priorbox_conf(); + std::copy(pbConf.min_size().begin(), + pbConf.min_size().end(), + std::back_inserter(minSize_)); + std::copy(pbConf.max_size().begin(), + pbConf.max_size().end(), + std::back_inserter(maxSize_)); + std::copy(pbConf.aspect_ratio().begin(), + pbConf.aspect_ratio().end(), + std::back_inserter(aspectRatio_)); + std::copy(pbConf.variance().begin(), + pbConf.variance().end(), + std::back_inserter(variance_)); + // flip + int inputRatioLength = aspectRatio_.size(); + for (int index = 0; index < inputRatioLength; index++) + aspectRatio_.push_back(1 / aspectRatio_[index]); + aspectRatio_.push_back(1.); + numPriors_ = aspectRatio_.size(); + if (maxSize_.size() > 0) numPriors_++; + return true; +} + +void PriorBoxLayer::forward(PassType passType) { + Layer::forward(passType); + auto input = getInput(0); + int layerWidth = input.getFrameWidth(); + int layerHeight = input.getFrameHeight(); + + auto image = getInput(1); + int imageWidth = image.getFrameWidth(); + int imageHeight = image.getFrameHeight(); + + real stepW = static_cast(imageWidth) / layerWidth; + real stepH = static_cast(imageHeight) / layerHeight; + int dim = layerHeight * layerWidth * numPriors_ * 4; + reserveOutput(1, dim * 2); + // use a cpu buffer to compute + Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false); + auto* tmpPtr = buffer_->getData(); + + 
int idx = 0; + for (int h = 0; h < layerHeight; ++h) { + for (int w = 0; w < layerWidth; ++w) { + real centerX = (w + 0.5) * stepW; + real centerY = (h + 0.5) * stepH; + int minSize = 0; + for (size_t s = 0; s < minSize_.size(); s++) { + // first prior. + minSize = minSize_[s]; + int boxWidth = minSize; + int boxHeight = minSize; + // xmin, ymin, xmax, ymax. + tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; + tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; + // set the variance. + for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; + + if (maxSize_.size() > 0) { + CHECK_EQ(minSize_.size(), maxSize_.size()); + // second prior. + for (size_t s = 0; s < maxSize_.size(); s++) { + int maxSize = maxSize_[s]; + boxWidth = boxHeight = sqrt(minSize * maxSize); + tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; + tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; + // set the variance. + for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; + } + } + } + // rest of priors. + for (size_t r = 0; r < aspectRatio_.size(); r++) { + real ar = aspectRatio_[r]; + if (fabs(ar - 1.) < 1e-6) continue; + real boxWidth = minSize * sqrt(ar); + real boxHeight = minSize / sqrt(ar); + tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; + tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; + // set the variance. 
+ for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; + } + } + } + // clip the prior's coordidate such that it is within [0, 1] + for (int d = 0; d < dim * 2; ++d) + if ((d % 8) < 4) + tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.); + MatrixPtr outV = getOutputValue(); + outV->copyFrom(buffer_->data_, dim * 2); +} +REGISTER_LAYER(priorbox, PriorBoxLayer); + +} // namespace paddle diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index 85812c9d660e0..94b16996a86d2 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -215,12 +215,12 @@ void RecurrentLayer::forwardSequence(int batchSize, void RecurrentLayer::forwardOneSequence(int start, int length) { if (!reversed_) { if (prevOutput_) { - frameOutput_[start].value->mul(prevOutput_, weight_->getW(), 1, 1); + frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); } activation_->forward(frameOutput_[start]); for (int i = 1; i < length; ++i) { frameOutput_[start + i].value->mul( - frameOutput_[start + i - 1].value, weight_->getW(), 1, 1); + *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); activation_->forward(frameOutput_[start + i]); } if (prevOutput_) { @@ -230,7 +230,7 @@ void RecurrentLayer::forwardOneSequence(int start, int length) { activation_->forward(frameOutput_[start + length - 1]); for (int i = length - 2; i >= 0; --i) { frameOutput_[start + i].value->mul( - frameOutput_[start + i + 1].value, weight_->getW(), 1, 1); + *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); activation_->forward(frameOutput_[start + i]); } } @@ -282,13 +282,13 @@ void RecurrentLayer::backwardOneSequence(int start, int length) { for (int i = length - 1; i > 0; --i) { activation_->backward(frameOutput_[start + i]); frameOutput_[start + i - 1].grad->mul( - frameOutput_[start + i].grad, weightT, 1, 1); + *frameOutput_[start + i].grad, *weightT, 1, 1); } 
activation_->backward(frameOutput_[start]); if (weight_->getWGrad()) { weight_->getWGrad()->mul( - output_.value->subMatrix(start, length - 1)->getTranspose(), - output_.grad->subMatrix(start + 1, length - 1), + *output_.value->subMatrix(start, length - 1)->getTranspose(), + *output_.grad->subMatrix(start + 1, length - 1), 1, 1); } @@ -296,13 +296,13 @@ void RecurrentLayer::backwardOneSequence(int start, int length) { for (int i = 0; i < length - 1; ++i) { activation_->backward(frameOutput_[start + i]); frameOutput_[start + i + 1].grad->mul( - frameOutput_[start + i].grad, weightT, 1, 1); + *frameOutput_[start + i].grad, *weightT, 1, 1); } activation_->backward(frameOutput_[start + length - 1]); if (weight_->getWGrad()) { weight_->getWGrad()->mul( - output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - output_.grad->subMatrix(start, length - 1), + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *output_.grad->subMatrix(start, length - 1), 1, 1); } @@ -329,7 +329,7 @@ void RecurrentLayer::forwardBatch(int batchSize, if (n != 0) { MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); - batch2->mul(batch1, weight_->getW(), 1, 1); + batch2->mul(*batch1, *weight_->getW(), 1, 1); } Argument arg; arg.value = batch2; @@ -367,14 +367,14 @@ void RecurrentLayer::backwardBatch(int batchSize, if (n != 0) { batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); - batch1->mul(batch2, weightT, 1, 1); + batch1->mul(*batch2, *weightT, 1, 1); } if (backwardByBatch && weight_->getWGrad()) { if (n != 0) { /* backward weight */ batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); - weight_->getWGrad()->mul(batch1->getTranspose(), batch2, 1, 1); + weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); } } } @@ -389,14 +389,14 @@ void RecurrentLayer::backwardBatch(int batchSize, int len = starts[seq + 1] - starts[seq]; if (!reversed_) { weight_->getWGrad()->mul( - output_.value->subMatrix(starts[seq], 
len - 1)->getTranspose(), - output_.grad->subMatrix(starts[seq] + 1, len - 1), + *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq] + 1, len - 1), 1, 1); } else { weight_->getWGrad()->mul( - output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), - output_.grad->subMatrix(starts[seq], len - 1), + *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq], len - 1), 1, 1); } diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp index 9200a01eee3be..5eacff6b71439 100644 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp +++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp @@ -155,20 +155,20 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { // manully compute the multiplication of // the input vector and the selected rows. REGISTER_TIMER("selective.plain"); - interOutput_->mul(input, weight->getTranspose(), 1, scaleT); + interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT); } else { // if the indecies is not sparse enough, // use full mul instead REGISTER_TIMER("selective.mul"); if (fullOutput_) { - interOutput_->mul(input, weight->getTranspose(), 1, scaleT); + interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT); } else { Matrix::resizeOrCreate(mmat_, hsize, wsize, /*trans=*/false, /*useGpu=*/useGpu_); - mmat_->mul(input, weight->getTranspose()); + mmat_->mul(*input, *weight->getTranspose()); interOutput_->add3(mmat_); } } @@ -242,14 +242,14 @@ void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) { MatrixPtr preGrad = getInputGrad(i); if (preGrad) { REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - preGrad->mul(interOutGrad_, weights_[i]->getW(), 1, 1); + preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1); } MatrixPtr wGrad = weights_[i]->getWGrad(); if (wGrad) { REGISTER_TIMER_INFO("GradMulTimer", 
getName().c_str()); MatrixPtr input = getInputValue(i); - wGrad->mul(interOutGrad_->getTranspose(), input, 1, 1); + wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1); } { diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/gserver/layers/TensorLayer.cpp index 642eb1bdd31c0..5be88d7c05dae 100644 --- a/paddle/gserver/layers/TensorLayer.cpp +++ b/paddle/gserver/layers/TensorLayer.cpp @@ -77,7 +77,7 @@ void TensorLayer::forward(PassType passType) { REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str()); for (size_t i = 0; i < getSize(); ++i) { MatrixPtr weights = weights_[i]->getW(); - tmpMat->mul(input1, weights, 1, 0); + tmpMat->mul(*input1, *weights, 1, 0); outV->rowDotMul(i, *tmpMat, *input2); } } @@ -112,7 +112,7 @@ void TensorLayer::backward(const UpdateCallback& callback) { if (weights_[i]->getWGrad()) { tmpMat->rowScale(i, *input1, *oGrad); MatrixPtr input1_T = tmpMat->getTranspose(); - weights_[i]->getWGrad()->mul(input1_T, input2, 1, 1); + weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1); } } } @@ -130,11 +130,11 @@ void TensorLayer::backward(const UpdateCallback& callback) { if (NULL != preGrad1) { /* (grad * e2) * trans(W) */ tmpMat->rowScale(i, *input2, *oGrad); MatrixPtr weights_T = weights->getTranspose(); - preGrad1->mul(tmpMat, weights_T, 1, 1); + preGrad1->mul(*tmpMat, *weights_T, 1, 1); } if (NULL != preGrad2) { /* (grad * e1) * W */ tmpMat->rowScale(i, *input1, *oGrad); - preGrad2->mul(tmpMat, weights, 1, 1); + preGrad2->mul(*tmpMat, *weights, 1, 1); } } } diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp index 3f7ff04882075..2a12499e5b5f1 100644 --- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp +++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp @@ -46,7 +46,7 @@ TransposedFullMatrixProjection::TransposedFullMatrixProjection( void TransposedFullMatrixProjection::forward() { REGISTER_TIMER_INFO("FwMulTimer", 
getName().c_str()); - out_->value->mul(in_->value, weight_->getW()->getTranspose(), 1, 1); + out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1); } void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) { @@ -55,7 +55,8 @@ void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) { /* Calculate the W-gradient for the current layer */ if (weight_->getWGrad()) { REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); - weight_->getWGrad()->mul(out_->grad->getTranspose(), in_->value, 1, 1); + weight_->getWGrad()->mul( + *(out_->grad->getTranspose()), *(in_->value), 1, 1); } // If callback does not change value, backprop error asynchronously so that @@ -69,7 +70,7 @@ void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) { /* Calculate the input layers error */ if (in_->grad) { REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - in_->grad->mul(out_->grad, weight_->getW(), 1, 1); + in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1); } hl_set_sync_flag(syncFlag); diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 34dc375f21a54..c26a2a7f06bc1 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -34,6 +34,14 @@ add_unittest_without_exec(test_ConvTrans add_test(NAME test_ConvTrans COMMAND test_ConvTrans) +################# test_PriorBox ####################### +add_unittest_without_exec(test_PriorBox + test_PriorBox.cpp + LayerGradUtil.cpp + TestUtil.cpp) + +add_test(NAME test_PriorBox + COMMAND test_PriorBox) ################# test_ConvUnify ####################### add_unittest_without_exec(test_ConvUnify test_ConvUnify.cpp diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 1d5e7de1ba624..57c176810fddf 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -303,13 +303,31 @@ void initDataLayer(TestConfig 
testConf, ICpuGpuVectorPtr sequenceStartPositions; ICpuGpuVectorPtr subSequenceStartPositions; IVectorPtr cpuSequenceDims; - for (size_t i = 0; i < testConf.inputDefs.size(); i++) { + for (size_t i = 0; i < testConf.inputDefs.size(); ++i) { + if (testConf.inputDefs[i].inputType != INPUT_SEQUENCE_LABEL) continue; + + const std::vector& labelSeqStartPositions = + testConf.inputDefs[i].labelSeqStartPositions; + if (labelSeqStartPositions.size() != 0) { + CHECK(!sequenceStartPositions); + CHECK_GE(labelSeqStartPositions.size(), 2); + + sequenceStartPositions = + ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu); + sequenceStartPositions->copyFrom( + labelSeqStartPositions.data(), labelSeqStartPositions.size(), useGpu); + } + } + + for (size_t i = 0; i < testConf.inputDefs.size(); ++i) { LayerConfig config; config.set_name(testConf.inputDefs[i].name); config.set_type("data"); config.set_size(testConf.inputDefs[i].dim); LayerPtr layer = LayerPtr(new DataLayer(config)); - size_t numSequence = batchSize / 10 + 1; + size_t numSequence = sequenceStartPositions + ? 
sequenceStartPositions->getSize() - 1 + : batchSize / 10 + 1; Argument data; auto fillData = [&](bool trans, int height, int width) { @@ -336,9 +354,17 @@ void initDataLayer(TestConfig testConf, break; case INPUT_LABEL: case INPUT_SEQUENCE_LABEL: - data.ids = VectorT::create(batchSize, useGpu); - // now rand number can be 0 to inputDefs[i].dim - data.ids->rand(testConf.inputDefs[i].dim); + if (testConf.inputDefs[i].labelInitValue.size() != 0) { + const std::vector& labelInitValue = + testConf.inputDefs[i].labelInitValue; + CHECK_EQ(labelInitValue.size(), batchSize); + data.ids = VectorT::create(batchSize, useGpu); + data.ids->copyFrom(labelInitValue.data(), batchSize); + } else { + data.ids = VectorT::create(batchSize, useGpu); + // now rand number can be 0 to inputDefs[i].dim + data.ids->rand(testConf.inputDefs[i].dim); + } break; case INPUT_SPARSE_NON_VALUE_DATA: data.value = makeRandomSparseMatrix( diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 62ac2d160fd91..4e88ac0e81ef2 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -64,6 +64,9 @@ struct InputDef { size_t paraSize; ParaSparse sparse; bool isStatic; + std::vector labelInitValue; + std::vector labelSeqStartPositions; + InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { inputType = type; name = nameIn; @@ -72,6 +75,23 @@ struct InputDef { sparse = {""}; isStatic = false; } + + InputDef(InputType type, + string nameIn, + size_t dimIn, + size_t sizeIn, + const std::vector& labelInitValue, + const std::vector& labelSeqStartPositions) + : labelInitValue(labelInitValue), + labelSeqStartPositions(labelSeqStartPositions) { + inputType = type; + name = nameIn; + dim = dimIn; + paraSize = sizeIn; + sparse = {""}; + isStatic = false; + } + InputDef(InputType type, string nameIn, size_t dimIn, diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp index 
99202c2d5702a..dd3378304b433 100644 --- a/paddle/gserver/tests/test_ConvTrans.cpp +++ b/paddle/gserver/tests/test_ConvTrans.cpp @@ -206,8 +206,8 @@ TEST(Layer, convTransLayerFwd2) { /* filter_size */ 5, result); - float resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, - 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; result->setData(resultData); doOneConvtTest(/* imgSize */ 5, /* output_x */ 2, @@ -216,8 +216,8 @@ TEST(Layer, convTransLayerFwd2) { /* filter_size */ 4, result); - float resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, - 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; result->setData(resultData2); doOneConvtTest(/* imgSize */ 5, /* output_x */ 2, @@ -226,8 +226,8 @@ TEST(Layer, convTransLayerFwd2) { /* filter_size */ 5, result); - float resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, - 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; + real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, + 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; result->setData(resultData3); doOneConvtTest(/* imgSize */ 5, /* output_x */ 2, diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp index 2ab18f886848d..ad99b50245cf5 100644 --- a/paddle/gserver/tests/test_ConvUnify.cpp +++ b/paddle/gserver/tests/test_ConvUnify.cpp @@ -106,8 +106,8 @@ TEST(Layer, convParaUnified) { #ifndef PADDLE_ONLY_CPU MatrixPtr input, resultCpu, resultGpu; input = Matrix::create(1, 4 * 4, false, false); - float inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; - float param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; input->setData(inputData); @@ -137,26 +137,9 @@ TEST(Layer, 
convParaUnified) { checkMatrixEqual(resultCpu, resultGpu); input = Matrix::create(1, 3 * 3 * 2, false, false); - float inputData2[] = {1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18}; - float param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1}; + real inputData2[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1}; input->setData(inputData2); @@ -185,7 +168,7 @@ TEST(Layer, convParaUnified) { true); checkMatrixEqual(resultCpu, resultGpu); - float param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; + real param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; resultCpu = doOneConvTest(/* imgSize */ 3, /* output_x */ 2, diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 8a8d094ed357a..2cc25f6b211e3 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1021,11 +1021,10 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) { testLayerGrad(config, "norm", 100, trans, useGpu); } -#ifndef PADDLE_ONLY_CPU TEST(Layer, NormLayer) { testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true); + testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false); } -#endif void setPoolConfig(TestConfig* config, PoolConfig* pool, diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp new file mode 100644 index 0000000000000..a6d6a24269663 --- /dev/null +++ b/paddle/gserver/tests/test_PriorBox.cpp @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "LayerGradUtil.h" +#include "TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +// Do one forward pass of priorBox layer and check to see if its output +// matches the given result +void doOnePriorBoxTest(size_t feature_map_width, + size_t feature_map_height, + size_t image_width, + size_t image_height, + vector min_size, + vector max_size, + vector aspect_ratio, + vector variance, + bool use_gpu, + MatrixPtr& result) { + // Setting up the priorbox layer + TestConfig configt; + configt.layerConfig.set_type("priorbox"); + + configt.inputDefs.push_back({INPUT_DATA, "featureMap", 1, 0}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + configt.inputDefs.push_back({INPUT_DATA, "image", 1, 0}); + configt.layerConfig.add_inputs(); + PriorBoxConfig* pb = input->mutable_priorbox_conf(); + for (size_t i = 0; i < min_size.size(); i++) pb->add_min_size(min_size[i]); + for (size_t i = 0; i < max_size.size(); i++) pb->add_max_size(max_size[i]); + for (size_t i = 0; i < variance.size(); i++) pb->add_variance(variance[i]); + for (size_t i = 0; i < aspect_ratio.size(); i++) + pb->add_aspect_ratio(aspect_ratio[i]); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu); + dataLayers[0]->getOutput().setFrameHeight(feature_map_height); + dataLayers[0]->getOutput().setFrameWidth(feature_map_width); + dataLayers[1]->getOutput().setFrameHeight(image_height); + 
dataLayers[1]->getOutput().setFrameWidth(image_width); + + // test layer initialize + std::vector parameters; + LayerPtr priorboxLayer; + initTestLayer(configt, &layerMap, ¶meters, &priorboxLayer); + priorboxLayer->forward(PASS_GC); + checkMatrixEqual(priorboxLayer->getOutputValue(), result); +} + +TEST(Layer, priorBoxLayerFwd) { + vector minSize; + vector maxSize; + vector aspectRatio; + vector variance; + bool useGpu = false; + + minSize.push_back(276); + maxSize.push_back(330); + variance.push_back(0.1); + variance.push_back(0.1); + variance.push_back(0.2); + variance.push_back(0.2); + + // CPU case 1. + MatrixPtr result; + real resultData[] = {0.04, + 0.04, + 0.96, + 0.96, + 0.1, + 0.1, + 0.2, + 0.2, + 0, + 0, + 1, + 1, + 0.1, + 0.1, + 0.2, + 0.2}; + result = Matrix::create(1, 2 * 8, false, useGpu); + result->setData(resultData); + doOnePriorBoxTest(/* feature_map_width */ 1, + /* feature_map_height */ 1, + /* image_width */ 300, + /* image_height */ 300, + minSize, + maxSize, + aspectRatio, + variance, + useGpu, + result); + // CPU case 2. + variance[1] = 0.2; + variance[3] = 0.1; + maxSize.pop_back(); + real resultData2[] = {0, 0, 0.595, 0.595, 0.1, 0.2, 0.2, 0.1, + 0.405, 0, 1, 0.595, 0.1, 0.2, 0.2, 0.1, + 0, 0.405, 0.595, 1, 0.1, 0.2, 0.2, 0.1, + 0.405, 0.405, 1, 1, 0.1, 0.2, 0.2, 0.1}; + Matrix::resizeOrCreate(result, 1, 4 * 8, false, useGpu); + result->setData(resultData2); + doOnePriorBoxTest(/* feature_map_width */ 2, + /* feature_map_height */ 2, + /* image_width */ 400, + /* image_height */ 400, + minSize, + maxSize, + aspectRatio, + variance, + useGpu, + result); + // CPU case 3. 
+ aspectRatio.push_back(2); + real resultData3[] = {0.04, 0.04, 0.96, 0.96, 0.1, 0.2, + 0.2, 0.1, 0, 0.17473088, 1, 0.825269, + 0.1, 0.2, 0.2, 0.1, 0.17473088, 0, + 0.825269, 1, 0.1, 0.2, 0.2, 0.1}; + Matrix::resizeOrCreate(result, 1, 3 * 8, false, useGpu); + result->setData(resultData3); + doOnePriorBoxTest(/* feature_map_width */ 1, + /* feature_map_height */ 1, + /* image_width */ 300, + /* image_height */ 300, + minSize, + maxSize, + aspectRatio, + variance, + useGpu, + result); + +#ifndef PADDLE_ONLY_CPU + // reset the input parameters + variance[1] = 0.1; + variance[3] = 0.2; + maxSize.push_back(330); + aspectRatio.pop_back(); + MatrixPtr resultGpu; + useGpu = true; + // GPU case 1. + resultGpu = Matrix::create(1, 2 * 8, false, useGpu); + resultGpu->copyFrom(resultData, 2 * 8); + doOnePriorBoxTest(/* feature_map_width */ 1, + /* feature_map_height */ 1, + /* image_width */ 300, + /* image_height */ 300, + minSize, + maxSize, + aspectRatio, + variance, + useGpu, + resultGpu); + // GPU case 2. + variance[1] = 0.2; + variance[3] = 0.1; + maxSize.pop_back(); + Matrix::resizeOrCreate(resultGpu, 1, 4 * 8, false, useGpu); + resultGpu->copyFrom(resultData2, 4 * 8); + doOnePriorBoxTest(/* feature_map_width */ 2, + /* feature_map_height */ 2, + /* image_width */ 400, + /* image_height */ 400, + minSize, + maxSize, + aspectRatio, + variance, + useGpu, + resultGpu); + // GPU case 3. 
+ aspectRatio.push_back(2); + Matrix::resizeOrCreate(resultGpu, 1, 3 * 8, false, useGpu); + resultGpu->copyFrom(resultData3, 3 * 8); + doOnePriorBoxTest(/* feature_map_width */ 1, + /* feature_map_height */ 1, + /* image_width */ 300, + /* image_height */ 300, + minSize, + maxSize, + aspectRatio, + variance, + useGpu, + resultGpu); +#endif +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index b5d5b6ef61582..82a482f701481 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -163,15 +163,16 @@ MatrixPtr CpuSparseMatrix::getTranspose() { SparseValueType CpuSparseMatrix::getValueType() { return valueType_; } -void CpuSparseMatrix::mul(MatrixPtr a, MatrixPtr b, real scaleAB, real scaleT) { +void CpuSparseMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { CHECK(!isTransposed()) << "Not supported"; + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); - if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { - CpuMatrix::mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), - this, - scaleAB, - scaleT); + if (a_ptr && b_ptr) { + CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT); } else { LOG(FATAL) << "not supported"; } diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h index 9676f8864f845..d3e8871cb5b32 100644 --- a/paddle/math/CpuSparseMatrix.h +++ b/paddle/math/CpuSparseMatrix.h @@ -203,7 +203,7 @@ class CpuSparseMatrix : public Matrix { /// mem MUST be alloced outside (memAlloc=false) void transpose(MatrixPtr matTrans, bool memAlloc); - void mul(MatrixPtr A, MatrixPtr B, real alpha, real beta); + void mul(const Matrix& A, const Matrix& B, real alpha, real beta); /** * @brief sparseMatrix += denseMatrix diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 
c69e074a76399..50d2e3eb67102 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -582,18 +582,16 @@ void GpuMatrix::mul(const GpuMatrix& a, } /* this = a*b */ -void GpuMatrix::mul(const MatrixPtr a, const MatrixPtr b) { - mul(a, b, 1.0, 0.0); -} +void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); } -void GpuMatrix::mul(const MatrixPtr a, - const MatrixPtr b, +void GpuMatrix::mul(const Matrix& a, + const Matrix& b, real scaleAB, real scaleT) { - GpuMatrixPtr a_ptr = std::dynamic_pointer_cast(a); - GpuMatrixPtr b_ptr = std::dynamic_pointer_cast(b); - GpuSparseMatrixPtr a_ptr_s = std::dynamic_pointer_cast(a); - GpuSparseMatrixPtr b_ptr_s = std::dynamic_pointer_cast(b); + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + const auto a_ptr_s = dynamic_cast(&a); + const auto b_ptr_s = dynamic_cast(&b); if (a_ptr && b_ptr) { mul(*a_ptr, *b_ptr, scaleAB, scaleT); @@ -766,20 +764,19 @@ void GpuMatrix::maxoutBackward(Matrix& a, } /*calulate the error of classification */ -void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { - GpuMatrixPtr output_ptr = std::dynamic_pointer_cast(output); - GpuIVectorPtr label_ptr = std::dynamic_pointer_cast(label); - +void GpuMatrix::classificationError(Matrix& output, IVector& label) { + auto output_ptr = dynamic_cast(&output); + auto label_ptr = dynamic_cast(&label); CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; CHECK(height_ == output_ptr->height_ && width_ == 1) << "Matrix dimensions are not equal"; - real* output_d = output_ptr->data_; - real* recResult_d = data_; - int* label_d = label_ptr->getData(); - hl_matrix_classification_error( - output_d, label_d, recResult_d, height_, output_ptr->width_); + hl_matrix_classification_error((real*)output_ptr->data_, + (int*)label_ptr->getData(), + data_, + height_, + output_ptr->width_); } /* copy -log(output[i * width + label]) to this->data[i] */ @@ -1265,69 +1262,6 @@ void 
GpuMatrix::avgPoolBackward(Matrix& outGrad, outGrad.getStride()); } -void GpuMatrix::crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow) { - size_t num = input.getHeight(); - size_t height = imgSizeH; - size_t width = imgSizeW; - - CHECK(height * width * channels == input.getWidth()); - CHECK(denoms.getHeight() == input.getHeight() && - denoms.getWidth() == input.getWidth() && input.getHeight() == height_ && - input.getWidth() == width_); - hl_CMRNorm_forward(num, - input.getData(), - denoms.getData(), - data_, - channels, - height, - width, - sizeX, - scale, - -pow); -} - -void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - float scale, - float pow) { - size_t num = preOutV.getHeight(); - size_t height = imgSizeH; - size_t width = imgSizeW; - - CHECK(width * height * channels == preOutV.getWidth()); - CHECK(denoms.getHeight() == preOutV.getHeight() && - denoms.getWidth() == preOutV.getWidth() && - preOutV.getHeight() == height_ && preOutV.getWidth() == width_); - CHECK(denoms.getHeight() == localGrad.getHeight() && - denoms.getWidth() == localGrad.getWidth()); - - hl_CMRNorm_backward(num, - preOutV.getData(), - denoms.getData(), - localOutV.getData(), - localGrad.getData(), - data_, - channels, - height, - width, - sizeX, - -pow, - 2.0f * pow * scale); -} - void GpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index) { @@ -1370,86 +1304,62 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim); } -void GpuMatrix::contextProjectionForward(MatrixPtr input, - MatrixPtr weight, +void GpuMatrix::contextProjectionForward(Matrix& input, + Matrix* weight, const IVector& sequence, int contextLength, int contextStart, size_t beginPad, 
bool isPadding) { - CHECK(dynamic_cast(input.get())); + CHECK(dynamic_cast(&input)); CHECK(dynamic_cast(&sequence)); - if (weight) CHECK(dynamic_cast(weight.get())); - - size_t numSequences = sequence.getSize() - 1; - int64_t inputDim = input->getWidth(); - int64_t dim = getWidth(); - CHECK_EQ(dim, inputDim * contextLength); - - real* outData = getData(); - real* inputData = input->getData(); - const int* starts = sequence.getData(); + if (weight) CHECK(dynamic_cast(weight)); + CHECK_EQ(getWidth(), input.getWidth() * contextLength); - hl_context_projection_forward(inputData, - starts, + hl_context_projection_forward(input.getData(), + sequence.getData(), isPadding ? weight->getData() : NULL, - outData, - numSequences, - inputDim, + getData(), + sequence.getSize() - 1, + input.getWidth(), contextLength, contextStart, beginPad, isPadding); } -void GpuMatrix::contextProjectionBackwardData(MatrixPtr inputGrad, +void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad, const IVector& sequence, int contextLength, int contextStart) { - CHECK(dynamic_cast(inputGrad.get())); + CHECK(dynamic_cast(&inputGrad)); CHECK(dynamic_cast(&sequence)); + CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength); - size_t numSequences = sequence.getSize() - 1; - int64_t inputDim = inputGrad->getWidth(); - int64_t dim = getWidth(); - CHECK_EQ(dim, inputDim * contextLength); - - real* outGrad = getData(); - real* inGrad = inputGrad->getData(); - const int* starts = sequence.getData(); - - hl_context_projection_backward_data(outGrad, - starts, - inGrad, - numSequences, - inputDim, + hl_context_projection_backward_data(getData(), + sequence.getData(), + inputGrad.getData(), + sequence.getSize() - 1, + inputGrad.getWidth(), contextLength, contextStart); } -void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad, +void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad, const IVector& sequence, int contextLength, int contextStart, int totalPad, size_t beginPad) 
{ - CHECK(dynamic_cast(weightGrad.get())); + CHECK(dynamic_cast(&weightGrad)); CHECK(dynamic_cast(&sequence)); + CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength); - size_t numSequences = sequence.getSize() - 1; - int64_t weightDim = weightGrad->getWidth(); - int64_t dim = getWidth(); - CHECK_EQ(dim, weightDim * contextLength); - - real* outGrad = getData(); - real* wtGrad = weightGrad->getData(); - const int* starts = sequence.getData(); - - hl_context_projection_backward_weight(outGrad, - starts, - wtGrad, - numSequences, - weightDim, + hl_context_projection_backward_weight(getData(), + sequence.getData(), + weightGrad.getData(), + sequence.getSize() - 1, + weightGrad.getWidth(), totalPad, contextLength, contextStart, @@ -2219,84 +2129,6 @@ void CpuMatrix::avgPoolBackward(Matrix& input, } } -void CpuMatrix::crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow) { - size_t num = input.getHeight(); - size_t height = imgSizeH; - size_t width = imgSizeW; - size_t numCols = input.getWidth(); - CHECK(height * width * channels == input.getWidth()); - CHECK(denoms.getHeight() == input.getHeight() && - denoms.getWidth() == input.getWidth() && input.getHeight() == height_ && - input.getWidth() == width_); - real* imgData = input.getData(); - real* diffData = input.getData(); - real* targetData = getData(); - size_t halfSize = sizeX / 2; - size_t imgPixels = height * width; - - // use integral vector to implement the sum in local window - real* integralData = - (real*)malloc((channels + sizeX + 1) * sizeof(real)); // NOLINT // TODO: - for (size_t i = 0; i <= halfSize; i++) { - integralData[i] = 0; - } - for (size_t i = 0; i < num; i++) { - real* targetPtr = targetData + i * numCols; - real* imgPtr = imgData + i * numCols; - real* diffPtr = diffData + i * numCols; - for (size_t m = 0; m < height; m++) { - for (size_t n = 0; n < width; n++) { - for (size_t c = 0; c < 
channels; c++) { - integralData[c + halfSize + 1] = - integralData[c + halfSize] + _square(*(diffPtr + c * imgPixels)); - } - for (size_t k = channels + halfSize + 1; k <= channels + sizeX; k++) { - integralData[k] = integralData[channels + halfSize]; - } - for (size_t k = 0; k < channels; k += 1) { - real a = integralData[k + sizeX] - integralData[k]; - a = scale * a + 1; - targetPtr[k * imgPixels] = imgPtr[k * imgPixels] * _pow(a, -pow); - } - diffPtr++; - targetPtr++; - imgPtr++; - } - } - } - free(integralData); - integralData = NULL; -} - -void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t size, - float scale, - float pow) { - LOG(FATAL) << "Not implemented"; - - CHECK(imgSizeH * imgSizeW * channels == preOutV.getWidth()); - CHECK(denoms.getHeight() == preOutV.getHeight() && - denoms.getWidth() == preOutV.getWidth() && - preOutV.getHeight() == height_ && preOutV.getWidth() == width_); - CHECK(denoms.getHeight() == localGrad.getHeight() && - denoms.getWidth() == localGrad.getWidth()); - - // NOLINT // TODO: -} - /** * Input: one or more sequences. Each sequence contains some instances. * Output: output size is the number of input sequences (NOT input instances). 
@@ -2371,23 +2203,21 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, } } -void CpuMatrix::contextProjectionForward(MatrixPtr input, - MatrixPtr weight, +void CpuMatrix::contextProjectionForward(Matrix& input, + Matrix* weight, const IVector& sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding) { - CHECK(dynamic_cast(input.get())); - CHECK(dynamic_cast(&sequence)); - if (weight) CHECK(dynamic_cast(weight.get())); - - size_t numSequences = sequence.getSize() - 1; - int64_t inputDim = input->getWidth(); - int64_t dim = getWidth(); - CHECK_EQ(dim, inputDim * contextLength); - const int* starts = sequence.getData(); - + auto input_ptr = dynamic_cast(&input); + auto seq_ptr = dynamic_cast(&sequence); + CHECK(input_ptr && seq_ptr); + if (weight) CHECK(dynamic_cast(weight)); + CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength); + + const int* starts = seq_ptr->getData(); + size_t numSequences = seq_ptr->getSize() - 1; for (size_t i = 0; i < numSequences; ++i) { for (int j = 0; j < contextLength; ++j) { int begin = starts[i] + contextStart + j; @@ -2400,7 +2230,7 @@ void CpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr mat = this->subMatrix(starts[i], padSize); if (isPadding) { MatrixPtr sub = weight->subMatrix(j, padSize); - mat->addAtOffset(*sub, j * inputDim); + mat->addAtOffset(*sub, j * input_ptr->getWidth()); } dstBegin = starts[i] + padSize; begin = starts[i]; @@ -2412,41 +2242,36 @@ void CpuMatrix::contextProjectionForward(MatrixPtr input, if (isPadding) { MatrixPtr sub = weight->subMatrix(beginPad + contextStart + j - padSize, padSize); - mat->addAtOffset(*sub, j * inputDim); + mat->addAtOffset(*sub, j * input_ptr->getWidth()); } dstEnd = starts[i + 1] - padSize; end = starts[i + 1]; } if (end <= begin) continue; - MatrixPtr src = input->subMatrix(begin, end - begin); + MatrixPtr src = input_ptr->subMatrix(begin, end - begin); MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin); - 
dst->addAtOffset(*src, j * inputDim); + dst->addAtOffset(*src, j * input_ptr->getWidth()); } } } -void CpuMatrix::contextProjectionBackward(MatrixPtr inputGrad, - MatrixPtr weightGrad, +void CpuMatrix::contextProjectionBackward(Matrix* inputGrad, + Matrix* weightGrad, const IVector& sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding) { - if (inputGrad) CHECK(dynamic_cast(inputGrad.get())); - if (weightGrad) CHECK(dynamic_cast(weightGrad.get())); + if (inputGrad) CHECK(dynamic_cast(inputGrad)); + if (weightGrad) CHECK(dynamic_cast(weightGrad)); CHECK(dynamic_cast(&sequence)); - int64_t inputDim = 0; - int64_t dim = getWidth(); - size_t numSequences = sequence.getSize() - 1; - const int* starts = sequence.getData(); - if (inputGrad) { - inputDim = inputGrad->getWidth(); - } else { - inputDim = weightGrad->getWidth(); - } - CHECK_EQ(dim, inputDim * contextLength); + int64_t inputDim = inputGrad ? inputGrad->getWidth() + : weightGrad ? weightGrad->getWidth() : 0; + CHECK_EQ(getWidth(), inputDim * contextLength); + const int* starts = sequence.getData(); + size_t numSequences = sequence.getSize() - 1; for (size_t i = 0; i < numSequences; ++i) { for (int j = 0; j < contextLength; ++j) { int begin = starts[i] + contextStart + j; @@ -2630,29 +2455,22 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, } /* this = scaleAB*(a*b) + scaleT*this*/ -void CpuMatrix::mul(const MatrixPtr a, - const MatrixPtr b, +void CpuMatrix::mul(const Matrix& a, + const Matrix& b, real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + const auto a_ptr_s = dynamic_cast(&a); + const auto b_ptr_s = dynamic_cast(&b); - if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { - mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), - scaleAB, - scaleT); - } else if (dynamic_cast(a.get()) && - dynamic_cast(b.get())) { - mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), - scaleAB, - 
scaleT); - } else if (dynamic_cast(a.get()) && - dynamic_cast(b.get())) { - mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), - scaleAB, - scaleT); + if (a_ptr && b_ptr) { + mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT); + } else if (a_ptr_s && b_ptr) { + mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT); + } else if (a_ptr && b_ptr_s) { + mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT); } else { LOG(FATAL) << "Not supported"; } @@ -3321,7 +3139,7 @@ void CpuMatrix::addColumnVector(const Matrix& b) { } /* this = a*b */ -void CpuMatrix::mul(const MatrixPtr a, const MatrixPtr b) { +void CpuMatrix::mul(const Matrix& a, const Matrix& b) { return mul(a, b, 1.0, 0.0); } @@ -3544,21 +3362,20 @@ void CpuMatrix::rowNormalizeL1(Matrix& out) { } /* calulate classification error */ -void CpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { - CHECK(dynamic_cast(output.get())); - CHECK(dynamic_cast(label.get())); +void CpuMatrix::classificationError(Matrix& output, IVector& label) { + CHECK(dynamic_cast(&output)); + CHECK(dynamic_cast(&label)); - size_t numSamples = getHeight(); - size_t dim = output->getWidth(); - CHECK_EQ(label->getSize(), numSamples); - CHECK_EQ(output->getHeight(), numSamples); CHECK_EQ(getWidth(), (size_t)1); + size_t numSamples = getHeight(); + CHECK_EQ(label.getSize(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); - real* out = output->getData(); - real* result = getData(); - int* lbl = label->getData(); - real maxData; - int maxIndex; + size_t dim = output.getWidth(); + real* out = output.getData(); + int* lbl = label.getData(); + real maxData = 0.0; + int maxIndex = -1; for (size_t i = 0; i < numSamples; ++i) { CHECK_GE(lbl[i], 0); CHECK_LT((size_t)lbl[i], dim); @@ -3570,7 +3387,7 @@ void CpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { maxData = out[i * dim + j]; } } - result[i] = (maxIndex != lbl[i]); + getData()[i] = (maxIndex != lbl[i]); } } diff --git 
a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 1cfb90a9dbf19..25ce09e346694 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -444,8 +444,8 @@ class Matrix : public BaseMatrix { * this = scaleAB*(a*b) + scaleT*this * @endcode */ - virtual void mul(const MatrixPtr a, - const MatrixPtr b, + virtual void mul(const Matrix& a, + const Matrix& b, real scaleAB, real scaleT) { LOG(FATAL) << "Not implemented"; @@ -643,7 +643,7 @@ class Matrix : public BaseMatrix { * this = a*b * @endcode */ - virtual void mul(const MatrixPtr a, const MatrixPtr b) { + virtual void mul(const Matrix& a, const Matrix& b) { LOG(FATAL) << "Not implemented"; } @@ -835,7 +835,7 @@ class Matrix : public BaseMatrix { * * output[i] = 0 if row i is correct. */ - virtual void classificationError(MatrixPtr output, IVectorPtr label) { + virtual void classificationError(Matrix& output, IVector& label) { LOG(FATAL) << "Not implemented"; } @@ -952,31 +952,6 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemeted"; } - /// normalize-operation. - virtual void crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t size, - float scale, - float pow) { - LOG(FATAL) << "Not implemeted"; - } - /** * Input: one or more sequences. Each sequence contains some instances. 
* @@ -997,8 +972,8 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemeted"; } - virtual void contextProjectionForward(MatrixPtr input, - MatrixPtr weight, + virtual void contextProjectionForward(Matrix& input, + Matrix* weight, const IVector& sequence, int contextLength, int contextStart, @@ -1007,8 +982,8 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemeted"; } - virtual void contextProjectionBackward(MatrixPtr inputGrad, - MatrixPtr weightGrad, + virtual void contextProjectionBackward(Matrix* inputGrad, + Matrix* weightGrad, const IVector& sequence, int contextLength, int contextStart, @@ -1017,14 +992,14 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemeted"; } - virtual void contextProjectionBackwardData(MatrixPtr inputGrad, + virtual void contextProjectionBackwardData(Matrix& inputGrad, const IVector& sequence, int contextLength, int contextStart) { LOG(FATAL) << "Not implemeted"; } - virtual void contextProjectionBackwardWeight(MatrixPtr weightGrad, + virtual void contextProjectionBackwardWeight(Matrix& weightGrad, const IVector& sequence, int contextLength, int contextStart, @@ -1272,14 +1247,14 @@ class GpuMatrix : public Matrix { * this = scaleAB*(a*b) + scaleT*this * @endcode */ - void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT); + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); /** * @code * this = a*b * @endcode */ - void mul(const MatrixPtr a, const MatrixPtr b); + void mul(const Matrix& a, const Matrix& b); void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); @@ -1373,7 +1348,7 @@ class GpuMatrix : public Matrix { void check(std::ostream& os, Matrix& refMat, bool printDiff = true); void randomizeUniform(); - void classificationError(MatrixPtr output, IVectorPtr label); + void classificationError(Matrix& output, IVector& label); void convExpand(Matrix& feature, int feaImgHeight, @@ -1459,26 +1434,6 @@ class GpuMatrix : public Matrix { 
size_t paddingH, size_t paddingW); - void crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow); - - void crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - float scale, - float pow); - void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index); @@ -1487,20 +1442,20 @@ class GpuMatrix : public Matrix { const IVector& sequence, IVector& index); - void contextProjectionForward(MatrixPtr input, - MatrixPtr weight, + void contextProjectionForward(Matrix& input, + Matrix* weight, const IVector& sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding); - void contextProjectionBackwardData(MatrixPtr inputGrad, + void contextProjectionBackwardData(Matrix& inputGrad, const IVector& sequence, int contextLength, int contextStart); - void contextProjectionBackwardWeight(MatrixPtr weightGrad, + void contextProjectionBackwardWeight(Matrix& weightGrad, const IVector& sequence, int contextLength, int contextStart, @@ -1685,26 +1640,6 @@ class CpuMatrix : public Matrix { size_t paddingH, size_t paddingW); - void crossMapNormalFwd(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - Matrix& denoms, - size_t channels, - size_t sizeX, - float scale, - float pow); - - void crossMapNormalBwd(Matrix& localGrad, - Matrix& denoms, - Matrix& preOutV, - Matrix& localOutV, - size_t channels, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - float scale, - float pow); - void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index); @@ -1713,16 +1648,16 @@ class CpuMatrix : public Matrix { const IVector& sequence, IVector& index); - void contextProjectionForward(MatrixPtr input, - MatrixPtr weight, + void contextProjectionForward(Matrix& input, + Matrix* weight, const IVector& sequence, int contextLength, int 
contextStart, size_t beginPad, bool isPadding); - void contextProjectionBackward(MatrixPtr inputGrad, - MatrixPtr weightGrad, + void contextProjectionBackward(Matrix* inputGrad, + Matrix* weightGrad, const IVector& sequence, int contextLength, int contextStart, @@ -1784,7 +1719,7 @@ class CpuMatrix : public Matrix { void addColumnVector(const Matrix& b); - void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT); + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); @@ -1807,7 +1742,7 @@ class CpuMatrix : public Matrix { virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - void mul(const MatrixPtr a, const MatrixPtr b); + void mul(const Matrix& a, const Matrix& b); void rightMul(Matrix& b, real scaleAB, real scaleT); void rightMul(Matrix& b); @@ -1881,7 +1816,7 @@ class CpuMatrix : public Matrix { void randomizeUniform(); - void classificationError(MatrixPtr output, IVectorPtr label); + void classificationError(Matrix& output, IVector& label); void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp index 9154503c2132a..720a035ecbd26 100644 --- a/paddle/math/SparseMatrix.cpp +++ b/paddle/math/SparseMatrix.cpp @@ -571,49 +571,48 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { hl_stream_synchronize(stream); } -void GpuSparseMatrix::mul(const GpuMatrixPtr a, - const GpuMatrixPtr b, +void GpuSparseMatrix::mul(const GpuMatrix& a, + const GpuMatrix& b, real scaleAB, real scaleT) { - CHECK(a->useGpu_ && b->useGpu_) << "type not match"; + CHECK(a.useGpu_ && b.useGpu_) << "type not match"; CHECK(!trans_) << "trans not supported"; - real* A_d = a->getData(); - real* B_d = b->getData(); + real* A_d = (real*)a.getData(); + real* B_d = (real*)b.getData(); 
hl_sparse_matrix_s C_d = sMatrix_.get(); - hl_trans_op_t a_trans = a->trans_ ? HPPL_OP_T : HPPL_OP_N; - hl_trans_op_t b_trans = b->trans_ ? HPPL_OP_T : HPPL_OP_N; - - if (!a->trans_ && !b->trans_) { - CHECK(height_ == a->getHeight()); - CHECK(width_ == b->getWidth()); - CHECK(a->getWidth() == b->getHeight()); - } else if (a->trans_ && !b->trans_) { - CHECK(height_ == a->getWidth()); - CHECK(width_ == b->getWidth()); - CHECK(a->getHeight() == b->getHeight()); - } else if (!a->trans_ && b->trans_) { - CHECK(height_ == a->getHeight()); - CHECK(width_ == b->getHeight()); - CHECK(a->getWidth() == b->getWidth()); + hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N; + hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N; + + if (!a.trans_ && !b.trans_) { + CHECK(height_ == a.getHeight()); + CHECK(width_ == b.getWidth()); + CHECK(a.getWidth() == b.getHeight()); + } else if (a.trans_ && !b.trans_) { + CHECK(height_ == a.getWidth()); + CHECK(width_ == b.getWidth()); + CHECK(a.getHeight() == b.getHeight()); + } else if (!a.trans_ && b.trans_) { + CHECK(height_ == a.getHeight()); + CHECK(width_ == b.getHeight()); + CHECK(a.getWidth() == b.getWidth()); } else { LOG(INFO) << "Not support"; } int dimM = height_; int dimN = width_; - int dimK = !b->trans_ ? b->getHeight() : b->getWidth(); + int dimK = !b.trans_ ? 
b.getHeight() : b.getWidth(); hl_sparse_matrix_mul( A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT); } -void GpuSparseMatrix::mul(const MatrixPtr a, - const MatrixPtr b, +void GpuSparseMatrix::mul(const Matrix& a, + const Matrix& b, real scaleAB, real scaleT) { - if (std::dynamic_pointer_cast(a) && - std::dynamic_pointer_cast(b)) { - GpuMatrixPtr a_ptr = std::dynamic_pointer_cast(a); - GpuMatrixPtr b_ptr = std::dynamic_pointer_cast(b); - mul(a_ptr, b_ptr, scaleAB, scaleT); + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + if (a_ptr && b_ptr) { + mul(*a_ptr, *b_ptr, scaleAB, scaleT); } else { LOG(FATAL) << "not supported"; } diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h index bd96a3301ded2..1d3801548e03a 100644 --- a/paddle/math/SparseMatrix.h +++ b/paddle/math/SparseMatrix.h @@ -104,10 +104,7 @@ class GpuSparseMatrix : public Matrix { size_t newNnz, SparseValueType valueType); - void mul(const GpuMatrixPtr a, - const GpuMatrixPtr b, - real scaleAB, - real scaleT); + void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); /// B = A , B.trans = !A.trans MatrixPtr getTranspose(); @@ -218,7 +215,7 @@ class GpuSparseMatrix : public Matrix { void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row); public: - void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT); + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); void copyFrom(CpuSparseMatrix& src, hl_stream_t stream); void copyFrom(GpuSparseMatrix& src, hl_stream_t stream); diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp index 88b75b6d83612..0949ab7ffba42 100644 --- a/paddle/math/tests/test_SparseMatrix.cpp +++ b/paddle/math/tests/test_SparseMatrix.cpp @@ -33,8 +33,8 @@ TEST(Matrix, CopyCpuMatrixToSparseMatrix) { ret2(new CpuMatrix(HEIGHT, WIDTH_TEST)); ret1->zeroMem(); ret2->zeroMem(); - ret1->mul(testMatrix, mulCpuMatrix, 
1.0, 1.0); - ret2->mul(testCpuMatrix, mulCpuMatrix, 1.0, 1.0); + ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0); + ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0); checkMatrixEqual(ret1, ret2); } @@ -147,9 +147,9 @@ void test_sparse_matrix_mul(MatrixPara paraA, hl_stream_synchronize(stream); /*matrix mul*/ - cpuMatrixC->mul(cpuMatrixA, cpuMatrixB, 1.0, 1.0); - gpuMatrixC->mul(gpuMatrixA, gpuMatrixB, 1.0, 1.0); - cpuDenseC->mul(cpuDenseA, cpuDenseB, 1.0, 1.0); + cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0); + gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0); + cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0); gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream); hl_stream_synchronize(stream); @@ -224,8 +224,8 @@ TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) { MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST)); ret1->zeroMem(); ret2->zeroMem(); - ret1->mul(testMatrix, mulCpuMatrix, 1.0, 1.0); - ret2->mul(testGpuMatrix, mulGpuMatrix, 1.0, 1.0); + ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0); + ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0); checkMatrixEqual(ret1, ret2); } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 62de5b25e4cc8..c6fc849ba0328 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -65,16 +65,16 @@ void testMatrixProjectionForward(int contextStart, // calculate int beginPad = std::max(0, -contextStart); - cpuOutput->contextProjectionForward(cpuInput, - cpuWeight, + cpuOutput->contextProjectionForward(*cpuInput, + cpuWeight.get(), *cpuSequence, contextLength, contextStart, beginPad, padding); - gpuOutput->contextProjectionForward(gpuInput, - gpuWeight, + gpuOutput->contextProjectionForward(*gpuInput, + gpuWeight.get(), *gpuSequence, contextLength, contextStart, @@ -120,17 +120,17 @@ void testMatrixProjectionBackward(int contextStart, // calculate int beginPad = std::max(0, -contextStart); - 
cpuOutputGrad->contextProjectionBackward(cpuInputGrad, - cpuWeightGrad, + cpuOutputGrad->contextProjectionBackward(cpuInputGrad.get(), + cpuWeightGrad.get(), *cpuSequence, contextLength, contextStart, beginPad, padding); gpuOutputGrad->contextProjectionBackwardData( - gpuInputGrad, *gpuSequence, contextLength, contextStart); + *gpuInputGrad, *gpuSequence, contextLength, contextStart); if (padding) { - gpuOutputGrad->contextProjectionBackwardWeight(gpuWeightGrad, + gpuOutputGrad->contextProjectionBackwardWeight(*gpuWeightGrad, *gpuSequence, contextLength, contextStart, @@ -318,7 +318,7 @@ void testMatrixInverse(int height) { cpu->randomizeUniform(); MatrixPtr cpuT = cpu->getTranspose(); MatrixPtr outputCheck = std::make_shared(height, height); - outputCheck->mul(cpu, cpuT); + outputCheck->mul(*cpu, *cpuT); cpu->setDiag(1.0); cpu->add(*outputCheck); @@ -328,7 +328,7 @@ void testMatrixInverse(int height) { TensorCheckErr(*cpuI, *gpuI); - outputCheck->mul(cpu, cpuI); + outputCheck->mul(*cpu, *cpuI); cpu->setDiag(1.0); TensorCheckErr(*cpu, *outputCheck); } @@ -509,8 +509,8 @@ void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { gpuB->copyFrom(*cpuB); gpuC->copyFrom(*cpuC); - cpuC->mul(cpuA, cpuB, alpha, beta); - gpuC->mul(gpuA, gpuB, alpha, beta); + cpuC->mul(*cpuA, *cpuB, alpha, beta); + gpuC->mul(*gpuA, *gpuB, alpha, beta); TensorCheckErr(*cpuC, *gpuC); } @@ -581,8 +581,8 @@ void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN); MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN); - subCpuC->mul(subCpuA, subCpuB, alpha, beta); - subGpuC->mul(subGpuA, subGpuB, alpha, beta); + subCpuC->mul(*subCpuA, *subCpuB, alpha, beta); + subGpuC->mul(*subGpuA, *subGpuB, alpha, beta); TensorCheckErr(*cpuC, *gpuC); } @@ -939,8 +939,8 @@ void testClassificationError(int numSamples, int dim) { gpuOutput->copyFrom(*cpuOutput); gpuLabel->copyFrom(*cpuLabel); - 
cpuError->classificationError(cpuOutput, cpuLabel); - gpuError->classificationError(gpuOutput, gpuLabel); + cpuError->classificationError(*cpuOutput, *cpuLabel); + gpuError->classificationError(*gpuOutput, *gpuLabel); TensorCheckEqual(*cpuError, *gpuError); } diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp index 6f6de238bacaa..dcdbccffc3a19 100644 --- a/paddle/math/tests/test_sparseMatrixCompare.cpp +++ b/paddle/math/tests/test_sparseMatrixCompare.cpp @@ -102,8 +102,8 @@ void testSpMatrixMul(int M, int N, int K, real rate) { gpuC->copyFrom(*cpuC, stream); hl_stream_synchronize(stream); - cpuC->mul(cpuA, cpuB->getTranspose(), 1, 1); - gpuC->mul(gpuA, gpuB->getTranspose(), 1, 1); + cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1); + gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1); MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz)); outputCheck->copyFrom(*gpuC, stream); diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile index 207f97c4a69e6..f26055d0d4c99 100644 --- a/paddle/scripts/docker/Dockerfile +++ b/paddle/scripts/docker/Dockerfile @@ -2,6 +2,8 @@ FROM ubuntu:14.04 MAINTAINER PaddlePaddle Authors ARG DEBIAN_FRONTEND=noninteractive +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' RUN apt-get update \ && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ libgoogle-glog-dev libgflags-dev libgtest-dev \ diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu index 33f6adfea2a60..d13b97714727a 100644 --- a/paddle/scripts/docker/Dockerfile.gpu +++ b/paddle/scripts/docker/Dockerfile.gpu @@ -2,6 +2,8 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Authors ARG DEBIAN_FRONTEND=noninteractive +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 
's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' RUN apt-get update \ && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ libgoogle-glog-dev libgflags-dev libgtest-dev \ diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh index 0bbb76a8a3caa..8690fe1d40c93 100755 --- a/paddle/scripts/travis/docs.sh +++ b/paddle/scripts/travis/docs.sh @@ -7,6 +7,10 @@ source ./common.sh cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON make paddle_docs paddle_docs_cn +# check websites for broken links +linkchecker doc/en/html/index.html +linkchecker doc/cn/html/index.html + # Parse Github URL REPO=`git config remote.origin.url` SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:} @@ -35,8 +39,8 @@ git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH # remove old docs. mv new docs. rm -rf doc doc_cn -mv ../doc_cn/html doc_cn -mv ../doc/html doc +mv ../doc/cn/html doc_cn +mv ../doc/en/html doc # Check is there anything changed. set +e diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index 44acee249554e..c7194d3bf1271 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -137,6 +137,9 @@ void StatSet::printSegTimerStatus() { void StatSet::printBarrierTimerStatus() { ReadLockGuard guard(lock_); + if (barrierStatSet_.empty()) { + return; + } // control barrierAbstact in runtime, so enable compliation LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') << "======= BarrierStatSet status ======" << std::endl; diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h index 9be79e8859a3b..d9cc6e413a741 100644 --- a/paddle/utils/Stat.h +++ b/paddle/utils/Stat.h @@ -258,28 +258,41 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1, // The default arguments are shown in the following line: // REGISTER_TIMER(statName, threshold = -1, statSet = globalStat) // TODO(yuyang18,wangyanfei01): if UNIQUE_NAME is needed -#define REGISTER_TIMER(statName, ...) 
// Timer registration macros.
//
// Every macro below expands to two declarations in the caller's scope:
//   __stat       a ::paddle::StatPtr obtained via getStat(statName)
//   __timerOnce  a ::paddle::TimerOnce constructed from __stat.get()
// All paddle names are written as ::paddle::..., so the expansion does not
// depend on the namespace of the call site.
//
// The optional trailing arguments are forwarded unchanged to both
// ::paddle::registerTimerArg1 and ::paddle::registerTimerArg2.
// NOTE(review): presumably registerTimerArg1 yields the threshold and
// registerTimerArg2 yields the StatSet to register in -- confirm against
// their definitions earlier in this header.
// NOTE(review): these definitions are followed by `#endif  // DISABLE_TIMER`;
// presumably this is the timer-enabled branch -- confirm.

// Static variant: __stat is a function-local static, so getStat(statName) is
// evaluated only the first time control passes through the call site.
#define REGISTER_TIMER(statName, ...)                             \
  static ::paddle::StatPtr __stat =                               \
      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
  ::paddle::TimerOnce __timerOnce(                                \
      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));

// Same as REGISTER_TIMER, but passes two extra TimerOnce constructor
// arguments: the literal `false` and the caller-supplied `start`.
// NOTE(review): the meaning of the `false` flag is not visible here -- see the
// TimerOnce constructor.
#define REGISTER_TIMER_SET(statName, start, ...)                            \
  static ::paddle::StatPtr __stat =                                         \
      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
                                  "",                                       \
                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
                                  false,                                    \
                                  start);

// Dynamic timer: supports discriminating runtime entities, used in pserver.
// __stat is NOT static here, so getStat(statName) is evaluated on every
// execution and statName may differ from one execution to the next.
#define REGISTER_TIMER_DYNAMIC(statName, ...)                     \
  ::paddle::StatPtr __stat =                                      \
      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
  ::paddle::TimerOnce __timerOnce(                                \
      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));

// Dynamic counterpart of REGISTER_TIMER_SET (non-static __stat, extra
// `false, start` TimerOnce arguments).
#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)                    \
  ::paddle::StatPtr __stat =                                                \
      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
                                  "",                                       \
                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
                                  false,                                    \
                                  start);

// Static timer that always registers in ::paddle::globalStat, passes `info`
// as the TimerOnce info argument, and uses a fixed threshold of
// 10 * 1000000LU.
#define REGISTER_TIMER_INFO(statName, info)                                 \
  static ::paddle::StatPtr __stat = ::paddle::globalStat.getStat(statName); \
  ::paddle::TimerOnce __timerOnce(                                          \
      __stat.get(), info, 10 * 1000000LU /*threshold*/);
@config_layer('priorbox')
class PriorBoxLayer(LayerBase):
    """
    Config-parser layer of type 'priorbox'.

    The layer takes exactly two inputs. The second input must be a data
    layer whose width and height are both set (> 0). The box parameters
    (min_size, max_size, aspect_ratio, variance) are stored in the
    priorbox_conf of the first input's config, and the layer size is set
    to the given size.
    """

    def __init__(self, name, inputs, size, min_size, max_size, aspect_ratio,
                 variance):
        super(PriorBoxLayer, self).__init__(name, 'priorbox', 0, inputs)
        config_assert(len(inputs) == 2, 'PriorBoxLayer must have 2 inputs')

        # Validate the second input: it has to be a data layer with a
        # known geometry.
        image_layer = self.get_input_layer(1)
        config_assert(
            image_layer.type == 'data',
            'Expecting the second input layer of an priorbox layer to be '
            'a data layer')
        config_assert(image_layer.width > 0, 'The data layer must set width')
        config_assert(image_layer.height > 0, 'The data layer must set height')
        config_assert(len(variance) == 4, 'The variance must have 4 inputs')

        # All priorbox parameters live on the first input's config.
        priorbox_conf = self.config.inputs[0].priorbox_conf
        priorbox_conf.min_size.extend(min_size)
        priorbox_conf.max_size.extend(max_size)
        priorbox_conf.aspect_ratio.extend(aspect_ratio)
        priorbox_conf.variance.extend(variance)

        self.config.size = size
@wrap_name_default("priorbox")
def priorbox_layer(input,
                   image,
                   aspect_ratio,
                   variance,
                   min_size,
                   max_size=None,
                   name=None):
    """
    Compute the priorbox and set the variance. This layer is necessary for ssd.

    :param name: The Layer Name.
    :type name: basestring
    :param input: The input layer.
    :type input: LayerOutput
    :param image: The network input image.
    :type image: LayerOutput
    :param aspect_ratio: The aspect ratio.
    :type aspect_ratio: list
    :param variance: The bounding box variance.
    :type variance: list
    :param min_size: The min size of the priorbox width/height.
    :type min_size: list
    :param max_size: The max size of the priorbox width/height. May be None
                     or omitted, which is treated as an empty list.
    :type max_size: list
    :return: LayerOutput
    """
    # Normalise the optional argument once so that both len() below and the
    # value handed to Layer() always see a list.
    if max_size is None:
        max_size = []

    # plus one for ratio 1.
    num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
    # Floor division keeps `size` an integer on both Python 2 and Python 3.
    # NOTE(review): assumes input.num_filters is set (not None, non-zero);
    # presumably input.size // input.num_filters is the per-channel
    # feature-map area -- confirm against callers.
    size = (input.size // input.num_filters) * num_filters * 2
    Layer(
        name=name,
        type=LayerType.PRIORBOX_LAYER,
        inputs=[input.name, image.name],
        size=size,
        min_size=min_size,
        max_size=max_size,
        aspect_ratio=aspect_ratio,
        variance=variance)
    return LayerOutput(
        name,
        LayerType.PRIORBOX_LAYER,
        parents=[input, image],
        num_filters=num_filters,
        size=size)