Optimize while_op for test #14764

Merged: 23 commits from core_opt_while_op into develop, merged Jan 16, 2019

Commits (changes from all commits)
7e02fc1  Simplify the compare op for CPU. (Xreki, Dec 4, 2018)
b65760b  Use asynchronous tensor copy in reshape_op's kernel. (Xreki, Dec 4, 2018)
4473895  Optimize while_op for test, avoiding creating variables every time. (Xreki, Dec 4, 2018)
9619043  Merge branch 'develop' into core_opt_while_op (Xreki, Dec 6, 2018)
3fda0b1  Enable the cache of kernel type and kernel function. (Xreki, Dec 10, 2018)
c3963f6  Merge branch 'develop' into core_opt_while_op (Xreki, Dec 10, 2018)
aec4e29  Enable profiling with gperftools. (Xreki, Dec 10, 2018)
4d1b624  Remove flags for testing, and fix the linking error. (Xreki, Dec 11, 2018)
b2aa81a  Merge branch 'develop' into core_opt_while_op (Xreki, Dec 11, 2018)
6c76cdd  Delete the codes of ChooseKernel. (Xreki, Dec 11, 2018)
d18af8b  Merge branch 'develop' into core_opt_while_op (Xreki, Dec 13, 2018)
375454a  Fix bug when preparing ExecutorPrepareContext for while_op. (Xreki, Dec 14, 2018)
ac191ba  Merge branch 'develop' into core_opt_while_op (Xreki, Dec 14, 2018)
b26bd96  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (Xreki, Dec 24, 2018)
c4486fc  Merge branch 'develop' into core_opt_while_op (Xreki, Dec 25, 2018)
1286fd0  Merge branch 'develop' into core_opt_while_op (Xreki, Dec 29, 2018)
0944f7f  Fix missing depending on grpc libraries. (Xreki, Jan 8, 2019)
b52b770  Merge branch 'develop' into core_opt_while_op (Xreki, Jan 8, 2019)
135c17c  Merge branch 'develop' into core_opt_while_op (Xreki, Jan 9, 2019)
8476f7c  Remove the redundant print. (Xreki, Jan 9, 2019)
ad927fd  Merge branch 'develop' into core_opt_while_op (Xreki, Jan 9, 2019)
3871afa  Follow comments. (Xreki, Jan 11, 2019)
9f38bc3  Remove the codes related to prepare the ExecutorPrepareContext for wh… (Xreki, Jan 15, 2019)
8 changes: 4 additions & 4 deletions cmake/generic.cmake
@@ -91,9 +91,9 @@
 include_directories(${CMAKE_CURRENT_BINARY_DIR})

 if(NOT APPLE AND NOT ANDROID)
-  find_package(Threads REQUIRED)
-  link_libraries(${CMAKE_THREAD_LIBS_INIT})
-  set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
+  find_package(Threads REQUIRED)
+  link_libraries(${CMAKE_THREAD_LIBS_INIT})
+  set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)

 set_property(GLOBAL PROPERTY FLUID_MODULES "")
@@ -304,7 +304,7 @@ function(cc_library TARGET_NAME)
     if(cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
     else()
-      message(FATAL "Please specify source file or library in cc_library.")
+      message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).")
     endif()
   endif(cc_library_SRCS)
 endfunction(cc_library)
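Note: FATAL is not one of message()'s mode keywords, so CMake treated it as part of the message text and kept configuring; FATAL_ERROR actually aborts the configuration. The reworded message also names the offending target.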
27 changes: 15 additions & 12 deletions paddle/fluid/inference/CMakeLists.txt
@@ -1,13 +1,6 @@
 if(WITH_TESTING)
   include(tests/test.cmake) # some generic cmake funtion for inference
 endif()
-# analysis and tensorrt must be added before creating static library,
-# otherwise, there would be undefined reference to them in static library.
-add_subdirectory(analysis)
-add_subdirectory(utils)
-if (TENSORRT_FOUND)
-  add_subdirectory(tensorrt)
-endif()

 set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)

@@ -16,6 +9,14 @@ cc_library(paddle_fluid_api
     SRCS io.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})

+# analysis and tensorrt must be added before creating static library,
+# otherwise, there would be undefined reference to them in static library.
+add_subdirectory(analysis)
+add_subdirectory(utils)
+if (TENSORRT_FOUND)
+  add_subdirectory(tensorrt)
+endif()
+
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
 get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
@@ -40,10 +41,10 @@ set(SHARED_INFERENCE_SRCS

 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-      analysis_config paddle_pass_builder)
+              analysis_config paddle_pass_builder)
 else(WIN32)
-  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-      analysis_config paddle_pass_builder)
+  cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
+             zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
 endif(WIN32)

 if(NOT APPLE)
@@ -55,11 +56,13 @@ endif()
 # Create shared library
 if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
+      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
+      analysis_config paddle_pass_builder)
   target_link_libraries(paddle_fluid_shared shlwapi)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder)
+      DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
+      analysis_config paddle_pass_builder)
 endif()

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
11 changes: 6 additions & 5 deletions paddle/fluid/inference/api/CMakeLists.txt
@@ -18,21 +18,22 @@ if(APPLE)
 endif(APPLE)


-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass
+    ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})

 if(WITH_GPU AND TENSORRT_FOUND)
   set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
 endif()

-cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+add_subdirectory(details)

 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
-cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
-cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
     lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
-    analysis_config paddle_pass_builder zero_copy_tensor reset_tensor_array)
+    analysis_config paddle_pass_builder zero_copy_tensor
+    reset_tensor_array)

 cc_test(test_paddle_inference_api
     SRCS api_tester.cc
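Note that the three libraries removed here (reset_tensor_array, zero_copy_tensor, zero_copy_tensor_dummy) are not deleted outright; they reappear in the new paddle/fluid/inference/api/details/CMakeLists.txt added below, next to their sources.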
18 changes: 18 additions & 0 deletions paddle/fluid/inference/api/details/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce)
+cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc)
11 changes: 10 additions & 1 deletion paddle/fluid/inference/tests/api/tester_helper.h
@@ -19,6 +19,9 @@
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
+#ifdef WITH_GPERFTOOLS
+#include <gperftools/profiler.h>
+#endif

 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
@@ -215,13 +218,19 @@ void TestOneThreadPrediction(
   {
     Timer run_timer;
     run_timer.tic();
+#ifdef WITH_GPERFTOOLS
+    ProfilerStart("paddle_inference.prof");
+#endif
     for (int i = 0; i < num_times; i++) {
       for (size_t j = 0; j < inputs.size(); j++) {
         predictor->Run(inputs[j], outputs, batch_size);
       }
     }
+#ifdef WITH_GPERFTOOLS
+    ProfilerStop();
+#endif

-    double latency = run_timer.toc() / num_times;
+    double latency = run_timer.toc() / (num_times > 1 ? num_times : 1);
     PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
     if (FLAGS_record_benchmark) {
       Benchmark benchmark;
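With WITH_GPERFTOOLS defined and the test binary linked against gperftools, the timed loop above writes a CPU profile to paddle_inference.prof, which can then be inspected offline with gperftools' pprof tool. The latency computation is also guarded so that a num_times of zero or less no longer causes a division by zero or a negative count.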
28 changes: 26 additions & 2 deletions paddle/fluid/operators/controlflow/compare_op.cc
@@ -18,6 +18,30 @@ limitations under the License. */

 namespace paddle {
 namespace operators {
+
+template <typename Functor>
+class CompareOpKernel<platform::CPUDeviceContext, Functor>
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    using Tensor = framework::Tensor;
+
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* z = context.Output<Tensor>("Out");
+    int axis = context.Attr<int>("axis");
+
+    if (x->numel() == 1 && y->numel() == 1) {
+      bool* z_data = z->mutable_data<bool>(context.GetPlace());
+      z_data[0] = Functor()(x->data<T>()[0], y->data<T>()[0]);
+    } else {
+      ElementwiseComputeEx<Functor, platform::CPUDeviceContext, T, bool>(
+          context, x, y, axis, Functor(), z);
+    }
+  }
+};
+
 template <typename OpComment>
 class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -51,7 +75,7 @@ calculated by $%s$
 template <typename OpComment>
 class CompareOpInferShape : public framework::InferShapeBase {
  public:
-  void operator()(framework::InferShapeContext *context) const override {
+  void operator()(framework::InferShapeContext* context) const override {
     OpComment comment;
     PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
                    comment.type);
@@ -73,7 +97,7 @@ class CompareOp : public framework::OperatorWithKernel {

  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
+      const framework::ExecutionContext& ctx) const override {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
     bool force_cpu = ctx.Attr<bool>("force_cpu");
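The CPU kernel specialization added at the top of this file short-circuits the case where both inputs hold a single element, which is exactly the shape of while_op's condition tensor: the functor is applied directly to the two scalars instead of going through ElementwiseComputeEx's broadcast machinery.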
1 change: 0 additions & 1 deletion paddle/fluid/operators/controlflow/feed_op.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
22 changes: 16 additions & 6 deletions paddle/fluid/operators/controlflow/while_op.cc
@@ -58,6 +58,7 @@ class WhileOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
@@ -72,18 +73,27 @@
     PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
                    "Condition of while op must in CPU memory.");

-    bool is_test = Attr<bool>("is_test");
     auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
     VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);

+    bool is_test = Attr<bool>("is_test");
     auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
-    while (cond.data<bool>()[0]) {
+
+    if (!is_test) {
+      while (cond.data<bool>()[0]) {
+        auto &current_scope = scope.NewScope();
+        step_scopes->push_back(&current_scope);
+        executor.RunPreparedContext(ctx.get(), &current_scope, false, true,
+                                    true);
+      }
+    } else {
       auto &current_scope = scope.NewScope();
-      step_scopes->push_back(&current_scope);
-      executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
-      if (is_test) {
-        scope.DeleteScope(&current_scope);
-      }
+      executor.CreateVariables(*program, &current_scope, block->ID());
Reviewer comment (Contributor): Why doesn't inference need to create variables, while training does?

Reply from the author (Xreki):
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                  bool create_local_scope, bool create_vars,
                                  bool keep_kids) {
  PADDLE_ENFORCE_NOT_NULL(scope);
  Scope* local_scope = scope;
  if (create_vars) {
    if (create_local_scope) {
      local_scope = &scope->NewScope();
    }
    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
  }

When create_vars is true, RunPreparedContext creates the block's variables in the given scope. During training, the step scopes are later read by while_grad_op, so every step scope must be kept; with 100 iterations, the op therefore creates 100 scopes and calls CreateVariables 100 times.

if (is_test) {
  scope.DeleteScope(&current_scope);
}

For test, the step scope is deleted at the end of each iteration anyway, so there is no need to create 100 step scopes and call CreateVariables 100 times; doing it once is enough. This reduces the overhead a lot, from 17.8619 ms to 16.0768 ms.

+      while (cond.data<bool>()[0]) {
+        executor.RunPreparedContext(ctx.get(), &current_scope, false, false,
+                                    false);
+      }
+      scope.DeleteScope(&current_scope);
     }
   }
 };
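Putting the two fragments of this hunk together, the resulting control flow is roughly the following (a simplified sketch, not the literal Paddle source; the boolean arguments are named after the RunPreparedContext signature quoted in the discussion above):

// Sketch of WhileOp::RunImpl after this PR.
if (!is_test) {
  // Training: each iteration gets a fresh scope, and every step scope is
  // kept alive because while_grad_op reads them in the backward pass.
  while (cond.data<bool>()[0]) {
    auto &current_scope = scope.NewScope();
    step_scopes->push_back(&current_scope);
    executor.RunPreparedContext(ctx.get(), &current_scope,
                                /*create_local_scope=*/false,
                                /*create_vars=*/true, /*keep_kids=*/true);
  }
} else {
  // Inference: create one scope and materialize its variables once,
  // reuse it for every iteration, and delete it after the loop.
  auto &current_scope = scope.NewScope();
  executor.CreateVariables(*program, &current_scope, block->ID());
  while (cond.data<bool>()[0]) {
    executor.RunPreparedContext(ctx.get(), &current_scope,
                                /*create_local_scope=*/false,
                                /*create_vars=*/false, /*keep_kids=*/false);
  }
  scope.DeleteScope(&current_scope);
}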
5 changes: 3 additions & 2 deletions paddle/fluid/operators/distributed/CMakeLists.txt
@@ -12,17 +12,18 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @O
 # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 if(WITH_GRPC)
+  set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
   set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc)
   grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
       request_handler_impl.cc rpc_client.cc rpc_server.cc
       variable_response.cc
       collective_client.cc collective_server.cc
       ${GRPC_SRCS}
       PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
-      DEPS lod_tensor selected_rows_functor memory)
+      DEPS lod_tensor selected_rows_functor memory ${GRPC_DEPS})

   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+  set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})

   cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc
       DEPS ${RPC_DEPS} scope profiler math_function SERIAL)
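This is the fix from commit 0944f7f ("Fix missing depending on grpc libraries"): the gRPC link dependencies are factored into GRPC_DEPS and attached directly to sendrecvop_rpc, so targets that link it no longer hit undefined references to the gRPC libraries.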
4 changes: 3 additions & 1 deletion paddle/fluid/operators/reshape_op.cc
@@ -226,7 +226,9 @@ class ReshapeKernel {
   }

   out->mutable_data(ctx.GetPlace(), in->type());
-  framework::TensorCopySync(*in, ctx.GetPlace(), out);
+  framework::TensorCopy(
+      *in, ctx.GetPlace(),
+      ctx.template device_context<platform::DeviceContext>(), out);
Reviewer comment (Contributor): @velconia, why was this copy synchronous?

Reply (Collaborator): This code was copied from the original implementation, but I guess the Sync is not needed here.
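For context: TensorCopySync blocks the calling thread until the copy has finished, while the TensorCopy overload used above only enqueues the copy on the given device context's stream. Since the reshaped output is consumed on that same stream, the explicit synchronization appears unnecessary, and dropping it avoids stalling the device pipeline on every reshape.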

out->Resize(out_dims);
}
};