diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2e933c838a29..5697664e5b565 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -269,7 +269,7 @@ option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
 option(WITH_HETERPS "Compile with heterps" OFF)
 option(WITH_INFERENCE_API_TEST
        "Test fluid inference C++ high-level api interface" OFF)
-option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF)
+option(WITH_NVTX "Paddle with nvtx for profiler" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(
@@ -623,6 +623,10 @@ if(WITH_MIPS)
   add_definitions(-DPADDLE_WITH_MIPS)
 endif()
 
+if(WITH_NVTX AND NOT WIN32)
+  add_definitions(-DPADDLE_WITH_NVTX)
+endif()
+
 if(WITH_LOONGARCH)
   set(WITH_XBYAK
       OFF
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 34d1fb7dd46fc..aae30e9ea769f 100755
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -366,10 +366,6 @@ else()
   )
 endif()
 
-if(WITH_INFERENCE_NVTX AND NOT WIN32)
-  add_definitions(-DPADDLE_WITH_INFERENCE_NVTX)
-endif()
-
 copy(
   inference_lib_dist
   SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 40530c49ca78c..2a6b2daf2b1a2 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -661,37 +661,31 @@ cc_library(
   SRCS variable_helper.cc
   DEPS lod_tensor)
 
+set(NAIVE_EXECUTOR_DEPS
+    op_registry
+    denormal
+    device_context
+    scope
+    framework_proto
+    glog
+    lod_rank_table
+    feed_fetch_method
+    graph_to_program_pass
+    variable_helper)
+
 if(TENSORRT_FOUND)
-  cc_library(
-    naive_executor
-    SRCS naive_executor.cc
-    DEPS op_registry
-         denormal
-         device_context
-         scope
-         framework_proto
-         glog
-         lod_rank_table
-         feed_fetch_method
-         graph_to_program_pass
-         variable_helper
-         tensorrt_engine_op)
-else()
-  cc_library(
-    naive_executor
-    SRCS naive_executor.cc
-    DEPS op_registry
-         denormal
-         device_context
-         scope
-         framework_proto
-         glog
-         lod_rank_table
-         feed_fetch_method
-         graph_to_program_pass
-         variable_helper)
+  set(NAIVE_EXECUTOR_DEPS ${NAIVE_EXECUTOR_DEPS} tensorrt_engine_op)
+endif()
+
+if(WITH_NVTX AND NOT WIN32)
+  set(NAIVE_EXECUTOR_DEPS ${NAIVE_EXECUTOR_DEPS} cuda_profiler)
 endif()
 
+cc_library(
+  naive_executor
+  SRCS naive_executor.cc
+  DEPS ${NAIVE_EXECUTOR_DEPS})
+
 cc_library(
   executor_gc_helper
   SRCS executor_gc_helper.cc
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 28cabf54ee4de..0b9467d0d9a39 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -28,7 +28,7 @@
 #ifdef PADDLE_WITH_TENSORRT
 #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
 #endif
-#ifdef PADDLE_WITH_INFERENCE_NVTX
+#ifdef PADDLE_WITH_NVTX
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
 #endif
 
@@ -54,14 +54,14 @@ void NaiveExecutor::Run() {
   platform::RegisterModelLayout(ops_, place_);
 #endif
   platform::ScopedFlushDenormal flush;
-#ifdef PADDLE_WITH_INFERENCE_NVTX
+#ifdef PADDLE_WITH_NVTX
   platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
 #endif
   for (auto &op : ops_) {
     VLOG(4) << std::this_thread::get_id() << " run "
             << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
-#ifdef PADDLE_WITH_INFERENCE_NVTX
+#ifdef PADDLE_WITH_NVTX
     platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
                                 platform::NvtxRangeColor::Green);
 #endif
@@ -98,14 +98,14 @@ void NaiveExecutor::Run() {
       }
     }
-#ifdef PADDLE_WITH_INFERENCE_NVTX
+#ifdef PADDLE_WITH_NVTX
     platform::CudaNvtxRangePop();
 #endif
 
     for (auto &func : hookfuncs_) {
       func(op.get(), scope_);
     }
   }
-#ifdef PADDLE_WITH_INFERENCE_NVTX
+#ifdef PADDLE_WITH_NVTX
   platform::CudaNvtxRangePop();
 #endif
 }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 598997360d533..16b0649ac5393 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -107,10 +107,6 @@ if(WITH_PSCORE)
   set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service)
 endif()
 
-if(WITH_INFERENCE_NVTX AND NOT WIN32)
-  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} cuda_profiler)
-endif()
-
 if(WITH_ONNXRUNTIME)
   set(SHARED_INFERENCE_SRCS
       ${SHARED_INFERENCE_SRCS}
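
Note on the instrumentation this diff renames: when built with -DWITH_NVTX=ON (the AND NOT WIN32 guards skip Windows), NaiveExecutor::Run() pushes one outer NVTX range named "model" (yellow) around the whole op loop, plus a nested per-op range named op->Type() + "|" + <first output var> (green), so a profiler timeline shows each operator as a child of the model range. The sketch below reproduces the same push/pop nesting with the plain NVTX C API rather than Paddle's platform::CudaNvtx* wrappers; it is a minimal standalone illustration, not Paddle code, assuming only the CUDA toolkit's nvToolsExt header and library, and the op names in it are made up.

// nvtx_nesting_sketch.cc -- sketch of the nested ranges used above.
// Build (assumed): nvcc nvtx_nesting_sketch.cc -o sketch -lnvToolsExt
#include <nvToolsExt.h>
#include <string>
#include <vector>

int main() {
  nvtxRangePushA("model");  // outer range, like the "model" push in Run()
  const std::vector<std::string> ops = {"conv2d|conv0.out", "relu|relu0.out"};
  for (const auto &name : ops) {
    nvtxRangePushA(name.c_str());  // per-op range, like Type() + "|" + output
    // ... the operator body would execute here ...
    nvtxRangePop();  // close the per-op range
  }
  nvtxRangePop();  // close the outer "model" range
  return 0;
}

Capturing such a binary with Nsight Systems (e.g. nsys profile ./sketch) should show the per-op ranges nested under "model", which is the view the PADDLE_WITH_NVTX blocks in naive_executor.cc are meant to provide.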