Repair nccl op test (#8575)
* fix nccl op unit test

* fix build error

* format code

* refine nccl related unit test

* fix build error

* add setGPUData

* clean up

* follow comments

* rm test_nccl.cu

* follow comment

* rm wait
QiJune committed Mar 13, 2018
1 parent ada82a3 commit 7287630
Showing 6 changed files with 69 additions and 261 deletions.
8 changes: 4 additions & 4 deletions cmake/generic.cmake
@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
4 changes: 1 addition & 3 deletions paddle/fluid/operators/CMakeLists.txt
@@ -222,8 +222,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
1 change: 0 additions & 1 deletion paddle/fluid/operators/nccl_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"

namespace paddle {
namespace operators {
163 changes: 64 additions & 99 deletions paddle/fluid/operators/nccl_op_test.cu.cc
@@ -14,19 +14,15 @@ limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework;
namespace p = paddle::platform;

static std::vector<int> gpu_list;

// test data amount
const f::DDim kDims = {100, 100};
const f::DDim kDims = {20, 20};

// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list_.emplace_back(i);
}

paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
}

NCCLInitOp();
}

virtual void TearDown() override {
for (auto &device_context : dev_ctxs) {
for (auto &device_context : dev_ctxs_) {
delete device_context;
}
}
@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});

auto *var = g_scope.Var("comm");
auto *var = g_scope_.Var("comm");
var->GetMutable<p::Communicator>();

auto *scope_var = g_scope_.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list_.size());

auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
op->Run(g_scope_, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}

int GetGPUData(int gpu_id) { return gpu_id + 42; }

template <class T>
void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu);
std::unique_lock<std::mutex> lk(mu_);
const f::OpDesc *op1 = &op_desc;

p::CUDAPlace place(gpu_id);
auto &ctx = dev_ctxs.at(gpu_id);
auto &ctx = dev_ctxs_.at(gpu_id);

auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();

if (!send_tensor->numel()) {
send_tensor->Resize(kDims);
send_tensor->mutable_data<T>(kDims, place);

std::vector<T> send_vector(f::product(kDims), gpu_id);
std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
}

@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test {
}

public:
std::vector<p::DeviceContext *> dev_ctxs;
f::Scope g_scope;
std::mutex mu;
std::vector<p::DeviceContext *> dev_ctxs_;
f::Scope g_scope_;
std::mutex mu_;
std::vector<int> gpu_list_;
};

// ncclInitOp with desc
TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);

op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});

f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;

auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();

auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
TEST_F(NCCLTester, ncclInitOp) {}

// ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) {
@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

// check results
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}

for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[i]);
p::CUDAPlace gpu_place(gpu_list_[i]);

auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
Expand All @@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}
}
@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

// check results on
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[kRoot]);
p::CUDAPlace gpu_place(gpu_list_[kRoot]);

auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
Expand All @@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}

@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

const int idx = 1;
// check results on
float result = kRoot;
float result = GetGPUData(kRoot);

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[idx]);
p::CUDAPlace gpu_place(gpu_list_[idx]);

auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}

int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);

// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
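
Note: the custom main() deleted above used to guard against single-GPU machines and was short-circuited with an early return because of a CI driver issue; that guard now lives in NCCLTester::SetUp(), and the binary's entry point comes from the paddle_gtest_main target that cc_test/nv_test link. Purely as an illustrative sketch (this is not the actual paddle_gtest_main source, and the gpu_info/place header paths are assumptions), such a shared entry point would do roughly the following:

// Hypothetical sketch of a shared gtest entry point in the spirit of
// paddle_gtest_main; the real implementation in the repository may differ.
#include <gtest/gtest.h>

#include <vector>

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/gpu_info.h"  // assumed location of GetCUDADeviceCount
#include "paddle/fluid/platform/place.h"     // assumed location of CPUPlace/CUDAPlace

int main(int argc, char **argv) {
  testing::InitGoogleTest(&argc, argv);

  // Register one device context per place so that fixtures such as
  // NCCLTester can construct CUDADeviceContexts in SetUp().
  std::vector<paddle::platform::Place> places;
  places.emplace_back(paddle::platform::CPUPlace());
  for (int i = 0; i < paddle::platform::GetCUDADeviceCount(); ++i) {
    places.emplace_back(paddle::platform::CUDAPlace(i));
  }
  paddle::platform::DeviceContextPool::Init(places);

  // As the removed main() noted, device contexts should be released before
  // scopes, otherwise the driver may fail during teardown.
  return RUN_ALL_TESTS();
}

Keeping this logic in one shared target gives every cc_test/nv_test binary consistent device initialization without duplicating a main() in each test file.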
1 change: 0 additions & 1 deletion paddle/fluid/platform/CMakeLists.txt
@@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_

nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
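
The standalone nccl_test.cu target is removed here; its coverage is folded into the operator-level nccl_op_test above. For context on what the wrapped call does, below is a minimal raw all-reduce sketch written directly against the public CUDA runtime and NCCL 2 APIs. It is not taken from the deleted nccl_test.cu and does not use Paddle; treat it only as an illustration of ncclAllReduce.

// Minimal raw-NCCL all-reduce sketch (illustrative only; not the deleted
// nccl_test.cu). Requires NCCL >= 2.0 for the group-call API.
#include <cuda_runtime.h>
#include <nccl.h>

#include <vector>

int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  if (ndev < 2) return 0;  // nothing to reduce across a single device

  // One communicator per visible device (devlist == nullptr -> devices 0..ndev-1).
  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, nullptr);

  const size_t count = 20 * 20;  // same element count as kDims in the op test
  std::vector<float *> send(ndev), recv(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaMalloc(reinterpret_cast<void **>(&send[i]), count * sizeof(float));
    cudaMalloc(reinterpret_cast<void **>(&recv[i]), count * sizeof(float));
    cudaMemset(send[i], 0, count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // Group the per-device calls so NCCL launches them as one collective.
  ncclGroupStart();
  for (int i = 0; i < ndev; ++i) {
    ncclAllReduce(send[i], recv[i], count, ncclFloat, ncclSum, comms[i],
                  streams[i]);
  }
  ncclGroupEnd();

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
    cudaFree(send[i]);
    cudaFree(recv[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}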