Repair nccl op test #8575

Merged: 12 commits, Mar 13, 2018
8 changes: 4 additions & 4 deletions cmake/generic.cmake
@@ -242,11 +242,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
@@ -308,8 +308,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
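Context for the glog addition above: the repaired test uses glog's logging macros directly, so cc_test/nv_test targets now link glog explicitly. A minimal sketch of the kind of usage that requires this, assuming main() is supplied by the linked gtest_main / paddle_gtest_main library (the test name below is hypothetical, not from this PR):

#include <glog/logging.h>
#include <gtest/gtest.h>

// Hypothetical test: LOG/VLOG resolve to glog symbols, so the target must link glog.
TEST(GlogUsageExample, LogsFromTest) {
  int dev_count = 2;  // placeholder value; the real test queries the CUDA runtime
  LOG(WARNING) << "device count is " << dev_count;
  VLOG(1) << "verbose output, visible with --v=1";
  ASSERT_GE(dev_count, 0);
}
// main() is expected to come from gtest_main / paddle_gtest_main at link time.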
4 changes: 1 addition & 3 deletions paddle/fluid/operators/CMakeLists.txt
@@ -199,8 +199,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
1 change: 0 additions & 1 deletion paddle/fluid/operators/nccl_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"

namespace paddle {
namespace operators {
163 changes: 64 additions & 99 deletions paddle/fluid/operators/nccl_op_test.cu.cc
@@ -14,19 +14,15 @@ limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework;
namespace p = paddle::platform;

static std::vector<int> gpu_list;

// test data amount
const f::DDim kDims = {100, 100};
const f::DDim kDims = {20, 20};

// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list_.emplace_back(i);
}

paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
}

NCCLInitOp();
}

virtual void TearDown() override {
for (auto &device_context : dev_ctxs) {
for (auto &device_context : dev_ctxs_) {
delete device_context;
}
}
@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});

auto *var = g_scope.Var("comm");
auto *var = g_scope_.Var("comm");
var->GetMutable<p::Communicator>();

auto *scope_var = g_scope_.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list_.size());

auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
op->Run(g_scope_, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}

int GetGPUData(int gpu_id) { return gpu_id + 42; }

template <class T>
void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu);
std::unique_lock<std::mutex> lk(mu_);
const f::OpDesc *op1 = &op_desc;

p::CUDAPlace place(gpu_id);
auto &ctx = dev_ctxs.at(gpu_id);
auto &ctx = dev_ctxs_.at(gpu_id);

auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();

if (!send_tensor->numel()) {
send_tensor->Resize(kDims);
send_tensor->mutable_data<T>(kDims, place);

std::vector<T> send_vector(f::product(kDims), gpu_id);
std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
}

@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test {
}

public:
std::vector<p::DeviceContext *> dev_ctxs;
f::Scope g_scope;
std::mutex mu;
std::vector<p::DeviceContext *> dev_ctxs_;
f::Scope g_scope_;
std::mutex mu_;
std::vector<int> gpu_list_;
};

// ncclInitOp with desc
TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);

op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});

f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;

auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();

auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
TEST_F(NCCLTester, ncclInitOp) {}

// ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) {
@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

// check results
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}

for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[i]);
p::CUDAPlace gpu_place(gpu_list_[i]);

auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
@@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}
}
@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

// check results on
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[kRoot]);
p::CUDAPlace gpu_place(gpu_list_[kRoot]);

auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
@@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}

@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

const int idx = 1;
// check results on
float result = kRoot;
float result = GetGPUData(kRoot);

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[idx]);
p::CUDAPlace gpu_place(gpu_list_[idx]);

auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}

int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);

// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
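The restructured test above folds GPU discovery into the fixture's SetUp (replacing the custom main() and the global gpu_list) and gives each GPU a distinct payload via GetGPUData, so collective results are distinguishable from the trivial case. A self-contained sketch of that fixture pattern, with QueryDeviceCount as a hypothetical stand-in for p::GetCUDADeviceCount():

#include <cstdlib>
#include <vector>
#include <gtest/gtest.h>

// Same per-GPU payload idea as the test above: give each device a distinct value.
static int GetGPUData(int gpu_id) { return gpu_id + 42; }

class MultiGPUTest : public ::testing::Test {
 protected:
  void SetUp() override {
    int count = QueryDeviceCount();  // hypothetical stand-in for p::GetCUDADeviceCount()
    if (count <= 1) {
      // The repaired test exits early rather than failing on single-GPU machines.
      exit(0);
    }
    for (int i = 0; i < count; ++i) gpu_list_.push_back(i);
  }

  static int QueryDeviceCount() { return 2; }  // placeholder for a CUDA runtime query

  std::vector<int> gpu_list_;
};

TEST_F(MultiGPUTest, AllReduceExpectedSum) {
  // With ncclSum, every GPU ends up holding the sum of all per-GPU inputs.
  float expected = 0.0f;
  for (int id : gpu_list_) expected += GetGPUData(id);
  EXPECT_FLOAT_EQ(expected, 42.0f + 43.0f);  // two devices: GetGPUData(0) + GetGPUData(1)
}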
1 change: 0 additions & 1 deletion paddle/fluid/platform/CMakeLists.txt
@@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_

nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)