Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repair nccl op test #8575

Merged
merged 12 commits into from
Mar 13, 2018
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions paddle/fluid/operators/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context glog gtest)
1 change: 0 additions & 1 deletion paddle/fluid/operators/nccl_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"

namespace paddle {
namespace operators {
Expand Down
99 changes: 33 additions & 66 deletions paddle/fluid/operators/nccl_op_test.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,15 @@ limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
Expand All @@ -41,15 +37,24 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework;
namespace p = paddle::platform;

static std::vector<int> gpu_list;

// test data amount
const f::DDim kDims = {100, 100};
const f::DDim kDims = {20, 20};

// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list.emplace_back(i);
}

paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) {
p::CUDAPlace place(i);
Expand All @@ -70,18 +75,24 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});

auto *var = g_scope.Var("comm");
var->GetMutable<p::Communicator>();

auto *scope_var = g_scope.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list.size());

auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}

int GetGPUData(int gpu_id) { return gpu_id + 42; }
Copy link

@tonyyang-svail tonyyang-svail Mar 8, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is necessary, because

Simply setting the GPU data to gpu_id would not expose the incorrect-linking error. More specifically, Paddle might be compiled against the NCCL 1.3 headers while being dynamically linked against the NCCL 2 shared library.


template <class T>
void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu);
Expand All @@ -97,7 +108,7 @@ class NCCLTester : public ::testing::Test {
send_tensor->Resize(kDims);
send_tensor->mutable_data<T>(kDims, place);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe line 108 is unnecessary.


std::vector<T> send_vector(f::product(kDims), gpu_id);
std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to synchronize here? I think the copying will synchronize the GPU and CPU in line 179.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The copying is cudaMemcpyAsync. Looks like we need to add a wait to line 179...

template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so, because the memory on the CPU side is pageable: the copy does not return until it has completed.
The current CUDA Runtime Documentation states:
Asynchronous(Memcpy):
- For transfers from device memory to pageable host memory, the function will return only once the copy has completed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
Expand All @@ -121,27 +132,11 @@ class NCCLTester : public ::testing::Test {
std::vector<p::DeviceContext *> dev_ctxs;
f::Scope g_scope;
std::mutex mu;
std::vector<int> gpu_list;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Data members of classes should have a trailing underscore.
Refer to the style guide's "Variable Names" section.

};

// ncclInitOp with desc
TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);

op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});

f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;

auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();

auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
TEST_F(NCCLTester, ncclInitOp) {}

// ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) {
Expand All @@ -166,8 +161,10 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
ths[i].join();
}

// check results
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list) {
expected_result = expected_result + GetGPUData(gpu_id);
}

for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place;
Expand All @@ -185,7 +182,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}
}
Expand Down Expand Up @@ -215,8 +212,10 @@ TEST_F(NCCLTester, ncclReduceOp) {
ths[i].join();
}

// check results on
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list) {
expected_result = expected_result + GetGPUData(gpu_id);
}

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[kRoot]);
Expand All @@ -234,7 +233,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}

Expand Down Expand Up @@ -264,8 +263,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
}

const int idx = 1;
// check results on
float result = kRoot;
float result = GetGPUData(kRoot);

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[idx]);
Expand All @@ -285,34 +283,3 @@ TEST_F(NCCLTester, ncclBcastOp) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}

int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);

// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
42 changes: 15 additions & 27 deletions paddle/fluid/platform/nccl_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,17 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"

static int dev_count = 0;

namespace paddle {
namespace platform {

TEST(NCCL, init) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
exit(0);
}
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
Expand Down Expand Up @@ -59,9 +64,16 @@ struct PerThreadData {
}
};

static constexpr int ELEM_COUNT = 10000;
static constexpr int ELEM_COUNT = 100;

TEST(NCCL, all_reduce) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
exit(0);
}
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm";
Expand Down Expand Up @@ -127,27 +139,3 @@ TEST(NCCL, all_reduce) {
}
} // namespace platform
} // namespace paddle

int main(int argc, char** argv) {
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}