Repair nccl op test #8575

Merged: 12 commits, Mar 13, 2018

Changes from 5 commits
72 changes: 19 additions & 53 deletions paddle/fluid/operators/nccl_op_test.cu.cc
@@ -41,15 +41,24 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework;
namespace p = paddle::platform;

static std::vector<int> gpu_list;

// test data amount
const f::DDim kDims = {100, 100};
const f::DDim kDims = {20, 20};

// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list.emplace_back(i);
}

paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) {
p::CUDAPlace place(i);
@@ -70,12 +79,16 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});

auto *var = g_scope.Var("comm");
var->GetMutable<p::Communicator>();

auto *scope_var = g_scope.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list.size());

auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
@@ -121,27 +134,11 @@ class NCCLTester : public ::testing::Test {
std::vector<p::DeviceContext *> dev_ctxs;
f::Scope g_scope;
std::mutex mu;
std::vector<int> gpu_list;
Review comment (Contributor), on the new gpu_list member:

Data members of classes should have a trailing underscore. Refer: Variable Names. A minimal naming sketch is included after the class body below.

};
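
A minimal sketch of the member-naming convention the reviewer is asking for; the fixture name NCCLTesterRenamed is hypothetical, the snippet is not the PR's actual code, and it assumes this file's existing f/p namespace aliases plus the gtest, mutex, and vector headers already included here:

// Illustrative only: the fixture's data members renamed with a trailing
// underscore, as the review comment suggests.
class NCCLTesterRenamed : public ::testing::Test {
 protected:
  std::vector<p::DeviceContext *> dev_ctxs_;  // was dev_ctxs
  f::Scope g_scope_;                          // was g_scope
  std::mutex mu_;                             // was mu
  std::vector<int> gpu_list_;                 // was gpu_list
};
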

// ncclInitOp with desc
TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);

op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});

f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;

auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();

auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
TEST_F(NCCLTester, ncclInitOp) {}

// ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) {
@@ -285,34 +282,3 @@ TEST_F(NCCLTester, ncclBcastOp) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}

int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);

// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
42 changes: 15 additions & 27 deletions paddle/fluid/platform/nccl_test.cu
@@ -25,12 +25,17 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"

static int dev_count = 0;

namespace paddle {
namespace platform {

TEST(NCCL, init) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
exit(0);
}
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
@@ -59,9 +64,16 @@ struct PerThreadData {
}
};

static constexpr int ELEM_COUNT = 10000;
static constexpr int ELEM_COUNT = 100;

TEST(NCCL, all_reduce) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
exit(0);
}
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm";
@@ -127,27 +139,3 @@ TEST(NCCL, all_reduce) {
}
} // namespace platform
} // namespace paddle

int main(int argc, char** argv) {
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}