Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repair nccl op test #8575

Merged
merged 12 commits into from
Mar 13, 2018
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions paddle/fluid/operators/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context glog gtest)
1 change: 0 additions & 1 deletion paddle/fluid/operators/nccl_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"

namespace paddle {
namespace operators {
Expand Down
99 changes: 33 additions & 66 deletions paddle/fluid/operators/nccl_op_test.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,15 @@ limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
Expand All @@ -41,15 +37,24 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework;
namespace p = paddle::platform;

static std::vector<int> gpu_list;

// test data amount
const f::DDim kDims = {100, 100};
const f::DDim kDims = {20, 20};

// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list.emplace_back(i);
}

paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) {
p::CUDAPlace place(i);
Expand All @@ -70,18 +75,24 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});

auto *var = g_scope.Var("comm");
var->GetMutable<p::Communicator>();

auto *scope_var = g_scope.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list.size());

auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}

int GetGPUData(int gpu_id) { return gpu_id + 42; }
Copy link

@tonyyang-svail tonyyang-svail Mar 8, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is necessary, because

Simply setting the GPU data to gpu_id would not expose the incorrect-linking error. More specifically, Paddle might be compiled against the NCCL 1.3 headers while being dynamically linked against the NCCL 2 shared library.


template <class T>
void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu);
Expand All @@ -97,7 +108,7 @@ class NCCLTester : public ::testing::Test {
send_tensor->Resize(kDims);
send_tensor->mutable_data<T>(kDims, place);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe line 108 is unnecessary.


std::vector<T> send_vector(f::product(kDims), gpu_id);
std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to synchronize here? I think the copying will synchronize the GPU and CPU in line 179.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The copying is cudaMemcpyAsync. Looks like we need to add a wait to line 179...

template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so, because the memory on the CPU side is pageable: the copy does not return until it has completed.
The current CUDA Runtime Documentation states:
Asynchronous(Memcpy):
- For transfers from device memory to pageable host memory, the function will return only once the copy has completed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
Expand All @@ -121,27 +132,11 @@ class NCCLTester : public ::testing::Test {
std::vector<p::DeviceContext *> dev_ctxs;
f::Scope g_scope;
std::mutex mu;
std::vector<int> gpu_list;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Data members of classes should have a trailing underscore.
Refer to the style guide's "Variable Names" section.

};

// ncclInitOp with desc
TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);

op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});

f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;

auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();

auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
TEST_F(NCCLTester, ncclInitOp) {}

// ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) {
Expand All @@ -166,8 +161,10 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
ths[i].join();
}

// check results
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list) {
expected_result = expected_result + GetGPUData(gpu_id);
}

for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place;
Expand All @@ -185,7 +182,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}
}
Expand Down Expand Up @@ -215,8 +212,10 @@ TEST_F(NCCLTester, ncclReduceOp) {
ths[i].join();
}

// check results on
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list) {
expected_result = expected_result + GetGPUData(gpu_id);
}

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[kRoot]);
Expand All @@ -234,7 +233,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}

Expand Down Expand Up @@ -264,8 +263,7 @@ TEST_F(NCCLTester, ncclBcastOp) {
}

const int idx = 1;
// check results on
float result = kRoot;
float result = GetGPUData(kRoot);

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[idx]);
Expand All @@ -285,34 +283,3 @@ TEST_F(NCCLTester, ncclBcastOp) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}

int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);

// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
42 changes: 15 additions & 27 deletions paddle/fluid/platform/nccl_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,17 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"

static int dev_count = 0;

namespace paddle {
namespace platform {

TEST(NCCL, init) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
exit(0);
}
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
Expand Down Expand Up @@ -59,9 +64,16 @@ struct PerThreadData {
}
};

static constexpr int ELEM_COUNT = 10000;
static constexpr int ELEM_COUNT = 100;

TEST(NCCL, all_reduce) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
exit(0);
}
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm";
Expand Down Expand Up @@ -127,27 +139,3 @@ TEST(NCCL, all_reduce) {
}
} // namespace platform
} // namespace paddle

int main(int argc, char** argv) {
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}