Repair nccl op test #8575

Merged: 12 commits, Mar 13, 2018
8 changes: 4 additions & 4 deletions cmake/generic.cmake
@@ -242,11 +242,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
@@ -308,8 +308,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
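Context for the glog addition above: the repaired test uses glog's logging macros directly, so cc_test/nv_test targets now link glog explicitly. A minimal sketch of the kind of usage that requires this, assuming main() is supplied by the linked gtest_main / paddle_gtest_main library (the test name below is hypothetical, not from this PR):

#include <glog/logging.h>
#include <gtest/gtest.h>

// Hypothetical test: LOG/VLOG resolve to glog symbols, so the target must link glog.
TEST(GlogUsageExample, LogsFromTest) {
  int dev_count = 2;  // placeholder value; the real test queries the CUDA runtime
  LOG(WARNING) << "device count is " << dev_count;
  VLOG(1) << "verbose output, visible with --v=1";
  ASSERT_GE(dev_count, 0);
}
// main() is expected to come from gtest_main / paddle_gtest_main at link time.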
4 changes: 1 addition & 3 deletions paddle/fluid/operators/CMakeLists.txt
@@ -199,8 +199,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
1 change: 0 additions & 1 deletion paddle/fluid/operators/nccl_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"

namespace paddle {
namespace operators {
163 changes: 64 additions & 99 deletions paddle/fluid/operators/nccl_op_test.cu.cc
@@ -14,19 +14,15 @@ limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework;
namespace p = paddle::platform;

static std::vector<int> gpu_list;

// test data amount
const f::DDim kDims = {100, 100};
const f::DDim kDims = {20, 20};

// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list_.emplace_back(i);
}

paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
}

NCCLInitOp();
}

virtual void TearDown() override {
for (auto &device_context : dev_ctxs) {
for (auto &device_context : dev_ctxs_) {
delete device_context;
}
}
@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});

auto *var = g_scope.Var("comm");
auto *var = g_scope_.Var("comm");
var->GetMutable<p::Communicator>();

auto *scope_var = g_scope_.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list_.size());

auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
op->Run(g_scope_, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}

int GetGPUData(int gpu_id) { return gpu_id + 42; }

template <class T>
void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu);
std::unique_lock<std::mutex> lk(mu_);
const f::OpDesc *op1 = &op_desc;

p::CUDAPlace place(gpu_id);
auto &ctx = dev_ctxs.at(gpu_id);
auto &ctx = dev_ctxs_.at(gpu_id);

auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();

if (!send_tensor->numel()) {
send_tensor->Resize(kDims);
send_tensor->mutable_data<T>(kDims, place);

std::vector<T> send_vector(f::product(kDims), gpu_id);
std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
}

@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test {
}

public:
std::vector<p::DeviceContext *> dev_ctxs;
f::Scope g_scope;
std::mutex mu;
std::vector<p::DeviceContext *> dev_ctxs_;
f::Scope g_scope_;
std::mutex mu_;
std::vector<int> gpu_list_;
};

// ncclInitOp with desc
TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);

op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});

f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;

auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();

auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
TEST_F(NCCLTester, ncclInitOp) {}

// ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) {
@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

// check results
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}

for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[i]);
p::CUDAPlace gpu_place(gpu_list_[i]);

auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
@@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}
}
@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

// check results on
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[kRoot]);
p::CUDAPlace gpu_place(gpu_list_[kRoot]);

auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
@@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}

@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) {

std::vector<std::thread> ths;

for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}

for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}

const int idx = 1;
// check results on
float result = kRoot;
float result = GetGPUData(kRoot);

p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[idx]);
p::CUDAPlace gpu_place(gpu_list_[idx]);

auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);

paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());

for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}

int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}

std::vector<paddle::platform::Place> places;

places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}

VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);

testing::InitGoogleTest(&argc, argv);

// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
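The restructured test above folds GPU discovery into the fixture's SetUp (replacing the custom main() and the global gpu_list) and gives each GPU a distinct payload via GetGPUData, so collective results are distinguishable from the trivial case. A self-contained sketch of that fixture pattern, with QueryDeviceCount as a hypothetical stand-in for p::GetCUDADeviceCount():

#include <cstdlib>
#include <vector>
#include <gtest/gtest.h>

// Same per-GPU payload idea as the test above: give each device a distinct value.
static int GetGPUData(int gpu_id) { return gpu_id + 42; }

class MultiGPUTest : public ::testing::Test {
 protected:
  void SetUp() override {
    int count = QueryDeviceCount();  // hypothetical stand-in for p::GetCUDADeviceCount()
    if (count <= 1) {
      // The repaired test exits early rather than failing on single-GPU machines.
      exit(0);
    }
    for (int i = 0; i < count; ++i) gpu_list_.push_back(i);
  }

  static int QueryDeviceCount() { return 2; }  // placeholder for a CUDA runtime query

  std::vector<int> gpu_list_;
};

TEST_F(MultiGPUTest, AllReduceExpectedSum) {
  // With ncclSum, every GPU ends up holding the sum of all per-GPU inputs.
  float expected = 0.0f;
  for (int id : gpu_list_) expected += GetGPUData(id);
  EXPECT_FLOAT_EQ(expected, 42.0f + 43.0f);  // two devices: GetGPUData(0) + GetGPUData(1)
}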
1 change: 0 additions & 1 deletion paddle/fluid/platform/CMakeLists.txt
@@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_

nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)

cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)