From 49e87ee327a7115cca08384e24e043fbe73413c5 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 27 Jun 2017 14:19:34 +0800
Subject: [PATCH] Change the CpuMatrix::copyFrom and CpuVector::copyFrom with
 the stream parameter to the synchronous interface.

---
 paddle/gserver/layers/Layer.cpp          |  5 +++++
 paddle/math/Matrix.cpp                   |  2 ++
 paddle/math/Matrix.h                     |  3 ++-
 paddle/math/Vector.cpp                   |  2 ++
 paddle/math/Vector.h                     |  8 ++++----
 paddle/math/tests/test_matrixCompare.cpp | 14 ++++++++++++++
 6 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 125aaf947f3c9..4b92b5d163ad1 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -191,6 +191,11 @@ void Layer::addOutputArgument(int deviceId) {
 void Layer::copyOutputToOtherDevice() {
   for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
     SetDevice device(outputOtherDevice_[i].deviceId);
+    // If outputOtherDevice_[i].value is a CpuMatrix,
+    // the copyFrom is a synchronous interface.
+    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
+    // calculations are all on HPPL_STREAM_DEFAULT,
+    // copyFrom can be an asynchronous interface.
     outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
                                           HPPL_STREAM_DEFAULT);
     outputOtherDevice_[i].sequenceStartPositions =
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c910146164ebf..4431d613f655c 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1565,6 +1565,8 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
                     const_cast<real*>(src.getData()),
                     sizeof(real) * elementCnt_,
                     stream);
+    // There is a need to add synchronization to ensure that the data is copied.
+    hl_stream_synchronize(stream);
   } else if (typeid(src) == typeid(CpuMatrix)) {
     memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
   } else {
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 748be850b4c90..7dfd593225065 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -239,7 +239,8 @@ class Matrix : public BaseMatrix {
     LOG(FATAL) << "Not implemented";
   }
 
-  // asynchronous copy
+  // For GpuMatrix this is an asynchronous copy interface
+  // For CpuMatrix this is an synchronous copy interface
   virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
     LOG(FATAL) << "Not implemented";
   }
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index c519ca500afb1..eb87ee9bb7936 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -657,6 +657,8 @@ void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
                     (void*)src.getData(),
                     sizeof(T) * this->getSize(),
                     stream);
+    // There is a need to add synchronization to ensure that the data is copied.
+    hl_stream_synchronize(stream);
   } else {
     src.copyTo(this);
   }
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 9af6e30c9e138..80b9775fccf10 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -168,11 +168,11 @@ class VectorT : public BaseVector<T> {
   virtual void copyFrom(const VectorT<T>& src) = 0;
 
   /**
-   * If use_gpu, this function will push the copy-task to the specifed-stream
-   * and return immediately.
+   * If GpuVector, this function is an asynchronous interface,
+   * will push the copy-task to the specifed-stream and return immediately.
    *
-   * If not use GPU, this function is same as
-   * the copyFrom(const VectorT<T>& src), which use stream HPPL_STREAM_DEFAULT.
+   * If CpuVector, this function is an synchronous interface,
+   * same as the copyFrom(const VectorT<T>& src).
    */
   virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;
 
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 5a0dffe086c4e..354f58df39365 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1127,4 +1127,18 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
+TEST(CpuMatrix, copyFrom) {
+  const size_t height = 1000;
+  const size_t width = 1000;
+  CpuMatrix cpu(height, width);
+  GpuMatrix gpu(height, width);
+  CpuMatrix copy(height, width);
+
+  cpu.randomizeUniform();
+  gpu.copyFrom(cpu);
+  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
+
+  TensorCheckEqual(cpu, copy);
+}
+
 #endif