From 49e87ee327a7115cca08384e24e043fbe73413c5 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 27 Jun 2017 14:19:34 +0800 Subject: [PATCH] Change the CpuMatrix::copyFrom and CpuVector::copyFrom with the stream parameter to the synchronous interface. --- paddle/gserver/layers/Layer.cpp | 5 +++++ paddle/math/Matrix.cpp | 2 ++ paddle/math/Matrix.h | 3 ++- paddle/math/Vector.cpp | 2 ++ paddle/math/Vector.h | 8 ++++---- paddle/math/tests/test_matrixCompare.cpp | 14 ++++++++++++++ 6 files changed, 29 insertions(+), 5 deletions(-) diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 125aaf947f3c9..4b92b5d163ad1 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -191,6 +191,11 @@ void Layer::addOutputArgument(int deviceId) { void Layer::copyOutputToOtherDevice() { for (size_t i = 0; i != outputOtherDevice_.size(); i++) { SetDevice device(outputOtherDevice_[i].deviceId); + // If outputOtherDevice_[i].value is a CpuMatrix, + // the copyFrom is a synchronous interface. + // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent + // calculations are all on HPPL_STREAM_DEFAULT, + // copyFrom can be an asynchronous interface. outputOtherDevice_[i].value->copyFrom(*getOutputValue(), HPPL_STREAM_DEFAULT); outputOtherDevice_[i].sequenceStartPositions = diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index c910146164ebf..4431d613f655c 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1565,6 +1565,8 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { const_cast(src.getData()), sizeof(real) * elementCnt_, stream); + // There is a need to add synchronization to ensure that the data is copied. + hl_stream_synchronize(stream); } else if (typeid(src) == typeid(CpuMatrix)) { memcpy(data_, src.getData(), sizeof(real) * elementCnt_); } else { diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 748be850b4c90..7dfd593225065 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -239,7 +239,8 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemented"; } - // asynchronous copy + // For GpuMatrix this is an asynchronous copy interface + // For CpuMatrix this is an synchronous copy interface virtual void copyFrom(const Matrix& src, hl_stream_t stream) { LOG(FATAL) << "Not implemented"; } diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index c519ca500afb1..eb87ee9bb7936 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -657,6 +657,8 @@ void CpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { (void*)src.getData(), sizeof(T) * this->getSize(), stream); + // There is a need to add synchronization to ensure that the data is copied. + hl_stream_synchronize(stream); } else { src.copyTo(this); } diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h index 9af6e30c9e138..80b9775fccf10 100644 --- a/paddle/math/Vector.h +++ b/paddle/math/Vector.h @@ -168,11 +168,11 @@ class VectorT : public BaseVector { virtual void copyFrom(const VectorT& src) = 0; /** - * If use_gpu, this function will push the copy-task to the specifed-stream - * and return immediately. + * If GpuVector, this function is an asynchronous interface, + * will push the copy-task to the specifed-stream and return immediately. * - * If not use GPU, this function is same as - * the copyFrom(const VectorT& src), which use stream HPPL_STREAM_DEFAULT. + * If CpuVector, this function is an synchronous interface, + * same as the copyFrom(const VectorT& src). */ virtual void copyFrom(const VectorT& src, hl_stream_t stream) = 0; diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 5a0dffe086c4e..354f58df39365 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1127,4 +1127,18 @@ TEST(Matrix, MaxOutFwdBwd) { } } +TEST(CpuMatrix, copyFrom) { + const size_t height = 1000; + const size_t width = 1000; + CpuMatrix cpu(height, width); + GpuMatrix gpu(height, width); + CpuMatrix copy(height, width); + + cpu.randomizeUniform(); + gpu.copyFrom(cpu); + copy.copyFrom(gpu, HPPL_STREAM_DEFAULT); + + TensorCheckEqual(cpu, copy); +} + #endif