From dee48d8f91acf92d806242c3ca20aa2fe2436dbf Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Wed, 21 Feb 2018 18:05:17 -0800
Subject: [PATCH 01/14] test cpu float16 data transform

---
 paddle/fluid/framework/data_transform.cc      |   1 +
 paddle/fluid/framework/data_type.h            |  10 +-
 paddle/fluid/framework/data_type_transform.cc |  13 +-
 .../framework/data_type_transform_test.cc     | 121 ++++++++++++++++++
 4 files changed, 143 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 0475fc1d9aaed..bfad9ac1e9cad 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
     PassTensorData(&out, &in);
   }
 
+  // do data type transform
   if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
     TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
     transformed = true;
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 1dec766a345d8..4c1b3e7581fe7 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -16,13 +16,16 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace framework {
 
 inline proto::VarType::Type ToDataType(std::type_index type) {
   using namespace paddle::framework::proto;
-  if (typeid(float).hash_code() == type.hash_code()) {
+  if (typeid(platform::float16).hash_code() == type.hash_code()) {
+    return proto::VarType::FP16;
+  } else if (typeid(float).hash_code() == type.hash_code()) {
     return proto::VarType::FP32;
   } else if (typeid(double).hash_code() == type.hash_code()) {
     return proto::VarType::FP64;
@@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
 inline std::type_index ToTypeIndex(proto::VarType::Type type) {
   using namespace paddle::framework::proto;
   switch (type) {
+    case proto::VarType::FP16:
+      return typeid(platform::float16);
     case proto::VarType::FP32:
       return typeid(float);
     case proto::VarType::FP64:
@@ -59,6 +64,9 @@ template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
   using namespace paddle::framework::proto;
   switch (type) {
+    case proto::VarType::FP16:
+      visitor.template operator()<platform::float16>();
+      break;
     case proto::VarType::FP32:
      visitor.template operator()<float>();
      break;
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index 54cc1575d8802..4087180f1899a 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -47,9 +47,16 @@ struct CastDataType {
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
+#ifdef PADDLE_WITH_CUDA
+    } else if (platform::is_gpu_place(in_.place())) {
+      platform::Transform<platform::CUDADeviceContext> trans;
+      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+#endif
     } else {
       // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
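// ---------------------------------------------------------------------------
// A standalone sketch (not part of the patch): the CPU branch above boils
// down to an element-wise static_cast driven by platform::Transform, which
// on CPUPlace dispatches to std::transform. CastFunctor below is an
// illustrative stand-in for the patch's CastDataTypeFunctor; the GPU branch
// being added is the same pattern with Transform specialized for
// CUDADeviceContext so the functor runs on the device.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

template <typename InType, typename OutType>
struct CastFunctor {
  OutType operator()(InType in) const { return static_cast<OutType>(in); }
};

int main() {
  std::vector<float> in = {0.5f, 1.5f, 2.5f};
  std::vector<int32_t> out(in.size());
  // Element-wise cast over the input range, exactly what the patch does.
  std::transform(in.begin(), in.end(), out.begin(),
                 CastFunctor<float, int32_t>());
  for (int32_t v : out) std::cout << v << ' ';  // prints "0 1 2 "
  std::cout << '\n';
  return 0;
}
// ---------------------------------------------------------------------------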
- PADDLE_THROW("Unsupport CPU <-> GPU!"); + PADDLE_THROW("Unsupported place!"); } } }; @@ -65,6 +72,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var, auto ctx = pool.Get(in.place()); switch (src_type) { + case proto::VarType::FP16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; case proto::VarType::FP32: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index 724c8c301f25c..94f0cf1adbb35 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -51,3 +51,124 @@ TEST(DataTypeTransform, CPUTransform) { ASSERT_EQ(out_data_int[i], static_cast(i / 3)); } } + +TEST(DataTypeTransform, CPUFloat16) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int32 = OpKernelType(proto::VarType::INT32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int64 = OpKernelType(proto::VarType::INT64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + + auto place = CPUPlace(); + + Tensor in; + Tensor out; + + float16* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from float16 to other data types + TransDataType(kernel_fp16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_fp64, in, &out); + float* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int32, in, &out); + float* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int64, in, &out); + float* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_bool, in, &out); + float* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + TransDataType(kernel_fp32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + TransDataType(kernel_fp64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* 
in_data_int = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + TransDataType(kernel_int32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + TransDataType(kernel_int64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + TransDataType(kernel_bool, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + } +} From 1f8ec845001b7e19943210a5809a042385866065 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 21 Feb 2018 20:15:35 -0800 Subject: [PATCH 02/14] add isnan etc --- paddle/fluid/platform/float16.h | 42 +++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index cf6a4b09dbd2d..8bea214119076 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -734,6 +734,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { } #endif +HOSTDEVICE inline bool(isinf)(const float16& a) { + return (a.x & 0x7fff) == 0x7c00; +} + +HOSTDEVICE inline bool(isnan)(const float16& a) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hisnan(half(a)); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} + +HOSTDEVICE inline bool(isfinite)(const float16& a) { + return !((isinf)(a)) && !((isnan)(a)); +} + } // namespace platform } // namespace paddle @@ -755,3 +771,29 @@ struct is_pod { }; } // namespace std + +#ifdef __CUDA_ARCH__ +namespace Eigen { +namespace numext { + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)( + const paddle::platform::float16& a) { + return (paddle::platform::float16::isnan)(a); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)( + const paddle::platform::float16& a) { + return (paddle::platform::float16::isinf)(h); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)( + const paddle::platform::float16& a) { + return (paddle::platform::float16::isfinite)(h); +} + +} // namespace numext +} // namespace Eigen +#endif // __CUDA_ARCH__ From 15a664199e45186f7056a91cd08941a99dbcf97e Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 21 Feb 2018 23:27:26 -0800 Subject: [PATCH 03/14] small fix --- paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/data_type_transform_test.cc | 12 ++++++------ paddle/fluid/operators/math/math_function.cc | 12 +++++++----- paddle/fluid/platform/float16.h | 6 +++--- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 4087180f1899a..489fe82204e1e 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -47,7 +47,7 @@ struct CastDataType { auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, out_begin, CastDataTypeFunctor()); -#ifdef 
PADDLE_WITH_CUDA +#ifdef __NVCC__ } else if (platform::is_gpu_place(in_.place())) { platform::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index 94f0cf1adbb35..a18de4337a158 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -56,6 +56,8 @@ TEST(DataTypeTransform, CPUFloat16) { using namespace paddle::framework; using namespace paddle::platform; + auto place = CPUPlace(); + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, @@ -69,8 +71,6 @@ TEST(DataTypeTransform, CPUFloat16) { auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, DataLayout::kAnyLayout, LibraryType::kPlain); - auto place = CPUPlace(); - Tensor in; Tensor out; @@ -89,25 +89,25 @@ TEST(DataTypeTransform, CPUFloat16) { } TransDataType(kernel_fp16, kernel_fp64, in, &out); - float* out_data_double = out.data(); + double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int32, in, &out); - float* out_data_int = out.data(); + int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int64, in, &out); - float* out_data_int64 = out.data(); + int64_t* out_data_int64 = out.data(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_bool, in, &out); - float* out_data_bool = out.data(); + bool* out_data_bool = out.data(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); } diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 41eab3ade207a..f7f33917d7ef5 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -245,11 +245,13 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_CPU_TRANS(1); diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 8bea214119076..ae27c9ec63a9a 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -779,19 +779,19 @@ namespace numext { template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)( const paddle::platform::float16& a) { - return (paddle::platform::float16::isnan)(a); + return (paddle::platform::isnan)(a); } template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)( const paddle::platform::float16& a) { - return (paddle::platform::float16::isinf)(h); + return (paddle::platform::isinf)(a); } template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)( const paddle::platform::float16& a) { - return (paddle::platform::float16::isfinite)(h); + return (paddle::platform::isfinite)(a); } } // namespace numext From 088c75a6a76e195041f5621fdc0243d63d294021 Mon Sep 17 00:00:00 2001 
From: Kexin Zhao Date: Mon, 26 Feb 2018 20:57:47 -0800 Subject: [PATCH 04/14] fix containsNAN test error --- paddle/fluid/framework/tensor_util_test.cc | 56 ++++++++++++++------ paddle/fluid/framework/tensor_util_test.cu | 61 +++++++++++++++++----- paddle/fluid/platform/float16.h | 28 ++++++---- 3 files changed, 104 insertions(+), 41 deletions(-) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 8aebfcb3b624f..ca849ee9d90c8 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -235,27 +235,51 @@ TEST(TensorToVector, Tensor) { TEST(TensorContainsNAN, CPU) { using namespace paddle::framework; using namespace paddle::platform; - Tensor src; - float* buf = src.mutable_data({3}, CPUPlace()); - buf[0] = 0.0; - buf[1] = NAN; - buf[2] = 0.0; - ASSERT_TRUE(TensorContainsNAN(src)); - buf[1] = 0.0; - ASSERT_FALSE(TensorContainsNAN(src)); + { + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsNAN(src)); + buf[1] = 0.0; + ASSERT_FALSE(TensorContainsNAN(src)); + } + { + Tensor src; + float16* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1].x = 0x7fff; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsNAN(src)); + buf[1] = 0.0; + ASSERT_FALSE(TensorContainsNAN(src)); + } } TEST(TensorContainsInf, CPU) { using namespace paddle::framework; using namespace paddle::platform; - Tensor src; - double* buf = src.mutable_data({3}, CPUPlace()); - buf[0] = 1.0; - buf[1] = INFINITY; - buf[2] = 0.0; - ASSERT_TRUE(TensorContainsInf(src)); - buf[1] = 1.0; - ASSERT_FALSE(TensorContainsInf(src)); + { + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsInf(src)); + buf[1] = 1.0; + ASSERT_FALSE(TensorContainsInf(src)); + } + { + Tensor src; + float16* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1].x = 0x7c00; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsInf(src)); + buf[1] = 1.0; + ASSERT_FALSE(TensorContainsInf(src)); + } } TEST(Tensor, FromAndToStream) { diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu index d630ec44a2aa6..4766ec28aa3cf 100644 --- a/paddle/fluid/framework/tensor_util_test.cu +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) { buf[1] = 0.1; buf[2] = NAN; } + static __global__ void FillInf(float* buf) { buf[0] = 0.0; buf[1] = INFINITY; buf[2] = 0.5; } +static __global__ void FillNAN(platform::float16* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2].x = 0x7fff; +} + +static __global__ void FillInf(platform::float16* buf) { + buf[0] = 0.0; + buf[1].x = 0x7c00; + buf[2] = 0.5; +} + TEST(TensorContainsNAN, GPU) { - Tensor tensor; - platform::CUDAPlace gpu(0); - auto& pool = platform::DeviceContextPool::Instance(); + using namespace paddle::platform; + CUDAPlace gpu(0); + auto& pool = DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); - float* buf = tensor.mutable_data({3}, gpu); - FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); - cuda_ctx->Wait(); - ASSERT_TRUE(TensorContainsNAN(tensor)); + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, 
cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } } TEST(TensorContainsInf, GPU) { - Tensor tensor; - platform::CUDAPlace gpu(0); - auto& pool = platform::DeviceContextPool::Instance(); + using namespace paddle::platform; + CUDAPlace gpu(0); + auto& pool = DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); - float* buf = tensor.mutable_data({3}, gpu); - FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); - cuda_ctx->Wait(); - ASSERT_TRUE(TensorContainsInf(tensor)); + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } } } // namespace framework diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 202527f83a8e1..52fb8c2531357 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -20,10 +20,6 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA -#include "unsupported/Eigen/CXX11/Tensor" - -#include "paddle/fluid/platform/hostdevice.h" - #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) #else @@ -64,6 +60,18 @@ limitations under the License. */ namespace paddle { namespace platform { +// Forward declare float16 for eigen.h +struct float16; + +} // namespace platform +} // namespace paddle + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace platform { + // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible @@ -729,10 +737,6 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { } #endif -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (a.x & 0x7fff) == 0x7c00; -} - HOSTDEVICE inline bool(isnan)(const float16& a) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(half(a)); @@ -741,8 +745,12 @@ HOSTDEVICE inline bool(isnan)(const float16& a) { #endif } +HOSTDEVICE inline bool(isinf)(const float16& a) { + return (a.x & 0x7fff) == 0x7c00; +} + HOSTDEVICE inline bool(isfinite)(const float16& a) { - return !((isinf)(a)) && !((isnan)(a)); + return !((isnan)(a)) && !((isinf)(a)); } } // namespace platform @@ -767,7 +775,6 @@ struct is_pod { } // namespace std -#ifdef __CUDA_ARCH__ namespace Eigen { namespace numext { @@ -791,4 +798,3 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)( } // namespace numext } // namespace Eigen -#endif // __CUDA_ARCH__ From 331e4ba7c73eb5c30aa31fa664459e52109dfdde Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 26 Feb 2018 21:54:37 -0800 Subject: [PATCH 05/14] add data_type transform GPU test --- paddle/fluid/framework/data_type_transform.cc | 1 - .../framework/data_type_transform_test.cc | 262 +++++++++--------- .../framework/data_type_transform_test.cu | 74 +++++ paddle/fluid/framework/tensor_util_test.cc | 2 + 4 files changed, 203 insertions(+), 136 deletions(-) create mode 100644 paddle/fluid/framework/data_type_transform_test.cu diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 489fe82204e1e..554cd58916c5a 100644 --- 
a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -55,7 +55,6 @@ struct CastDataType { CastDataTypeFunctor()); #endif } else { - // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? PADDLE_THROW("Unsupported place!"); } } diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index a18de4337a158..c992cba9a3611 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -22,42 +22,6 @@ TEST(DataTypeTransform, CPUTransform) { auto place = CPUPlace(); - Tensor in; - Tensor out; - - float* ptr = in.mutable_data(make_ddim({2, 3}), place); - int data_number = 2 * 3; - - for (int i = 0; i < data_number; ++i) { - ptr[i] = i / 3; - } - - auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_int32 = OpKernelType(proto::VarType::INT32, place, - DataLayout::kAnyLayout, LibraryType::kPlain); - - TransDataType(kernel_fp32, kernel_fp64, in, &out); - double* out_data_double = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(i / 3)); - } - - TransDataType(kernel_fp32, kernel_int32, in, &out); - int* out_data_int = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(i / 3)); - } -} - -TEST(DataTypeTransform, CPUFloat16) { - using namespace paddle::framework; - using namespace paddle::platform; - - auto place = CPUPlace(); - auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, @@ -71,104 +35,132 @@ TEST(DataTypeTransform, CPUFloat16) { auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, DataLayout::kAnyLayout, LibraryType::kPlain); - Tensor in; - Tensor out; - - float16* ptr = in.mutable_data(make_ddim({2, 3}), place); - int data_number = 2 * 3; - - for (int i = 0; i < data_number; ++i) { - ptr[i] = i; - } - - // transform from float16 to other data types - TransDataType(kernel_fp16, kernel_fp32, in, &out); - float* out_data_float = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); - } - - TransDataType(kernel_fp16, kernel_fp64, in, &out); - double* out_data_double = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); - } - - TransDataType(kernel_fp16, kernel_int32, in, &out); - int* out_data_int = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); - } - - TransDataType(kernel_fp16, kernel_int64, in, &out); - int64_t* out_data_int64 = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); - } - - TransDataType(kernel_fp16, kernel_bool, in, &out); - bool* out_data_bool = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); - } - - // transform float to float16 - float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); - for (int i = 0; i < data_number; ++i) { - in_data_float[i] = i; - } - - TransDataType(kernel_fp32, kernel_fp16, in, &out); - ptr = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, 
static_cast(in_data_float[i]).x); - } - - // transform double to float16 - double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); - for (int i = 0; i < data_number; ++i) { - in_data_double[i] = i; - } - - TransDataType(kernel_fp64, kernel_fp16, in, &out); - ptr = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); - } - - // transform int to float16 - int* in_data_int = in.mutable_data(make_ddim({2, 3}), place); - for (int i = 0; i < data_number; ++i) { - in_data_int[i] = i; - } - - TransDataType(kernel_int32, kernel_fp16, in, &out); - ptr = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); - } - - // transform int64 to float16 - int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); - for (int i = 0; i < data_number; ++i) { - in_data_int64[i] = i; - } - - TransDataType(kernel_int64, kernel_fp16, in, &out); - ptr = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); - } - - // transform bool to float16 - bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); - for (int i = 0; i < data_number; ++i) { - in_data_bool[i] = i; - } - - TransDataType(kernel_bool, kernel_fp16, in, &out); - ptr = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + // data type transform from float32 + { + Tensor in; + Tensor out; + + float* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i / 3; + } + + TransDataType(kernel_fp32, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_double[i], static_cast(i / 3)); + } + + TransDataType(kernel_fp32, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int[i], static_cast(i / 3)); + } + } + + // data type transform from/to float16 + { + Tensor in; + Tensor out; + + float16* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from float16 to other data types + TransDataType(kernel_fp16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int64, in, &out); + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_bool, in, &out); + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + TransDataType(kernel_fp32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + 
ASSERT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + TransDataType(kernel_fp64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + TransDataType(kernel_int32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + TransDataType(kernel_int64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + TransDataType(kernel_bool, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + } } } diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu new file mode 100644 index 0000000000000..1c9b619c213bb --- /dev/null +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "gtest/gtest.h" + +TEST(DataTypeTransform, GPUTransform) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto cpu_place = CPUPlace(); + auto gpu_place = CUDAPlace(0); + CUDADeviceContext context(gpu_place); + + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int32 = OpKernelType(proto::VarType::INT32, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int64 = OpKernelType(proto::VarType::INT64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + + // data type transform from float32 + { + Tensor in; + Tensor in_gpu; + Tensor out_gpu; + Tensor out; + + float* in_ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(in_ptr, arr, sizeof(arr)); + + TensorCopy(in, gpu_place, context, &in_gpu); + + TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + double* out_data_double = out.data(); + context.Wait(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_double[i], static_cast(i)); + } + + TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + int* out_data_int = out.data(); + context.Wait(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int[i], static_cast(i)); + } + } + + // data type transform from/to float16 +} \ No newline at end of file diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index ca849ee9d90c8..9687a86ca25be 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -245,6 +245,7 @@ TEST(TensorContainsNAN, CPU) { buf[1] = 0.0; ASSERT_FALSE(TensorContainsNAN(src)); } + { Tensor src; float16* buf = src.mutable_data({3}, CPUPlace()); @@ -270,6 +271,7 @@ TEST(TensorContainsInf, CPU) { buf[1] = 1.0; ASSERT_FALSE(TensorContainsInf(src)); } + { Tensor src; float16* buf = src.mutable_data({3}, CPUPlace()); From 7a24c35bd8b5a0b5da2fc6b6c959dfbf2f8aa2cb Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 26 Feb 2018 23:37:27 -0800 Subject: [PATCH 06/14] add float16 GPU example --- paddle/fluid/framework/CMakeLists.txt | 1 + .../framework/data_type_transform_test.cu | 71 ++++++++++++++++--- 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ef1bc07c2dbe7..7697babdf7f75 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -41,6 +41,7 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) +nv_test(data_type_transform_gpu_test SRCS data_type_transform_test.cu DEPS data_type_transform) cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) cc_test(data_layout_transform_test SRCS 
data_layout_transform_test.cc DEPS data_layout_transform) diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 1c9b619c213bb..eda0ce01e04c6 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/tensor_util.h" #include "gtest/gtest.h" @@ -24,17 +25,17 @@ TEST(DataTypeTransform, GPUTransform) { auto gpu_place = CUDAPlace(0); CUDADeviceContext context(gpu_place); - auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place, DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, + auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place, DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place, + auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place, DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_int32 = OpKernelType(proto::VarType::INT32, place, + auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place, DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_int64 = OpKernelType(proto::VarType::INT64, place, + auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place, DataLayout::kAnyLayout, LibraryType::kPlain); - auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, + auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place, DataLayout::kAnyLayout, LibraryType::kPlain); // data type transform from float32 @@ -48,7 +49,6 @@ TEST(DataTypeTransform, GPUTransform) { float arr[6] = {0, 1, 2, 3, 4, 5}; int data_number = sizeof(arr) / sizeof(arr[0]); memcpy(in_ptr, arr, sizeof(arr)); - TensorCopy(in, gpu_place, context, &in_gpu); TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu); @@ -57,7 +57,7 @@ TEST(DataTypeTransform, GPUTransform) { double* out_data_double = out.data(); context.Wait(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(i)); + ASSERT_EQ(out_data_double[i], static_cast(arr[i])); } TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu); @@ -66,9 +66,62 @@ TEST(DataTypeTransform, GPUTransform) { int* out_data_int = out.data(); context.Wait(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(i)); + ASSERT_EQ(out_data_int[i], static_cast(arr[i])); } } // data type transform from/to float16 + { + Tensor in; + Tensor in_gpu; + Tensor out_gpu; + Tensor out; + + float16* in_ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float16 arr[6] = {0, 1, 2, 3, 4, 5}; + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(in_ptr, arr, sizeof(arr)); + TensorCopy(in, gpu_place, context, &in_gpu); + + // transform from float16 to other data types + TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); + } 
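// ---------------------------------------------------------------------------
// A standalone sketch: the float16 round-trip assertions later in this test
// compare ptr[i].x, the raw IEEE binary16 bit pattern, rather than converted
// values. The encoder below shows what those bits look like for small
// non-negative integers; fp16_bits and encode_int are hypothetical names,
// the real conversion lives in paddle/fluid/platform/float16.h.
#include <cassert>
#include <cstdint>

struct fp16_bits {
  uint16_t x;  // 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits
};

fp16_bits encode_int(int v) {  // exact for 0 <= v < 2048
  fp16_bits h{0};
  if (v == 0) return h;
  int e = 0;
  for (int m = v; m >= 2; m >>= 1) ++e;  // e = floor(log2(v))
  h.x = static_cast<uint16_t>(((e + 15) << 10) | ((v << (10 - e)) & 0x3ff));
  return h;
}

int main() {
  assert(encode_int(1).x == 0x3c00);  // 1.0 in binary16
  assert(encode_int(2).x == 0x4000);  // 2.0
  assert(encode_int(5).x == 0x4500);  // 5.0
  return 0;
}
// ---------------------------------------------------------------------------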
+ + TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + } } \ No newline at end of file From e5cf3180baf74a045d9f6d031a1f5fdcc4ea44f8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 26 Feb 2018 23:58:26 -0800 Subject: [PATCH 07/14] fix error --- .../framework/data_type_transform_test.cu | 92 +++++++++++++++++-- 1 file changed, 84 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index eda0ce01e04c6..47f8165162e12 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -55,7 +55,7 @@ TEST(DataTypeTransform, GPUTransform) { TensorCopy(out_gpu, cpu_place, context, &out); double* out_data_double = out.data(); - context.Wait(); + // context.Wait(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_double[i], static_cast(arr[i])); } @@ -64,7 +64,7 @@ TEST(DataTypeTransform, GPUTransform) { TensorCopy(out_gpu, cpu_place, context, &out); int* out_data_int = out.data(); - context.Wait(); + // context.Wait(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_int[i], static_cast(arr[i])); } @@ -78,7 +78,8 @@ TEST(DataTypeTransform, GPUTransform) { Tensor out; float16* in_ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); - float16 arr[6] = {0, 1, 2, 3, 4, 5}; + float16 arr[6] = {float16(0), float16(1), float16(2), + float16(3), float16(4), float16(5)}; int data_number = sizeof(arr) / sizeof(arr[0]); memcpy(in_ptr, arr, sizeof(arr)); TensorCopy(in, gpu_place, context, &in_gpu); @@ -89,7 +90,7 @@ TEST(DataTypeTransform, GPUTransform) { float* out_data_float = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); + ASSERT_EQ(out_data_float[i], static_cast(in_ptr[i])); } TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); @@ -97,7 +98,7 @@ TEST(DataTypeTransform, GPUTransform) { double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); + ASSERT_EQ(out_data_double[i], static_cast(in_ptr[i])); } TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu); @@ -105,7 +106,7 @@ TEST(DataTypeTransform, GPUTransform) { int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); + ASSERT_EQ(out_data_int[i], static_cast(in_ptr[i])); } TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); @@ -113,7 +114,7 @@ TEST(DataTypeTransform, GPUTransform) { int64_t* out_data_int64 = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); + ASSERT_EQ(out_data_int64[i], static_cast(in_ptr[i])); } TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu); @@ -121,7 +122,82 @@ TEST(DataTypeTransform, GPUTransform) { 
bool* out_data_bool = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); + ASSERT_EQ(out_data_bool[i], static_cast(in_ptr[i])); + } + + // transform float to float16 + float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + in_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(in_ptr[i].x, static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + in_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(in_ptr[i].x, static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + in_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(in_ptr[i].x, static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + in_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(in_ptr[i].x, static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + + in_ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + ASSERT_EQ(in_ptr[i].x, static_cast(in_data_bool[i]).x); } } } \ No newline at end of file From 187472899eb77fb62771d65a3db46ea65468a10b Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 27 Feb 2018 11:00:40 -0800 Subject: [PATCH 08/14] fix GPU test error --- paddle/fluid/framework/CMakeLists.txt | 16 +++-- paddle/fluid/framework/data_type_transform.cu | 1 + .../framework/data_type_transform_test.cu | 58 +++++++++---------- 3 files changed, 40 insertions(+), 35 deletions(-) create mode 120000 paddle/fluid/framework/data_type_transform.cu diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7697babdf7f75..a73938d1ddbde 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -if (WITH_GPU) +if(WITH_GPU) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) else() cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim 
place paddle_memory device_context framework_proto) -endif () +endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) -if (WITH_GPU) +if(WITH_GPU) nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) else() cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) @@ -39,9 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry init math_function) -cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) -cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) -nv_test(data_type_transform_gpu_test SRCS data_type_transform_test.cu DEPS data_type_transform) +if(WITH_GPU) + nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) +else() + cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) + cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) +endif() cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform) diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu new file mode 120000 index 0000000000000..f46491293ef4a --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.cu @@ -0,0 +1 @@ +data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 47f8165162e12..e9f8582065297 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -55,7 +55,6 @@ TEST(DataTypeTransform, GPUTransform) { TensorCopy(out_gpu, cpu_place, context, &out); double* out_data_double = out.data(); - // context.Wait(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_double[i], static_cast(arr[i])); } @@ -64,7 +63,6 @@ TEST(DataTypeTransform, GPUTransform) { TensorCopy(out_gpu, cpu_place, context, &out); int* out_data_int = out.data(); - // context.Wait(); for (int i = 0; i < data_number; ++i) { ASSERT_EQ(out_data_int[i], static_cast(arr[i])); } @@ -77,11 +75,11 @@ TEST(DataTypeTransform, GPUTransform) { Tensor out_gpu; Tensor out; - float16* in_ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float16* ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); float16 arr[6] = {float16(0), float16(1), float16(2), float16(3), float16(4), float16(5)}; int data_number = sizeof(arr) / sizeof(arr[0]); - memcpy(in_ptr, arr, sizeof(arr)); + memcpy(ptr, arr, sizeof(arr)); TensorCopy(in, gpu_place, context, &in_gpu); // transform from float16 to other data types @@ -90,7 +88,7 @@ TEST(DataTypeTransform, GPUTransform) { float* out_data_float = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_float[i], static_cast(in_ptr[i])); + ASSERT_EQ(out_data_float[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); @@ -98,7 +96,7 @@ TEST(DataTypeTransform, GPUTransform) { double* out_data_double = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(in_ptr[i])); + ASSERT_EQ(out_data_double[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int32, 
in_gpu, &out_gpu); @@ -106,7 +104,7 @@ TEST(DataTypeTransform, GPUTransform) { int* out_data_int = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(in_ptr[i])); + ASSERT_EQ(out_data_int[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); @@ -114,7 +112,7 @@ TEST(DataTypeTransform, GPUTransform) { int64_t* out_data_int64 = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int64[i], static_cast(in_ptr[i])); + ASSERT_EQ(out_data_int64[i], static_cast(ptr[i])); } TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu); @@ -122,11 +120,11 @@ TEST(DataTypeTransform, GPUTransform) { bool* out_data_bool = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_bool[i], static_cast(in_ptr[i])); + ASSERT_EQ(out_data_bool[i], static_cast(ptr[i])); } // transform float to float16 - float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); + float* in_data_float = in.mutable_data(make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_float[i] = i; } @@ -135,69 +133,71 @@ TEST(DataTypeTransform, GPUTransform) { TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); - in_ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(in_ptr[i].x, static_cast(in_data_float[i]).x); + ASSERT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); } // transform double to float16 - double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); + double* in_data_double = + in.mutable_data(make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_double[i] = i; } TensorCopy(in, gpu_place, context, &in_gpu); - TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); - in_ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(in_ptr[i].x, static_cast(in_data_double[i]).x); + ASSERT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); } // transform int to float16 - int* in_data_int = in.mutable_data(make_ddim({2, 3}), place); + int* in_data_int = in.mutable_data(make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_int[i] = i; } TensorCopy(in, gpu_place, context, &in_gpu); - TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); - in_ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(in_ptr[i].x, static_cast(in_data_int[i]).x); + ASSERT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); } // transform int64 to float16 - int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); + int64_t* in_data_int64 = + in.mutable_data(make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_int64[i] = i; } TensorCopy(in, gpu_place, context, &in_gpu); - TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); - in_ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(in_ptr[i].x, static_cast(in_data_int64[i]).x); + ASSERT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); } // transform bool to float16 - bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); + bool* in_data_bool = 
in.mutable_data(make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_bool[i] = i; } TensorCopy(in, gpu_place, context, &in_gpu); - TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu); TensorCopy(out_gpu, cpu_place, context, &out); - in_ptr = out.data(); + ptr = out.data(); for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(in_ptr[i].x, static_cast(in_data_bool[i]).x); + ASSERT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); } } -} \ No newline at end of file +} From 897298927dd8fd58424e8682b97bd3dc47bd7da8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 28 Feb 2018 17:34:44 -0800 Subject: [PATCH 09/14] initial commit --- paddle/fluid/operators/math/math_function.cc | 39 +++++++ paddle/fluid/operators/math/math_function.cu | 108 ++++++++++++++++++ .../operators/math/math_function_test.cu | 73 +++++++++--- paddle/fluid/platform/dynload/cublas.h | 3 + 4 files changed, 206 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index f7f33917d7ef5..d6f9b20501d09 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -15,11 +15,23 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -46,6 +58,15 @@ void gemm( beta, C, ldc); } +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const bool transA, @@ -68,6 +89,15 @@ void gemm( lda, B, ldb, beta, C, ldc); } +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { + PADDLE_THROW("float16 matmul not supported on CPU"); +} + template <> void matmul( const platform::CPUDeviceContext& context, @@ -126,6 +156,15 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C, const int batchCount, const int strideA, const int strideB) { + PADDLE_THROW("float16 batched_gemm not supported on CPU"); +} + #ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. 
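// ---------------------------------------------------------------------------
// A standalone sketch of the layout trick used by the CUDA wrappers in
// math_function.cu below: cuBLAS follows Fortran (column-major) order while
// the framework stores tensors row-major, so row-major C = A * B is computed
// as the column-major product C^T = B^T * A^T -- hence every cublas call
// passes B before A and swaps M and N. Verified here with a naive
// column-major GEMM and no transposes; all names are local to the sketch.
#include <cassert>

// Reference column-major GEMM: C(m,n) = sum_k A(m,k) * B(k,n),
// with X(i,j) stored at X[i + j * ldX].
void gemm_colmajor(int M, int N, int K, const float* A, int lda,
                   const float* B, int ldb, float* C, int ldc) {
  for (int n = 0; n < N; ++n)
    for (int m = 0; m < M; ++m) {
      float acc = 0;
      for (int k = 0; k < K; ++k) acc += A[m + k * lda] * B[k + n * ldb];
      C[m + n * ldc] = acc;
    }
}

int main() {
  // Row-major A (2x3), B (3x2); expected row-major C = A * B (2x2).
  const float A[6] = {1, 2, 3, 4, 5, 6};
  const float B[6] = {7, 8, 9, 10, 11, 12};
  float C[4] = {0, 0, 0, 0};
  const int M = 2, N = 2, K = 3;
  // Pass B first, swap M and N, use the row strides as leading dimensions.
  gemm_colmajor(N, M, K, B, N, A, K, C, N);
  assert(C[0] == 58 && C[1] == 64 && C[2] == 139 && C[3] == 154);
  return 0;
}
// ---------------------------------------------------------------------------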
template <> diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index f8d0349ac5cd9..36655508be2ea 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -16,11 +16,40 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + PADDLE_ENFORCE(platform::dynload::cublasHgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + h_A, lda, &h_beta, h_C, N)); +} + template <> void gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -60,6 +89,28 @@ void gemm( lda, &beta, C, N)); } +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? 
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  const half h_alpha = static_cast<const half>(alpha);
+  const half h_beta = static_cast<const half>(beta);
+  const half* h_A = reinterpret_cast<const half*>(A);
+  const half* h_B = reinterpret_cast<const half*>(B);
+  half* h_C = reinterpret_cast<half*>(C);
+
+  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
+      h_A, lda, &h_beta, h_C, N));
+}
+
 template <>
 void gemm<platform::CUDADeviceContext, float>(
     const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
@@ -60,6 +89,28 @@ void gemm<platform::CUDADeviceContext, double>(
     lda, &beta, C, N));
 }
 
+template <>
+void gemm<platform::CUDADeviceContext, float16>(
+    const platform::CUDADeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K,
+    const float16 alpha, const float16* A, const int lda, const float16* B,
+    const int ldb, const float16 beta, float16* C, const int ldc) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  const half h_alpha = static_cast<const half>(alpha);
+  const half h_beta = static_cast<const half>(beta);
+  const half* h_A = reinterpret_cast<const half*>(A);
+  const half* h_B = reinterpret_cast<const half*>(B);
+  half* h_C = reinterpret_cast<half*>(C);
+
+  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
+      h_A, lda, &h_beta, h_C, ldc));
+}
+
 template <>
 void gemm<platform::CUDADeviceContext, float>(
     const platform::CUDADeviceContext& context, const bool transA,
@@ -90,6 +141,35 @@ void gemm<platform::CUDADeviceContext, double>(
     lda, &beta, C, ldc));
 }
 
+template <>
+void matmul<platform::CUDADeviceContext, float16>(
+    const platform::CUDADeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
+    framework::Tensor* matrix_out, float16 beta) {
+  auto dim_a = matrix_a.dims();
+  auto dim_b = matrix_b.dims();
+  auto dim_out = matrix_out->dims();
+  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
+                 "The input and output of matmul must be matrices");
+
+  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
+                     platform::is_gpu_place(matrix_b.place()) &&
+                     platform::is_gpu_place(matrix_out->place()),
+                 "Matrix must all be in CUDAPlace");
+
+  int M = dim_out[0];
+  int N = dim_out[1];
+  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+  gemm<platform::CUDADeviceContext, float16>(
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
+      matrix_b.data<float16>(), beta, matrix_out->data<float16>());
+}
+
 template <>
 void matmul<platform::CUDADeviceContext, float>(
     const platform::CUDADeviceContext& context,
@@ -148,6 +228,34 @@ void matmul<platform::CUDADeviceContext, double>(
       matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
+template <>
+void batched_gemm<platform::CUDADeviceContext, float16>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float16 alpha, const float16* A, const float16* B,
+    const float16 beta, float16* C, const int batchCount, const int strideA,
+    const int strideB) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  const int strideC = M * N;
+
+  const half h_alpha = static_cast<const half>(alpha);
+  const half h_beta = static_cast<const half>(beta);
+  const half* h_A = reinterpret_cast<const half*>(A);
+  const half* h_B = reinterpret_cast<const half*>(B);
+  half* h_C = reinterpret_cast<half*>(C);
+
+  PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
+      strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
+}
+
 template <>
 void batched_gemm<platform::CUDADeviceContext, float>(
     const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
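The strided-batch entry point above runs batchCount independent multiplies in
a single cuBLAS call; batch i reads A + i * strideA and B + i * strideB and
writes C + i * strideC with strideC = M * N. Semantically it is equivalent to
this loop over the single-matrix gemm defined earlier (an illustrative sketch
only; the batched call avoids the per-launch overhead):

    for (int i = 0; i < batchCount; ++i) {
      gemm<platform::CUDADeviceContext, float16>(
          context, transA, transB, M, N, K, alpha, A + i * strideA,
          B + i * strideB, beta, C + i * M * N);
    }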

diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index 207d6a87bce17..60495f12b389b 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -14,30 +14,33 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
-TEST(math_function, notrans_mul_trans) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
+TEST(math_function, notrans_mul_trans_fp32) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
 
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  Tensor input1;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor out_gpu;
+  Tensor out;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
+
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input1, gpu_place, context, &input2_gpu);
 
-  out_gpu.mutable_data<float>({2, 2}, *gpu_place);
+  out_gpu.mutable_data<float>({2, 2}, gpu_place);
 
-  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
+  paddle::operators::math::matmul<CUDADeviceContext, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
-  paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
+  TensorCopy(out_gpu, cpu_place, context, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -45,7 +48,43 @@
   EXPECT_EQ(out_ptr[1], 14);
   EXPECT_EQ(out_ptr[2], 14);
   EXPECT_EQ(out_ptr[3], 50);
-  delete gpu_place;
+}
+
+TEST(math_function, notrans_mul_trans_fp16) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor input1;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor out_gpu;
+  Tensor out;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
+
+  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  float16 arr[6] = {float16(0), float16(1), float16(2),
+                    float16(3), float16(4), float16(5)};
+  memcpy(input1_ptr, arr, 6 * sizeof(float16));
+
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input1, gpu_place, context, &input2_gpu);
+
+  out_gpu.mutable_data<float16>({2, 2}, gpu_place);
+
+  paddle::operators::math::matmul<CUDADeviceContext, float16>(
+      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
+
+  TensorCopy(out_gpu, cpu_place, context, &out);
+
+  float16* out_ptr = out.data<float16>();
+  context.Wait();
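+  // Expected values: input1 * input1^T for input1 = [[0, 1, 2], [3, 4, 5]]
+  // has diagonal 0+1+4 = 5 and 9+16+25 = 50, and off-diagonal
+  // 0*3+1*4+2*5 = 14, the same numbers checked in the fp32 test above.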
+  EXPECT_EQ(out_ptr[0], 5);
+  EXPECT_EQ(out_ptr[1], 14);
+  EXPECT_EQ(out_ptr[2], 14);
+  EXPECT_EQ(out_ptr[3], 50);
 }
 
 TEST(math_function, trans_mul_notrans) {
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 580ed9bb57fca..fa9041134d863 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -68,6 +68,8 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemv_v2);                \
   __macro(cublasSgemm_v2);                \
   __macro(cublasDgemm_v2);                \
+  __macro(cublasHgemm);                   \
+  __macro(cublasSgemmEx);                 \
   __macro(cublasSgeam_v2);                \
   __macro(cublasDgeam_v2);                \
   __macro(cublasCreate_v2);               \
@@ -83,6 +85,7 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemmStridedBatched);     \
   __macro(cublasCgemmStridedBatched);     \
   __macro(cublasZgemmStridedBatched);     \
+  __macro(cublasHgemmStridedBatched);     \
   __macro(cublasSgetrfBatched);           \
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
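Registering cublasHgemm and cublasHgemmStridedBatched here is what lets the
platform::dynload:: calls above resolve at runtime: each name listed in the
macro block becomes a wrapper that looks the symbol up in the cuBLAS shared
library on first use. A simplified sketch of that mechanism (illustrative
only; the names and error handling do not match Paddle's actual macro
expansion):

    #include <dlfcn.h>  // plus cublas_v2.h for the function type

    using HgemmFunc = decltype(&cublasHgemm);

    HgemmFunc LoadHgemm() {
      // dlopen/dlsym once, then cache the resolved function pointer.
      static void* handle = dlopen("libcublas.so", RTLD_LAZY | RTLD_GLOBAL);
      static auto func =
          reinterpret_cast<HgemmFunc>(dlsym(handle, "cublasHgemm"));
      return func;  // dynload::cublasHgemm(...) forwards through this pointer
    }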

From 046ab2822db34315c97f60659cb53b8e22dc63bb Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Wed, 28 Feb 2018 17:47:04 -0800
Subject: [PATCH 10/14] fix error

The float16 CPU stubs for matmul/batched_gemm were declared with float
parameter types (batched_gemm also as a duplicate float specialization);
switch them to float16.

---
 paddle/fluid/operators/math/math_function.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index d6f9b20501d09..35d251f71a0cb 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -93,8 +93,8 @@
 template <>
 void matmul<platform::CPUDeviceContext, float16>(
     const platform::CPUDeviceContext& context,
     const framework::Tensor& matrix_a, bool trans_a,
-    const framework::Tensor& matrix_b, bool trans_b, float alpha,
-    framework::Tensor* matrix_out, float beta) {
+    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
+    framework::Tensor* matrix_out, float16 beta) {
   PADDLE_THROW("float16 matmul not supported on CPU");
 }
 
@@ -157,11 +157,11 @@
 template <>
-void batched_gemm<platform::CPUDeviceContext, float>(
+void batched_gemm<platform::CPUDeviceContext, float16>(
     const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C, const int batchCount, const int strideA, const int strideB) {
+    const float16 alpha, const float16* A, const float16* B, const float16 beta,
+    float16* C, const int batchCount, const int strideA, const int strideB) {
   PADDLE_THROW("float16 batched_gemm not supported on CPU");
 }

From 5ed635bee557ed532416449bb58a27f7fbc9d2c5 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Thu, 1 Mar 2018 10:31:31 -0800
Subject: [PATCH 11/14] small fix

Construct alpha and beta as explicit float16 values and compare results
through static_cast<float>.

---
 paddle/fluid/operators/math/math_function_test.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index 60495f12b389b..e2ae79ee13e65 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -75,16 +75,17 @@ TEST(math_function, notrans_mul_trans_fp16) {
   out_gpu.mutable_data<float16>({2, 2}, gpu_place);
 
   paddle::operators::math::matmul<CUDADeviceContext, float16>(
-      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
+      context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
+      float16(0));
 
   TensorCopy(out_gpu, cpu_place, context, &out);
 
   float16* out_ptr = out.data<float16>();
   context.Wait();
-  EXPECT_EQ(out_ptr[0], 5);
-  EXPECT_EQ(out_ptr[1], 14);
-  EXPECT_EQ(out_ptr[2], 14);
-  EXPECT_EQ(out_ptr[3], 50);
+  EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
+  EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
+  EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
+  EXPECT_EQ(static_cast<float>(out_ptr[3]), 50);
 }
 
 TEST(math_function, trans_mul_notrans) {

From d235d8200f0ae18870bd9eadacd99537f9378e5c Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Thu, 1 Mar 2018 15:22:34 -0800
Subject: [PATCH 12/14] add more gemm fp16 tests

---
 .../operators/math/math_function_test.cu      | 281 ++++++++++++++----
 1 file changed, 224 insertions(+), 57 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index e2ae79ee13e65..f37f9a892fc7e 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -88,30 +88,33 @@
   EXPECT_EQ(static_cast<float>(out_ptr[3]), 50);
 }
 
-TEST(math_function, trans_mul_notrans) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
+TEST(math_function, trans_mul_notrans_fp32) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
 
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  Tensor input1;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor out_gpu;
+  Tensor out;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
+
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input1, gpu_place, context, &input2_gpu);
 
-  out_gpu.mutable_data<float>({3, 3}, *gpu_place);
+  out_gpu.mutable_data<float>({3, 3}, gpu_place);
 
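+  // input1^T * input1 for input1 = [[0, 1, 2], [3, 4, 5]] is the 3 x 3 Gram
+  // matrix [[9, 12, 15], [12, 17, 22], [15, 22, 29]], which the EXPECT_EQ
+  // checks below verify entry by entry.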
   paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
-  paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
+  TensorCopy(out_gpu, *cpu_place, context, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -124,45 +127,90 @@
   EXPECT_EQ(out_ptr[6], 15);
   EXPECT_EQ(out_ptr[7], 22);
   EXPECT_EQ(out_ptr[8], 29);
-  delete gpu_place;
 }
 
+TEST(math_function, trans_mul_notrans_fp16) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor input1;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor out_gpu;
+  Tensor out;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
+
+  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  float16 arr[6] = {float16(0), float16(1), float16(2),
+                    float16(3), float16(4), float16(5)};
+  memcpy(input1_ptr, arr, 6 * sizeof(float16));
+
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input1, gpu_place, context, &input2_gpu);
+
+  out_gpu.mutable_data<float16>({3, 3}, gpu_place);
+
+  paddle::operators::math::matmul<CUDADeviceContext, float16>(
+      context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
+      float16(0));
+
+  TensorCopy(out_gpu, cpu_place, context, &out);
+
+  float16* out_ptr = out.data<float16>();
+  context.Wait();
+  EXPECT_EQ(static_cast<float> out_ptr[0], 9);
+  EXPECT_EQ(static_cast<float> out_ptr[1], 12);
+  EXPECT_EQ(static_cast<float> out_ptr[2], 15);
+  EXPECT_EQ(static_cast<float> out_ptr[3], 12);
+  EXPECT_EQ(static_cast<float> out_ptr[4], 17);
+  EXPECT_EQ(static_cast<float> out_ptr[5], 22);
+  EXPECT_EQ(static_cast<float> out_ptr[6], 15);
+  EXPECT_EQ(static_cast<float> out_ptr[7], 22);
+  EXPECT_EQ(static_cast<float> out_ptr[8], 29);
+}
 
-TEST(math_function, gemm_notrans_cublas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
+TEST(math_function, gemm_notrans_cublas_fp32) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor input1;
+  Tensor input2;
+  Tensor input3;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor input3_gpu;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
 
   int m = 2;
   int n = 3;
   int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr1[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
   float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
   memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
-  paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input2, gpu_place, context, &input2_gpu);
+  TensorCopy(input3, gpu_place, context, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
-  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+  float* c = input3_gpu.mutable_data<float>(gpu_place);
 
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
-  paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
+  TensorCopy(input3_gpu, cpu_place, context, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
   // b = np.arange(12).reshape(3, 4)[:, 1:]
   // c = np.arange(8).reshape(2, 4)[:, 1:]
   // out = np.arange(8).reshape(2, 4)
   // out[:, 1:] = np.dot(a, b) + c
   context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);
   EXPECT_EQ(input3_ptr[3], 32);
   EXPECT_EQ(input3_ptr[4], 4);
   EXPECT_EQ(input3_ptr[5], 73);
   EXPECT_EQ(input3_ptr[6], 86);
   EXPECT_EQ(input3_ptr[7], 99);
-  delete gpu_place;
 }
 
+TEST(math_function, gemm_notrans_cublas_fp16) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor input1;
+  Tensor input2;
+  Tensor input3;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor input3_gpu;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
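+  // The gemm call below multiplies the 2 x 3 matrix a by the 3 x 3 block of
+  // b starting at column 1 (pointer b + 1 with ldb = 4) and accumulates into
+  // the 2 x 3 block of c at column 1 (c + 1 with ldc = 4), mirroring the
+  // numpy snippet in the fp32 test above.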
+  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  float16 arr1[6] = {float16(0), float16(1), float16(2),
+                     float16(3), float16(4), float16(5)};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float16));
+  float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
+  float16 arr2[12] = {float16(0), float16(1), float16(2),  float16(3),
+                      float16(4), float16(5), float16(6),  float16(7),
+                      float16(8), float16(9), float16(10), float16(11)};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float16));
+  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  float16 arr3[8] = {float16(0), float16(1), float16(2), float16(3),
+                     float16(4), float16(5), float16(6), float16(7)};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float16));
+
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input2, gpu_place, context, &input2_gpu);
+  TensorCopy(input3, gpu_place, context, &input3_gpu);
+  float16* a = input1_gpu.data<float16>();
+  float16* b = input2_gpu.data<float16>();
+  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+
+  paddle::operators::math::gemm<CUDADeviceContext, float16>(
+      context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
+      c + 1, 4);
+
+  TensorCopy(input3_gpu, cpu_place, context, &input3);
+
+  // numpy code:
+  // a = np.arange(6).reshape(2, 3)
+  // b = np.arange(12).reshape(3, 4)[:, 1:]
+  // c = np.arange(8).reshape(2, 4)[:, 1:]
+  // out = np.arange(8).reshape(2, 4)
+  // out[:, 1:] = np.dot(a, b) + c
+  context.Wait();
+  EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
+  EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
+  EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
+  EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
+  EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
+  EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
+  EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
+  EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
+}
 
-TEST(math_function, gemm_trans_cublas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
+TEST(math_function, gemm_trans_cublas_fp32) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor input1;
+  Tensor input2;
+  Tensor input3;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor input3_gpu;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
 
   int m = 2;
   int n = 3;
   int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr1[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
   float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
   memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  auto* gpu_place = new paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
-  paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
-  paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input2, gpu_place, context, &input2_gpu);
+  TensorCopy(input3, gpu_place, context, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
-  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+  float* c = input3_gpu.mutable_data<float>(gpu_place);
 
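+  // Same product as the notrans test: input2 here holds b transposed (4 x 3),
+  // and the call below skips its first row (b + 3 with ldb = 3) under
+  // transB = true, so the expected outputs are identical.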
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
-  paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
-  context.Wait();
+  TensorCopy(input3_gpu, cpu_place, context, &input3);
 
+  context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
   EXPECT_EQ(input3_ptr[2], 28);
   EXPECT_EQ(input3_ptr[3], 32);
   EXPECT_EQ(input3_ptr[4], 4);
   EXPECT_EQ(input3_ptr[5], 73);
   EXPECT_EQ(input3_ptr[6], 86);
   EXPECT_EQ(input3_ptr[7], 99);
-  delete gpu_place;
 }
 
+TEST(math_function, gemm_trans_cublas_fp16) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor input1;
+  Tensor input2;
+  Tensor input3;
+  Tensor input1_gpu;
+  Tensor input2_gpu;
+  Tensor input3_gpu;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  float16 arr1[6] = {float16(0), float16(1), float16(2),
+                     float16(3), float16(4), float16(5)};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float16));
+  float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
+  float16 arr2[12] = {float16(0),  float16(4), float16(8), float16(1),
+                      float16(5),  float16(9), float16(2), float16(6),
+                      float16(10), float16(3), float16(7), float16(11)};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float16));
+  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  float16 arr3[8] = {float16(0), float16(1), float16(2), float16(3),
+                     float16(4), float16(5), float16(6), float16(7)};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float16));
+
+  TensorCopy(input1, gpu_place, context, &input1_gpu);
+  TensorCopy(input2, gpu_place, context, &input2_gpu);
+  TensorCopy(input3, gpu_place, context, &input3_gpu);
+  float16* a = input1_gpu.data<float16>();
+  float16* b = input2_gpu.data<float16>();
+  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+
+  paddle::operators::math::gemm<CUDADeviceContext, float16>(
+      context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
+      c + 1, 4);
+
+  TensorCopy(input3_gpu, cpu_place, context, &input3);
+
+  context.Wait();
+  EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
+  EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
+  EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
+  EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
+  EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
+  EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
+  EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
+  EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
+}
 
 template <typename T>

From 81d489545c1ed1acaa983343482c44ab4ec7f2a3 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Thu, 1 Mar 2018 15:42:38 -0800
Subject: [PATCH 13/14] fix error

Fix the leftover *cpu_place dereferences and the malformed static_cast
expressions in the fp16 tests, and refactor GemvTest to use value places
with a single device context.

---
 .../operators/math/math_function_test.cu          | 67 ++++++++++---------
 1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index f37f9a892fc7e..f50050326cd86 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -114,7 +114,7 @@ TEST(math_function, trans_mul_notrans_fp32) {
   paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
-  TensorCopy(out_gpu, *cpu_place, context, &out);
+  TensorCopy(out_gpu, cpu_place, context, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -161,15 +161,15 @@ TEST(math_function, trans_mul_notrans_fp16) {
 
   float16* out_ptr = out.data<float16>();
   context.Wait();
-  EXPECT_EQ(static_cast<float> out_ptr[0], 9);
-  EXPECT_EQ(static_cast<float> out_ptr[1], 12);
-  EXPECT_EQ(static_cast<float> out_ptr[2], 15);
-  EXPECT_EQ(static_cast<float> out_ptr[3], 12);
-  EXPECT_EQ(static_cast<float> out_ptr[4], 17);
-  EXPECT_EQ(static_cast<float> out_ptr[5], 22);
-  EXPECT_EQ(static_cast<float> out_ptr[6], 15);
-  EXPECT_EQ(static_cast<float> out_ptr[7], 22);
-  EXPECT_EQ(static_cast<float> out_ptr[8], 29);
+  EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
+  EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
+  EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
+  EXPECT_EQ(static_cast<float>(out_ptr[3]), 12);
+  EXPECT_EQ(static_cast<float>(out_ptr[4]), 17);
+  EXPECT_EQ(static_cast<float>(out_ptr[5]), 22);
+  EXPECT_EQ(static_cast<float>(out_ptr[6]), 15);
+  EXPECT_EQ(static_cast<float>(out_ptr[7]), 22);
+  EXPECT_EQ(static_cast<float>(out_ptr[8]), 29);
 }
 
 TEST(math_function, gemm_notrans_cublas_fp32) {
@@ -400,22 +400,27 @@
 
 template <typename T>
 void GemvTest(int m, int n, bool trans) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor vec_b;
-  paddle::framework::Tensor vec_c;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-
-  T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
-  T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
-  T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
-
-  auto* gpu_place = new paddle::platform::CUDAPlace(0);
-  paddle::framework::Tensor g_mat_a;
-  paddle::framework::Tensor g_vec_b;
-  paddle::framework::Tensor g_vec_c;
-  T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), *gpu_place);
-  T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), *gpu_place);
-  T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), *gpu_place);
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor mat_a;
+  Tensor vec_b;
+  Tensor vec_c;
+
+  CPUPlace cpu_place;
+  CUDAPlace gpu_place(0);
+  CUDADeviceContext context(gpu_place);
+
+  T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
+  T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
+  T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
+
+  Tensor g_mat_a;
+  Tensor g_vec_b;
+  Tensor g_vec_c;
+  T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
+  T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
+  T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
 
   for (int i = 0; i < mat_a.numel(); ++i) {
     data_a[i] = static_cast<T>(i);
@@ -424,16 +429,14 @@ void GemvTest(int m, int n, bool trans) {
     data_b[i] = static_cast<T>(i);
   }
 
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-  paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a);
-  paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b);
+  TensorCopy(mat_a, gpu_place, context, &g_mat_a);
+  TensorCopy(vec_b, gpu_place, context, &g_vec_b);
 
-  paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
+  paddle::operators::math::gemv<CUDADeviceContext, T>(
       context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
       g_data_b, 0., g_data_c);
 
-  paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context,
-                                &vec_c);
+  TensorCopy(g_vec_c, cpu_place, context, &vec_c);
 
   if (!trans) {
     for (int i = 0; i < m; ++i) {

From 7ec6749450356f6db77e97c0388f8859e4b62708 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Tue, 6 Mar 2018 20:19:48 -0800
Subject: [PATCH 14/14] add utility function

---
 .../operators/math/math_function_test.cu      | 44 ++++++++-----------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index f50050326cd86..442e62d563ebd 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -14,6 +14,14 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
+void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
+                    const std::vector<float>& data) {
+  PADDLE_ENFORCE_EQ(size, data.size());
+  for (size_t i = 0; i < data.size(); ++i) {
+    in_ptr[i] = paddle::platform::float16(data[i]);
+  }
+}
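+// e.g. fill_fp16_data(ptr, 6, {0, 1, 2, 3, 4, 5}) writes float16(0) through
+// float16(5) into ptr, replacing the verbose float16 array initializers in
+// the fp16 tests below.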
+
 TEST(math_function, notrans_mul_trans_fp32) {
   using namespace paddle::framework;
   using namespace paddle::platform;
@@ -65,9 +73,7 @@ TEST(math_function, notrans_mul_trans_fp16) {
   CUDADeviceContext context(gpu_place);
 
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
-  float16 arr[6] = {float16(0), float16(1), float16(2),
-                    float16(3), float16(4), float16(5)};
-  memcpy(input1_ptr, arr, 6 * sizeof(float16));
+  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
   TensorCopy(input1, gpu_place, context, &input1_gpu);
   TensorCopy(input1, gpu_place, context, &input2_gpu);
@@ -144,9 +150,7 @@ TEST(math_function, trans_mul_notrans_fp16) {
   CUDADeviceContext context(gpu_place);
 
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
-  float16 arr[6] = {float16(0), float16(1), float16(2),
-                    float16(3), float16(4), float16(5)};
-  memcpy(input1_ptr, arr, 6 * sizeof(float16));
+  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
   TensorCopy(input1, gpu_place, context, &input1_gpu);
   TensorCopy(input1, gpu_place, context, &input2_gpu);
@@ -248,18 +252,12 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   int n = 3;
   int k = 3;
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
-  float16 arr1[6] = {float16(0), float16(1), float16(2),
-                     float16(3), float16(4), float16(5)};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float16));
+  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
   float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
-  float16 arr2[12] = {float16(0), float16(1), float16(2),  float16(3),
-                      float16(4), float16(5), float16(6),  float16(7),
-                      float16(8), float16(9), float16(10), float16(11)};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float16));
+  fill_fp16_data(input2_ptr, input2.numel(),
+                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
   float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
-  float16 arr3[8] = {float16(0), float16(1), float16(2), float16(3),
-                     float16(4), float16(5), float16(6), float16(7)};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float16));
+  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
   TensorCopy(input1, gpu_place, context, &input1_gpu);
   TensorCopy(input2, gpu_place, context, &input2_gpu);
@@ -361,18 +359,12 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   int n = 3;
   int k = 3;
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
-  float16 arr1[6] = {float16(0), float16(1), float16(2),
-                     float16(3), float16(4), float16(5)};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float16));
+  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
   float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
-  float16 arr2[12] = {float16(0),  float16(4), float16(8), float16(1),
-                      float16(5),  float16(9), float16(2), float16(6),
-                      float16(10), float16(3), float16(7), float16(11)};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float16));
+  fill_fp16_data(input2_ptr, input2.numel(),
+                 {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
   float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
-  float16 arr3[8] = {float16(0), float16(1), float16(2), float16(3),
-                     float16(4), float16(5), float16(6), float16(7)};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float16));
+  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
   TensorCopy(input1, gpu_place, context, &input1_gpu);
   TensorCopy(input2, gpu_place, context, &input2_gpu);