From 3a4301659beaebf3b89cbbdd2162fd18c122d527 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 15 Feb 2023 13:46:00 -0500
Subject: [PATCH 01/10] clean up

---
 example/04_gemm_add_add_fastgelu/common.hpp |  2 +-
 .../gemm_add_add_fastgelu_xdl_fp16.cpp      | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/example/04_gemm_add_add_fastgelu/common.hpp
index 3f9375e0926..839587c1489 100644
--- a/example/04_gemm_add_add_fastgelu/common.hpp
+++ b/example/04_gemm_add_add_fastgelu/common.hpp
@@ -62,7 +62,7 @@ struct ExecutionConfig final
 };
 
 inline bool
-parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig config)
+parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
 {
     if(argc == 1)
     {
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index 6c7ca414448..ae3db1374f9 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -7,10 +7,11 @@ using ADataType        = F16;
 using BDataType        = F16;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
-using D0DataType       = F16;
-using D1DataType       = F16;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = F16;
+using CDataType        = F32; // C matrix doesn't exist in memory, this is used for host verification
+using D0DataType       = F16;
+using D1DataType       = F16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = F16;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +37,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
                                                                         AElementOp,
                                                                         BElementOp,
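
The `ExecutionConfig&` change in common.hpp above is the one behavioral fix in this patch: the parser previously took the struct by value, so every flag it parsed landed in a copy that was discarded when the function returned. A minimal self-contained sketch of that failure mode (the struct is reduced to two fields modeled loosely on the example's):

#include <cstdlib>
#include <iostream>

struct ExecutionConfig
{
    bool do_verification = true;
    int  init_method     = 1;
};

// Broken: 'config' is a copy; the caller never sees the parsed values.
bool parse_by_value(int argc, char* argv[], ExecutionConfig config)
{
    if(argc >= 2) { config.do_verification = std::atoi(argv[1]) != 0; }
    return true;
}

// Fixed: parse into the caller's object through a reference.
bool parse_by_reference(int argc, char* argv[], ExecutionConfig& config)
{
    if(argc >= 2) { config.do_verification = std::atoi(argv[1]) != 0; }
    return true;
}

int main(int argc, char* argv[])
{
    ExecutionConfig config;
    parse_by_value(argc, argv, config);     // leaves config untouched
    parse_by_reference(argc, argv, config); // actually updates config
    std::cout << "do_verification = " << config.do_verification << '\n';
}
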
From: Chao Liu
Date: Wed, 15 Feb 2023 15:49:46 -0500
Subject: [PATCH 02/10] fast gelu using builtin function

---
 .../gemm_add_add_fastgelu_xdl_fp16.cpp        |  4 +-
 .../run_gemm_add_add_fastgelu_example.inc     |  2 +-
 include/ck/ck.hpp                             |  3 +
 .../element/binary_element_wise_operation.hpp | 27 +++++++
 .../gpu/element/element_wise_operation.hpp    | 45 +++++++----
 .../element/unary_element_wise_operation.hpp  | 75 +++++++++++++++++++
 6 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index ae3db1374f9..487f60d385e 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -6,8 +6,8 @@
 using ADataType        = F16;
 using BDataType        = F16;
 using AccDataType      = F32;
-using CShuffleDataType = F32;
-using CDataType        = F32; // C matrix doesn't exist in memory, this is used for host verification
+using CShuffleDataType = F16;
+using CDataType        = F16; // C matrix doesn't exist in GPU memory, this is used for host verification
 using D0DataType       = F16;
 using D1DataType       = F16;
 using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
 using EDataType        = F16;
diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
index f3def33b567..cb3147bcd71 100644
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
 
     if(config.do_verification)
     {
-        Tensor<EDataType> c_m_n({M, N});
+        Tensor<CDataType> c_m_n({M, N});
 
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index ffd7e74f123..c6118fbad81 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -168,6 +168,9 @@
 // tuning parameter
 #define CK_WORKAROUND_SWDEV_325164 0
 
+// workaround: compiler not emitting reciprocal instruction from __frcp_rn()
+#define CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN 1
+
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 69fa75c3fd3..cd761f08d4d 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -280,6 +280,7 @@ struct AddHardswish
     };
 };
 
+#if 0
 // C = A * B
 // E = FastGelu(C + D)
 struct AddFastGelu
 {
@@ -319,6 +320,32 @@ struct AddFastGelu
         e = GetFastGeLU(c + type_convert<float>(d));
     }
 };
+#else
+// E = FastGelu(C + D)
+struct AddFastGelu
+{
+    template <typename E, typename C, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()(float& e, const float& c, const float& d) const
+    {
+        const float x = c + d;
+
+        FastGelu{}.template operator()<float, float>(e, x);
+    }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()(half_t& e, const half_t& c, const half_t& d) const
+    {
+        const half_t x = c + d;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
+    }
+};
+#endif
 
 } // namespace element_wise
 } // namespace tensor_operation
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 7f3d450a39d..7dd0d1cff16 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -16,7 +16,7 @@ namespace element_wise {
 // Need to ensure compiler will fail if there is no matching candidate, instead of compiler
 // silently do implicit type conversion
 //
-// Method 1:
+// Example:
 //
 // struct ExampleElementwiseOp
 // {
 //     __host__ __device__ constexpr void operator()(ck::half_t& y, const ck::half_t& x) const;
 //
 //     template <typename Y, typename X>
 //     __host__ __device__ constexpr void operator()(float& y, ck::bhalf_t& x) const
 //     {
 //     }
 // };
-//
-// Method 2:
-//
-// template <typename Y, typename X>
-// struct ExampleElementwiseOp;
-//
-// template <>
-// struct ExampleElementwiseOp<float, ck::bhalf_t>
-// {
-//     __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
-//     {
-//     }
-// };
 
 struct AddReluAdd
 {
@@ -208,6 +195,7 @@ struct AddMultiply
     }
 };
 
+#if 0
 // C = A * B
 // E = FastGelu(C + D0 + D1)
 struct AddAddFastGelu
 {
@@ -245,6 +233,35 @@ struct AddAddFastGelu
         e = type_convert<E>(y);
     }
 };
+#else
+// E = FastGelu(C + D0 + D1)
+struct AddAddFastGelu
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()(float& e,
+                                                  const float& c,
+                                                  const float& d0,
+                                                  const float& d1) const
+    {
+        const float x = c + d0 + d1;
+
+        FastGelu{}.template operator()<float, float>(e, x);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        half_t& e, const half_t& c, const half_t& d0, const half_t& d1) const
+    {
+        const half_t x = c + d0 + d1;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
+    }
+};
+#endif
 
 struct Normalize
 {
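
The rewritten AddFastGelu and AddAddFastGelu follow the idiom the header comment describes: the primary operator() template is only declared, and each supported type combination is an explicit specialization, so a call with any other types fails to build instead of going through a silent implicit conversion. A standalone sketch of the pattern (plain C++; the op and its body are hypothetical):

#include <iostream>

struct ClampOp
{
    // Declared but never defined: unsupported type combinations
    // fail at link time rather than silently converting.
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const;
};

// The only supported combination gets a definition.
template <>
void ClampOp::operator()<float, float>(float& y, const float& x) const
{
    y = x < 0.f ? 0.f : (x > 1.f ? 1.f : x);
}

int main()
{
    ClampOp op;
    float y = 0.f;
    op(y, 1.5f);            // resolves to the float/float specialization
    std::cout << y << '\n'; // prints 1

    // double d = 0.0;
    // op(d, 1.5f); // would deduce <double, float> and fail to link
}
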
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 2167a79e019..b9bd19abcc5 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -11,6 +11,10 @@ namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
+#if CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+extern "C" __device__ float __ocml_native_recip_f32(float);
+#endif
+
 struct PassThrough
 {
     template <typename T>
@@ -200,6 +204,7 @@ struct Relu
     }
 };
 
+#if 0
 // Y = FastGelu(X)
 struct FastGelu
 {
@@ -232,6 +237,76 @@ struct FastGelu
         y = type_convert<Y>(tmp_y);
     }
 };
+#else
+// Fast GeLU
+// https://paperswithcode.com/method/gelu
+// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+// host code uses higher accuracy "exp" and "div"
+// gpu code uses lower accuracy "__expf" and "rcp" functions
+struct FastGelu
+{
+    template <typename Y, typename X>
+    __host__ void operator()(Y& y, const X& x) const;
+
+    template <typename Y, typename X>
+    __device__ void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ void operator()(float& y, const float& x) const
+    {
+        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const float emu = exp(-u);
+        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
+
+        y = x * cdf;
+    }
+
+    template <>
+    __host__ void operator()(half_t& y, const half_t& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, type_convert<float>(x));
+
+        y = type_convert<half_t>(y_f);
+    }
+
+    // device code, use lower precision "__expf" and "rcp"
+    template <>
+    __device__ void operator()(float& y, const float& x) const
+    {
+#if 0
+        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const float emu = exp(-u);
+        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
+
+        y = x * cdf;
+#else
+        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const float emu = __expf(-u);
+
+#if !CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+        const float cdf = 0.5f + 0.5f * (2.f * __frcp_rn(1.f + emu) - 1.f);
+#else
+        const float cdf = 0.5f + 0.5f * (2.f * __ocml_native_recip_f32(1.f + emu) - 1.f);
+#endif
+
+        y = x * cdf;
+#endif
+    }
+
+    // device code, use lower precision "__expf" and "rcp"
+    template <>
+    __device__ void operator()(half_t& y, const half_t& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, type_convert<float>(x));
+
+        y = type_convert<half_t>(y_f);
+    }
+};
+#endif
 
 // https://paperswithcode.com/method/gelu
 // y = 0.5*x*(1+erf(x/sqrt(2)))
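
The device path above exists because tanh is expensive: the patch evaluates tanh through the identity tanh(u/2) = 2/(1+e^-u) - 1, which leaves one exponential and one reciprocal, exactly the shapes __expf and the rcp builtin cover. A host-only check of that algebra, using the constants from the patch (standard C++, double precision):

#include <cmath>
#include <cstdio>

// Textbook fast-GeLU: y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
double gelu_tanh(double x)
{
    const double c = 0.7978845608028654; // sqrt(2/pi)
    return 0.5 * x * (1.0 + std::tanh(c * (x + 0.044715 * x * x * x)));
}

// Patch form: u = 2*x*(0.035677*x^2 + 0.797885), where
// 0.035677 ~= 0.044715*sqrt(2/pi); then tanh(u/2) = 2/(1+exp(-u)) - 1.
double gelu_exp(double x)
{
    const double u   = 2.0 * x * (0.035677 * x * x + 0.797885);
    const double emu = std::exp(-u);
    const double cdf = 0.5 + 0.5 * (2.0 / (1.0 + emu) - 1.0);
    return x * cdf;
}

int main()
{
    // The two forms agree to the precision of the truncated constants.
    for(double x = -4.0; x <= 4.0; x += 1.0)
    {
        std::printf("x=% .1f  tanh-form=% .7f  exp-form=% .7f\n", x, gelu_tanh(x), gelu_exp(x));
    }
}
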
From 87f44ead9cb11a668bc82cdb53dcbc8f8a1a7de2 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 15 Feb 2023 17:03:23 -0500
Subject: [PATCH 03/10] clean

---
 include/ck/ck.hpp                             |  2 +-
 .../element/binary_element_wise_operation.hpp | 42 -----------------
 .../gpu/element/element_wise_operation.hpp    | 40 ----------------
 .../element/unary_element_wise_operation.hpp  | 47 +------------------
 4 files changed, 3 insertions(+), 128 deletions(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index c6118fbad81..1257a776493 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -169,7 +169,7 @@
 #define CK_WORKAROUND_SWDEV_325164 0
 
 // workaround: compiler not emitting reciprocal instruction from __frcp_rn()
-#define CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN 1
+#define CK_WORKAROUND_SWDEV_383542 1
 
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index cd761f08d4d..1d7b973806f 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -280,47 +280,6 @@ struct AddHardswish
     };
 };
 
-#if 0
-// C = A * B
-// E = FastGelu(C + D)
-struct AddFastGelu
-{
-    // Fast GeLU
-    // https://paperswithcode.com/method/gelu
-    // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
-    __host__ __device__ static constexpr float GetFastGeLU(float x)
-    {
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-        return x * cdf;
-    }
-
-    template <typename T>
-    static inline constexpr bool is_valid_param_type_v =
-        std::is_same_v<T, float> || std::is_same_v<T, half_t> || std::is_same_v<T, bhalf_t> ||
-        std::is_same_v<T, int8_t> || std::is_same_v<T, int32_t>;
-
-    template <typename E, typename C, typename D>
-    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const
-    {
-        static_assert(is_valid_param_type_v<E> && is_valid_param_type_v<C> &&
-                      is_valid_param_type_v<D>);
-
-        const float y = GetFastGeLU(type_convert<float>(c) + type_convert<float>(d));
-
-        e = type_convert<E>(y);
-    }
-
-    template <typename D>
-    __host__ __device__ constexpr void operator()(float& e, const float& c, const D& d) const
-    {
-        static_assert(is_valid_param_type_v<D>);
-
-        e = GetFastGeLU(c + type_convert<float>(d));
-    }
-};
-#else
 // E = FastGelu(C + D)
 struct AddFastGelu
 {
@@ -345,7 +304,6 @@ struct AddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
 };
-#endif
 
 } // namespace element_wise
 } // namespace tensor_operation
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 7dd0d1cff16..2f60dfd72d6 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -195,45 +195,6 @@ struct AddMultiply
     }
 };
 
-#if 0
-// C = A * B
-// E = FastGelu(C + D0 + D1)
-struct AddAddFastGelu
-{
-    // Fast GeLU
-    // https://paperswithcode.com/method/gelu
-    // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
-    __host__ __device__ static constexpr float GetFastGeLU(float x)
-    {
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-        return x * cdf;
-    }
-
-    template <typename T>
-    static inline constexpr bool is_valid_param_type_v =
-        std::is_same_v<T, float> || std::is_same_v<T, half_t> || std::is_same_v<T, bhalf_t> ||
-        std::is_same_v<T, int8_t> || std::is_same_v<T, int32_t>
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-        || std::is_same_v<T, int4_t>
-#endif
-        ;
-
-    template <typename E, typename C, typename D0, typename D1>
-    __host__ __device__ constexpr void
-    operator()(E& e, const C& c, const D0& d0, const D1& d1) const
-    {
-        static_assert(is_valid_param_type_v<E> && is_valid_param_type_v<C> &&
-                      is_valid_param_type_v<D0> && is_valid_param_type_v<D1>);
-
-        const float y =
-            GetFastGeLU(type_convert<float>(c) + type_convert<float>(d0) + type_convert<float>(d1));
-
-        e = type_convert<E>(y);
-    }
-};
-#else
 // E = FastGelu(C + D0 + D1)
 struct AddAddFastGelu
 {
@@ -261,7 +222,6 @@ struct AddAddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
 };
-#endif
 
 struct Normalize
 {
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index b9bd19abcc5..803aedad396 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -11,7 +11,7 @@ namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
-#if CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+#if CK_WORKAROUND_SWDEV_383542
 extern "C" __device__ float __ocml_native_recip_f32(float);
 #endif
 
@@ -204,40 +204,6 @@ struct Relu
     }
 };
 
-#if 0
-// Y = FastGelu(X)
-struct FastGelu
-{
-    // Fast GeLU
-    // https://paperswithcode.com/method/gelu
-    // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
-    __host__ __device__ static constexpr float GetFastGeLU(float x)
-    {
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-        return x * cdf;
-    }
-
-    template <typename T>
-    static inline constexpr bool is_valid_param_type_v =
-        std::is_same_v<T, float> || std::is_same_v<T, half_t> || std::is_same_v<T, bhalf_t> ||
-        std::is_same_v<T, int8_t> || std::is_same_v<T, int32_t>
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-        || std::is_same_v<T, int4_t>
-#endif
-        ;
-
-    template <typename Y, typename X>
-    __host__ __device__ void operator()(Y& y, const X& x) const
-    {
-        static_assert(is_valid_param_type_v<Y> && is_valid_param_type_v<X>);
-
-        const float tmp_y = GetFastGeLU(type_convert<float>(x));
-        y = type_convert<Y>(tmp_y);
-    }
-};
-#else
 // Fast GeLU
 // https://paperswithcode.com/method/gelu
 // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
@@ -275,24 +241,16 @@ struct FastGelu
     template <>
     __device__ void operator()(float& y, const float& x) const
     {
-#if 0
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-
-        y = x * cdf;
-#else
         const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
         const float emu = __expf(-u);
 
-#if !CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+#if !CK_WORKAROUND_SWDEV_383542
         const float cdf = 0.5f + 0.5f * (2.f * __frcp_rn(1.f + emu) - 1.f);
 #else
         const float cdf = 0.5f + 0.5f * (2.f * __ocml_native_recip_f32(1.f + emu) - 1.f);
 #endif
 
         y = x * cdf;
-#endif
     }
 
     // device code, use lower precision "__expf" and "rcp"
@@ -306,7 +264,6 @@ struct FastGelu
         y = type_convert<half_t>(y_f);
     }
 };
-#endif
 
 // https://paperswithcode.com/method/gelu
 // y = 0.5*x*(1+erf(x/sqrt(2)))

From 58e0fa8abdf487e0e2220ed98bebcdbae2d82ed3 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 15 Feb 2023 19:39:45 -0500
Subject: [PATCH 04/10] clean

---
 .../gpu/element/binary_element_wise_operation.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 1d7b973806f..09cce54e60f 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 
 namespace ck {
 namespace tensor_operation {
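
Patch 3 renames the placeholder macro to the real ticket number, SWDEV-383542. The shape of the workaround is worth spelling out: the OCML device-library reciprocal is forward-declared with C linkage and selected by the macro, with __frcp_rn as the intended path once the compiler issue is fixed. A reduced HIP sketch (the macro and the OCML declaration mirror the patch; the surrounding kernel is hypothetical):

#include <hip/hip_runtime.h>

#define CK_WORKAROUND_SWDEV_383542 1

#if CK_WORKAROUND_SWDEV_383542
// ROCm device-library symbol; extern "C" avoids C++ name mangling so the
// reference resolves against the OCML bitcode at link time.
extern "C" __device__ float __ocml_native_recip_f32(float);
#endif

__device__ float reciprocal(float x)
{
#if !CK_WORKAROUND_SWDEV_383542
    return __frcp_rn(x); // intended path: HIP round-to-nearest reciprocal
#else
    return __ocml_native_recip_f32(x); // workaround: call the device library directly
#endif
}

__global__ void reciprocal_kernel(const float* in, float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n) { out[i] = reciprocal(in[i]); }
}
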
From c390260d871bfd851e2874181320dd1292e14350 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 16 Feb 2023 13:18:48 -0500
Subject: [PATCH 05/10] clean

---
 .../gpu/element/binary_element_wise_operation.hpp | 14 ++++++++++++++
 .../gpu/element/element_wise_operation.hpp        | 14 ++++++++++++++
 .../gpu/element/unary_element_wise_operation.hpp  |  1 -
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 09cce54e60f..79ab1e264d0 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -304,6 +304,20 @@ struct AddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()(half_t& e, const float& c, const half_t& d) const
+    {
+        const float x0_f = c + d;
+
+        float x1_f;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<half_t>(x1_f);
+    }
 };
 
 } // namespace element_wise
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 2f60dfd72d6..1586247d089 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -221,6 +221,20 @@ struct AddAddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        half_t& e, const float& c, const half_t& d0, const half_t& d1) const
+    {
+        const float x0_f = c + d0 + d1;
+
+        float x1_f;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<half_t>(x1_f);
+    }
 };
 
 struct Normalize
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 803aedad396..ef9764979ed 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -253,7 +253,6 @@ struct FastGelu
         y = x * cdf;
     }
 
-    // device code, use lower precision "__expf" and "rcp"
     template <>
    __device__ void operator()(half_t& y, const half_t& x) const
     {

From 90132e3da529f9fd744b99e373547d2a6130bd6a Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 16 Feb 2023 14:08:36 -0500
Subject: [PATCH 06/10] clean

---
 .../gpu/element/binary_element_wise_operation.hpp              | 2 +-
 .../ck/tensor_operation/gpu/element/element_wise_operation.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 79ab1e264d0..136017c6d17 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -311,7 +311,7 @@ struct AddFastGelu
     {
         const float x0_f = c + d;
 
-        float x1_f;
+        float x1_f = 0;
 
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
                                                                                          x0_f);
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 1586247d089..7021b1ccfbb 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -228,7 +228,7 @@ struct AddAddFastGelu
     {
         const float x0_f = c + d0 + d1;
 
-        float x1_f;
+        float x1_f = 0;
 
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
                                                                                          x0_f);
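
The overloads added in patches 5 and 6 exist because the GEMM hands the epilogue a float accumulator while the residuals arrive as half_t: the op widens everything to float, applies FastGelu once, and narrows once at the end (patch 6's `= 0` initializer just keeps `x1_f` defined on every path). The data flow, as a host-only sketch in which `float` stands in for ck::half_t and `static_cast` for ck::type_convert:

#include <cmath>
#include <cstdio>

// float -> float FastGelu, same polynomial as the patch
float fast_gelu_f32(float x)
{
    const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
    const float emu = std::exp(-u);
    return x * (0.5f + 0.5f * (2.f / (1.f + emu) - 1.f));
}

using half_stub = float; // placeholder for ck::half_t in this sketch

// Mirrors AddAddFastGelu::operator()(half_t&, const float&, const half_t&, const half_t&)
half_stub add_add_fastgelu(float c, half_stub d0, half_stub d1)
{
    // widen once, do all arithmetic in float
    const float x0_f = c + static_cast<float>(d0) + static_cast<float>(d1);

    float x1_f = 0; // initialized, as in patch 6

    x1_f = fast_gelu_f32(x0_f);

    return static_cast<half_stub>(x1_f); // narrow exactly once at the end
}

int main() { std::printf("%f\n", add_add_fastgelu(0.7f, 0.1f, 0.2f)); }
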
From 6fd75022f1b0218a2d32636c0f6791e2800657df Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 16 Feb 2023 17:46:11 -0500
Subject: [PATCH 07/10] clean

---
 .../gpu/element/unary_element_wise_operation.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index ef9764979ed..4da4b31ad71 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -262,6 +262,16 @@ struct FastGelu
         y = type_convert<half_t>(y_f);
     }
+
+    template <>
+    __device__ void operator()(half_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, x);
+
+        y = type_convert<half_t>(y_f);
+    }
 };
 
 // https://paperswithcode.com/method/gelu

From d8552699d88d40600adf105012222050f58c245f Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 22 Feb 2023 14:42:40 -0500
Subject: [PATCH 08/10] fix compilation

---
 .../gemm_add_add_fastgelu_xdl_bf16.cpp        | 11 +++----
 .../gemm_add_add_fastgelu_xdl_fp16.cpp        |  4 +--
 .../gemm_add_add_fastgelu_xdl_fp32.cpp        | 12 ++++----
 .../gemm_add_add_fastgelu_xdl_int4.cpp        | 11 +++----
 .../gemm_add_add_fastgelu_xdl_int8.cpp        | 11 +++----
 .../gpu/element/element_wise_operation.hpp    | 29 +++++++++++++++++++
 6 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
index 5e50c14dc2b..ba0476b9b9e 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -7,10 +7,11 @@ using ADataType        = BF16;
 using BDataType        = BF16;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
-using D0DataType       = BF16;
-using D1DataType       = BF16;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = BF16;
+using CDataType        = F32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = BF16;
+using D1DataType       = BF16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = BF16;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +37,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
index 1ef266f23df..fa651a34ea8 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -1,4 +1,3 @@
-// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -7,10 +6,11 @@
 using ADataType        = F32;
 using BDataType        = F32;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
-using D0DataType       = F32;
-using D1DataType       = F32;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = F32;
+using CDataType        = F32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = F32;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +36,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
@@ -11,7 +11,8 @@
-using D0DataType       = I4;
-using D1DataType       = I4;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = I4;
+using CDataType        = I32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = I4;
+using D1DataType       = I4;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = I4;
 
 using KernelADataType = I8;
 using KernelBDataType = I8;
@@ -47,7 +48,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
@@ -10,7 +10,8 @@
-using D0DataType       = I8;
-using D1DataType       = I8;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = I8;
+using CDataType        = I32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = I8;
+using D1DataType       = I8;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = I8;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +37,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -228,6 +228,35 @@ struct AddAddFastGelu
 
         e = type_convert<half_t>(x1_f);
     }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        bhalf_t& e, const float& c, const bhalf_t& d0, const bhalf_t& d1) const
+    {
+        const float x0_f = c + type_convert<float>(d0) + type_convert<float>(d1);
+
+        float x1_f = 0;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<bhalf_t>(x1_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        int8_t& e, const int32_t& c, const int8_t& d0, const int8_t& d1) const
+    {
+        const float x0_f =
+            type_convert<float>(c) + type_convert<float>(d0) + type_convert<float>(d1);
+
+        float x1_f = 0;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<int8_t>(x1_f);
+    }
 };
 
 struct Normalize
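
The int8 specialization added above follows the same recipe with an int32 accumulator: widen everything to float, one FastGelu, one narrowing conversion at the end. Worked numerically as a host-side sketch (`static_cast` stands in for ck::type_convert here; real int8 GEMMs also apply quantization scales, which this op leaves to the caller):

#include <cmath>
#include <cstdint>
#include <cstdio>

float fast_gelu_f32(float x)
{
    const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
    const float emu = std::exp(-u);
    return x * (0.5f + 0.5f * (2.f / (1.f + emu) - 1.f));
}

// Mirrors AddAddFastGelu::operator()(int8_t&, const int32_t&, const int8_t&, const int8_t&)
int8_t add_add_fastgelu_i8(int32_t c, int8_t d0, int8_t d1)
{
    const float x0_f =
        static_cast<float>(c) + static_cast<float>(d0) + static_cast<float>(d1);

    float x1_f = 0;

    x1_f = fast_gelu_f32(x0_f);

    return static_cast<int8_t>(x1_f);
}

int main()
{
    // gelu(5 + 2 - 1) = gelu(6) ~= 5.99999..., so static_cast's
    // truncation toward zero makes this print 5
    std::printf("%d\n", add_add_fastgelu_i8(5, 2, -1));
}
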
From a4772890ca4cf116bbde2e5e818f861be0369740 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 23 Feb 2023 16:09:08 -0500
Subject: [PATCH 09/10] clean

---
 .../element/unary_element_wise_operation.hpp | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 4da4b31ad71..baaac1b47a8 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -212,10 +212,7 @@ struct Relu
 struct FastGelu
 {
     template <typename Y, typename X>
-    __host__ void operator()(Y& y, const X& x) const;
-
-    template <typename Y, typename X>
-    __device__ void operator()(Y& y, const X& x) const;
+    __host__ __device__ void operator()(Y& y, const X& x) const;
 
     template <>
     __host__ void operator()(float& y, const float& x) const
@@ -227,16 +224,6 @@ struct FastGelu
         y = x * cdf;
     }
 
-    template <>
-    __host__ void operator()(half_t& y, const half_t& x) const
-    {
-        float y_f;
-
-        this->operator()(y_f, type_convert<float>(x));
-
-        y = type_convert<half_t>(y_f);
-    }
-
     // device code, use lower precision "__expf" and "rcp"
     template <>
     __device__ void operator()(float& y, const float& x) const
@@ -254,7 +241,7 @@ struct FastGelu
     }
 
     template <>
-    __device__ void operator()(half_t& y, const half_t& x) const
+    __host__ __device__ void operator()(half_t& y, const half_t& x) const
     {
         float y_f;
 
@@ -264,7 +251,7 @@ struct FastGelu
     }
 
     template <>
-    __device__ void operator()(half_t& y, const float& x) const
+    __host__ __device__ void operator()(half_t& y, const float& x) const
     {
         float y_f;
 

From 867a75c2b93c737e2c254a3fe9efd095b1e817c1 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Fri, 24 Feb 2023 13:07:37 -0500
Subject: [PATCH 10/10] clean

---
 .../element/unary_element_wise_operation.hpp | 29 +++++++++++++++++--
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index baaac1b47a8..6b4df3b60e3 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -212,7 +212,10 @@ struct Relu
 struct FastGelu
 {
     template <typename Y, typename X>
-    __host__ __device__ void operator()(Y& y, const X& x) const;
+    __host__ void operator()(Y& y, const X& x) const;
+
+    template <typename Y, typename X>
+    __device__ void operator()(Y& y, const X& x) const;
 
     template <>
     __host__ void operator()(float& y, const float& x) const
@@ -241,7 +244,17 @@ struct FastGelu
     }
 
     template <>
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const
+    __host__ void operator()(half_t& y, const half_t& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, type_convert<float>(x));
+
+        y = type_convert<half_t>(y_f);
+    }
+
+    template <>
+    __device__ void operator()(half_t& y, const half_t& x) const
     {
         float y_f;
 
@@ -251,7 +264,17 @@ struct FastGelu
     }
 
     template <>
-    __host__ __device__ void operator()(half_t& y, const float& x) const
+    __host__ void operator()(half_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, x);
+
+        y = type_convert<half_t>(y_f);
+    }
+
+    template <>
+    __device__ void operator()(half_t& y, const float& x) const
    {
         float y_f;
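
Patch 10 ends where patch 9 started: a single __host__ __device__ specialization cannot work here, because its body would have to call either __expf (device-only) or the accurate libm exp (host-only). Splitting the primary template into a __host__ set and a __device__ set lets each specialization commit to one side. The idiom in isolation (HIP-style sketch; in-class specializations written the way CK writes them, which hipcc/clang accepts):

#include <hip/hip_runtime.h>
#include <cmath>

struct FastGeluSketch
{
    // Two primaries that differ only in execution space.
    template <typename Y, typename X>
    __host__ void operator()(Y& y, const X& x) const;

    template <typename Y, typename X>
    __device__ void operator()(Y& y, const X& x) const;

    // Host side: accurate libm exp.
    template <>
    __host__ void operator()(float& y, const float& x) const
    {
        const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
        y = x * (0.5f + 0.5f * (2.f / (1.f + std::exp(-u)) - 1.f));
    }

    // Device side: fast __expf intrinsic, unavailable in host code.
    template <>
    __device__ void operator()(float& y, const float& x) const
    {
        const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
        y = x * (0.5f + 0.5f * (2.f / (1.f + __expf(-u)) - 1.f));
    }
};
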