From 3a4301659beaebf3b89cbbdd2162fd18c122d527 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 15 Feb 2023 13:46:00 -0500
Subject: [PATCH 01/10] clean up

---
 example/04_gemm_add_add_fastgelu/common.hpp |  2 +-
 .../gemm_add_add_fastgelu_xdl_fp16.cpp      | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/example/04_gemm_add_add_fastgelu/common.hpp
index 3f9375e0926..839587c1489 100644
--- a/example/04_gemm_add_add_fastgelu/common.hpp
+++ b/example/04_gemm_add_add_fastgelu/common.hpp
@@ -62,7 +62,7 @@ struct ExecutionConfig final
 };
 
 inline bool
-parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig config)
+parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config)
 {
     if(argc == 1)
     {
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index 6c7ca414448..ae3db1374f9 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -7,10 +7,11 @@ using ADataType        = F16;
 using BDataType        = F16;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
-using D0DataType       = F16;
-using D1DataType       = F16;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = F16;
+using CDataType        = F32; // C matrix doesn't exist in memory, this is used for host verification
+using D0DataType       = F16;
+using D1DataType       = F16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = F16;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +37,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
                                                                         AElementOp,
                                                                         BElementOp,
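
The `ExecutionConfig&` change in common.hpp above is the one behavioral fix in this patch: the parser previously took the struct by value, so every flag it parsed landed in a copy that was discarded when the function returned. A minimal self-contained sketch of that failure mode (the struct is reduced to two fields modeled loosely on the example's):

#include <cstdlib>
#include <iostream>

struct ExecutionConfig
{
    bool do_verification = true;
    int  init_method     = 1;
};

// Broken: 'config' is a copy; the caller never sees the parsed values.
bool parse_by_value(int argc, char* argv[], ExecutionConfig config)
{
    if(argc >= 2) { config.do_verification = std::atoi(argv[1]) != 0; }
    return true;
}

// Fixed: parse into the caller's object through a reference.
bool parse_by_reference(int argc, char* argv[], ExecutionConfig& config)
{
    if(argc >= 2) { config.do_verification = std::atoi(argv[1]) != 0; }
    return true;
}

int main(int argc, char* argv[])
{
    ExecutionConfig config;
    parse_by_value(argc, argv, config);     // leaves config untouched
    parse_by_reference(argc, argv, config); // actually updates config
    std::cout << "do_verification = " << config.do_verification << '\n';
}
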
From: Chao Liu
Date: Wed, 15 Feb 2023 15:49:46 -0500
Subject: [PATCH 02/10] fast gelu using builtin function

---
 .../gemm_add_add_fastgelu_xdl_fp16.cpp        |  4 +-
 .../run_gemm_add_add_fastgelu_example.inc     |  2 +-
 include/ck/ck.hpp                             |  3 +
 .../element/binary_element_wise_operation.hpp | 27 +++++++
 .../gpu/element/element_wise_operation.hpp    | 45 +++++++----
 .../element/unary_element_wise_operation.hpp  | 75 +++++++++++++++++++
 6 files changed, 139 insertions(+), 17 deletions(-)

diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
index ae3db1374f9..487f60d385e 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp
@@ -6,8 +6,8 @@
 using ADataType        = F16;
 using BDataType        = F16;
 using AccDataType      = F32;
-using CShuffleDataType = F32;
-using CDataType        = F32; // C matrix doesn't exist in memory, this is used for host verification
+using CShuffleDataType = F16;
+using CDataType        = F16; // C matrix doesn't exist in GPU memory, this is used for host verification
 using D0DataType       = F16;
 using D1DataType       = F16;
 using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
 using EDataType        = F16;
diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
index f3def33b567..cb3147bcd71 100644
--- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
 
     if(config.do_verification)
     {
-        Tensor<EDataType> c_m_n({M, N});
+        Tensor<CDataType> c_m_n({M, N});
 
         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index ffd7e74f123..c6118fbad81 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -168,6 +168,9 @@
 // tuning parameter
 #define CK_WORKAROUND_SWDEV_325164 0
 
+// workaround: compiler not emitting reciprocal instruction from __frcp_rn()
+#define CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN 1
+
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 69fa75c3fd3..cd761f08d4d 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -280,6 +280,7 @@ struct AddHardswish
     };
 };
 
+#if 0
 // C = A * B
 // E = FastGelu(C + D)
 struct AddFastGelu
 {
@@ -319,6 +320,32 @@ struct AddFastGelu
         e = GetFastGeLU(c + type_convert<float>(d));
     }
 };
+#else
+// E = FastGelu(C + D)
+struct AddFastGelu
+{
+    template <typename E, typename C, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()(float& e, const float& c, const float& d) const
+    {
+        const float x = c + d;
+
+        FastGelu{}.template operator()<float, float>(e, x);
+    }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()(half_t& e, const half_t& c, const half_t& d) const
+    {
+        const half_t x = c + d;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
+    }
+};
+#endif
 
 } // namespace element_wise
 } // namespace tensor_operation
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 7f3d450a39d..7dd0d1cff16 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -16,7 +16,7 @@ namespace element_wise {
 // Need to ensure compiler will fail if there is no matching candidate, instead of compiler
 // silently do implicit type conversion
 //
-// Method 1:
+// Example:
 //
 // struct ExampleElementwiseOp
 // {
 //     __host__ __device__ constexpr void operator()(ck::half_t& y, const ck::half_t& x) const;
 //
 //     template <typename Y, typename X>
 //     __host__ __device__ constexpr void operator()(float& y, ck::bhalf_t& x) const
 //     {
 //     }
 // };
-//
-// Method 2:
-//
-// template <typename Y, typename X>
-// struct ExampleElementwiseOp;
-//
-// template <>
-// struct ExampleElementwiseOp<float, ck::bhalf_t>
-// {
-//     __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
-//     {
-//     }
-// };
 
 struct AddReluAdd
 {
@@ -208,6 +195,7 @@ struct AddMultiply
     }
 };
 
+#if 0
 // C = A * B
 // E = FastGelu(C + D0 + D1)
 struct AddAddFastGelu
 {
@@ -245,6 +233,35 @@ struct AddAddFastGelu
         e = type_convert<E>(y);
     }
 };
+#else
+// E = FastGelu(C + D0 + D1)
+struct AddAddFastGelu
+{
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()(float& e,
+                                                  const float& c,
+                                                  const float& d0,
+                                                  const float& d1) const
+    {
+        const float x = c + d0 + d1;
+
+        FastGelu{}.template operator()<float, float>(e, x);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        half_t& e, const half_t& c, const half_t& d0, const half_t& d1) const
+    {
+        const half_t x = c + d0 + d1;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
+    }
+};
+#endif
 
 struct Normalize
 {
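
The rewritten AddFastGelu and AddAddFastGelu follow the idiom the header comment describes: the primary operator() template is only declared, and each supported type combination is an explicit specialization, so a call with any other types fails to build instead of going through a silent implicit conversion. A standalone sketch of the pattern (plain C++; the op and its body are hypothetical):

#include <iostream>

struct ClampOp
{
    // Declared but never defined: unsupported type combinations
    // fail at link time rather than silently converting.
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const;
};

// The only supported combination gets a definition.
template <>
void ClampOp::operator()<float, float>(float& y, const float& x) const
{
    y = x < 0.f ? 0.f : (x > 1.f ? 1.f : x);
}

int main()
{
    ClampOp op;
    float y = 0.f;
    op(y, 1.5f);            // resolves to the float/float specialization
    std::cout << y << '\n'; // prints 1

    // double d = 0.0;
    // op(d, 1.5f); // would deduce <double, float> and fail to link
}
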
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 2167a79e019..b9bd19abcc5 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -11,6 +11,10 @@ namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
+#if CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+extern "C" __device__ float __ocml_native_recip_f32(float);
+#endif
+
 struct PassThrough
 {
     template <typename T>
@@ -200,6 +204,7 @@ struct Relu
     }
 };
 
+#if 0
 // Y = FastGelu(X)
 struct FastGelu
 {
@@ -232,6 +237,76 @@ struct FastGelu
         y = type_convert<Y>(tmp_y);
     }
 };
+#else
+// Fast GeLU
+// https://paperswithcode.com/method/gelu
+// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+// host code uses higher accuracy "exp" and "div"
+// gpu code uses lower accuracy "__expf" and "rcp" functions
+struct FastGelu
+{
+    template <typename Y, typename X>
+    __host__ void operator()(Y& y, const X& x) const;
+
+    template <typename Y, typename X>
+    __device__ void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ void operator()(float& y, const float& x) const
+    {
+        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const float emu = exp(-u);
+        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
+
+        y = x * cdf;
+    }
+
+    template <>
+    __host__ void operator()(half_t& y, const half_t& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, type_convert<float>(x));
+
+        y = type_convert<half_t>(y_f);
+    }
+
+    // device code, use lower precision "__expf" and "rcp"
+    template <>
+    __device__ void operator()(float& y, const float& x) const
+    {
+#if 0
+        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const float emu = exp(-u);
+        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
+
+        y = x * cdf;
+#else
+        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
+        const float emu = __expf(-u);
+
+#if !CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+        const float cdf = 0.5f + 0.5f * (2.f * __frcp_rn(1.f + emu) - 1.f);
+#else
+        const float cdf = 0.5f + 0.5f * (2.f * __ocml_native_recip_f32(1.f + emu) - 1.f);
+#endif
+
+        y = x * cdf;
+#endif
+    }
+
+    // device code, use lower precision "__expf" and "rcp"
+    template <>
+    __device__ void operator()(half_t& y, const half_t& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, type_convert<float>(x));
+
+        y = type_convert<half_t>(y_f);
+    }
+};
+#endif
 
 // https://paperswithcode.com/method/gelu
 // y = 0.5*x*(1+erf(x/sqrt(2)))
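
The device path above exists because tanh is expensive: the patch evaluates tanh through the identity tanh(u/2) = 2/(1+e^-u) - 1, which leaves one exponential and one reciprocal, exactly the shapes __expf and the rcp builtin cover. A host-only check of that algebra, using the constants from the patch (standard C++, double precision):

#include <cmath>
#include <cstdio>

// Textbook fast-GeLU: y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
double gelu_tanh(double x)
{
    const double c = 0.7978845608028654; // sqrt(2/pi)
    return 0.5 * x * (1.0 + std::tanh(c * (x + 0.044715 * x * x * x)));
}

// Patch form: u = 2*x*(0.035677*x^2 + 0.797885), where
// 0.035677 ~= 0.044715*sqrt(2/pi); then tanh(u/2) = 2/(1+exp(-u)) - 1.
double gelu_exp(double x)
{
    const double u   = 2.0 * x * (0.035677 * x * x + 0.797885);
    const double emu = std::exp(-u);
    const double cdf = 0.5 + 0.5 * (2.0 / (1.0 + emu) - 1.0);
    return x * cdf;
}

int main()
{
    // The two forms agree to the precision of the truncated constants.
    for(double x = -4.0; x <= 4.0; x += 1.0)
    {
        std::printf("x=% .1f  tanh-form=% .7f  exp-form=% .7f\n", x, gelu_tanh(x), gelu_exp(x));
    }
}
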
From 87f44ead9cb11a668bc82cdb53dcbc8f8a1a7de2 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 15 Feb 2023 17:03:23 -0500
Subject: [PATCH 03/10] clean

---
 include/ck/ck.hpp                             |  2 +-
 .../element/binary_element_wise_operation.hpp | 42 -----------------
 .../gpu/element/element_wise_operation.hpp    | 40 ----------------
 .../element/unary_element_wise_operation.hpp  | 47 +------------------
 4 files changed, 3 insertions(+), 128 deletions(-)

diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index c6118fbad81..1257a776493 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -169,7 +169,7 @@
 #define CK_WORKAROUND_SWDEV_325164 0
 
 // workaround: compiler not emitting reciprocal instruction from __frcp_rn()
-#define CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN 1
+#define CK_WORKAROUND_SWDEV_383542 1
 
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index cd761f08d4d..1d7b973806f 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -280,47 +280,6 @@ struct AddHardswish
     };
 };
 
-#if 0
-// C = A * B
-// E = FastGelu(C + D)
-struct AddFastGelu
-{
-    // Fast GeLU
-    // https://paperswithcode.com/method/gelu
-    // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
-    __host__ __device__ static constexpr float GetFastGeLU(float x)
-    {
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-        return x * cdf;
-    }
-
-    template <typename T>
-    static inline constexpr bool is_valid_param_type_v =
-        std::is_same_v<T, float> || std::is_same_v<T, half_t> || std::is_same_v<T, bhalf_t> ||
-        std::is_same_v<T, int8_t> || std::is_same_v<T, int32_t>;
-
-    template <typename E, typename C, typename D>
-    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const
-    {
-        static_assert(is_valid_param_type_v<E> && is_valid_param_type_v<C> &&
-                      is_valid_param_type_v<D>);
-
-        const float y = GetFastGeLU(type_convert<float>(c) + type_convert<float>(d));
-
-        e = type_convert<E>(y);
-    }
-
-    template <typename D>
-    __host__ __device__ constexpr void operator()(float& e, const float& c, const D& d) const
-    {
-        static_assert(is_valid_param_type_v<D>);
-
-        e = GetFastGeLU(c + type_convert<float>(d));
-    }
-};
-#else
 // E = FastGelu(C + D)
 struct AddFastGelu
 {
@@ -345,7 +304,6 @@ struct AddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
 };
-#endif
 
 } // namespace element_wise
 } // namespace tensor_operation
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 7dd0d1cff16..2f60dfd72d6 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -195,45 +195,6 @@ struct AddMultiply
     }
 };
 
-#if 0
-// C = A * B
-// E = FastGelu(C + D0 + D1)
-struct AddAddFastGelu
-{
-    // Fast GeLU
-    // https://paperswithcode.com/method/gelu
-    // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
-    __host__ __device__ static constexpr float GetFastGeLU(float x)
-    {
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-        return x * cdf;
-    }
-
-    template <typename T>
-    static inline constexpr bool is_valid_param_type_v =
-        std::is_same_v<T, float> || std::is_same_v<T, half_t> || std::is_same_v<T, bhalf_t> ||
-        std::is_same_v<T, int8_t> || std::is_same_v<T, int32_t>
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-        || std::is_same_v<T, int4_t>
-#endif
-        ;
-
-    template <typename E, typename C, typename D0, typename D1>
-    __host__ __device__ constexpr void
-    operator()(E& e, const C& c, const D0& d0, const D1& d1) const
-    {
-        static_assert(is_valid_param_type_v<E> && is_valid_param_type_v<C> &&
-                      is_valid_param_type_v<D0> && is_valid_param_type_v<D1>);
-
-        const float y =
-            GetFastGeLU(type_convert<float>(c) + type_convert<float>(d0) + type_convert<float>(d1));
-
-        e = type_convert<E>(y);
-    }
-};
-#else
 // E = FastGelu(C + D0 + D1)
 struct AddAddFastGelu
 {
@@ -261,7 +222,6 @@ struct AddAddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
 };
-#endif
 
 struct Normalize
 {
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index b9bd19abcc5..803aedad396 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -11,7 +11,7 @@ namespace ck {
 namespace tensor_operation {
 namespace element_wise {
 
-#if CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+#if CK_WORKAROUND_SWDEV_383542
 extern "C" __device__ float __ocml_native_recip_f32(float);
 #endif
 
@@ -204,40 +204,6 @@ struct Relu
     }
 };
 
-#if 0
-// Y = FastGelu(X)
-struct FastGelu
-{
-    // Fast GeLU
-    // https://paperswithcode.com/method/gelu
-    // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
-    __host__ __device__ static constexpr float GetFastGeLU(float x)
-    {
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-        return x * cdf;
-    }
-
-    template <typename T>
-    static inline constexpr bool is_valid_param_type_v =
-        std::is_same_v<T, float> || std::is_same_v<T, half_t> || std::is_same_v<T, bhalf_t> ||
-        std::is_same_v<T, int8_t> || std::is_same_v<T, int32_t>
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
-        || std::is_same_v<T, int4_t>
-#endif
-        ;
-
-    template <typename Y, typename X>
-    __host__ __device__ void operator()(Y& y, const X& x) const
-    {
-        static_assert(is_valid_param_type_v<Y> && is_valid_param_type_v<X>);
-
-        const float tmp_y = GetFastGeLU(type_convert<float>(x));
-        y = type_convert<Y>(tmp_y);
-    }
-};
-#else
 // Fast GeLU
 // https://paperswithcode.com/method/gelu
 // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
@@ -275,24 +241,16 @@ struct FastGelu
     template <>
     __device__ void operator()(float& y, const float& x) const
     {
-#if 0
-        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
-        const float emu = exp(-u);
-        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
-
-        y = x * cdf;
-#else
         const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
         const float emu = __expf(-u);
 
-#if !CK_WORKAROUND_SWDEV_XXXXXX_FRCP_RN
+#if !CK_WORKAROUND_SWDEV_383542
         const float cdf = 0.5f + 0.5f * (2.f * __frcp_rn(1.f + emu) - 1.f);
 #else
         const float cdf = 0.5f + 0.5f * (2.f * __ocml_native_recip_f32(1.f + emu) - 1.f);
 #endif
 
         y = x * cdf;
-#endif
     }
 
     // device code, use lower precision "__expf" and "rcp"
@@ -306,7 +264,6 @@ struct FastGelu
         y = type_convert<half_t>(y_f);
     }
 };
-#endif
 
 // https://paperswithcode.com/method/gelu
 // y = 0.5*x*(1+erf(x/sqrt(2)))

From 58e0fa8abdf487e0e2220ed98bebcdbae2d82ed3 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 15 Feb 2023 19:39:45 -0500
Subject: [PATCH 04/10] clean

---
 .../gpu/element/binary_element_wise_operation.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 1d7b973806f..09cce54e60f 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 
 namespace ck {
 namespace tensor_operation {
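
Patch 3 renames the placeholder macro to the real ticket number, SWDEV-383542. The shape of the workaround is worth spelling out: the OCML device-library reciprocal is forward-declared with C linkage and selected by the macro, with __frcp_rn as the intended path once the compiler issue is fixed. A reduced HIP sketch (the macro and the OCML declaration mirror the patch; the surrounding kernel is hypothetical):

#include <hip/hip_runtime.h>

#define CK_WORKAROUND_SWDEV_383542 1

#if CK_WORKAROUND_SWDEV_383542
// ROCm device-library symbol; extern "C" avoids C++ name mangling so the
// reference resolves against the OCML bitcode at link time.
extern "C" __device__ float __ocml_native_recip_f32(float);
#endif

__device__ float reciprocal(float x)
{
#if !CK_WORKAROUND_SWDEV_383542
    return __frcp_rn(x); // intended path: HIP round-to-nearest reciprocal
#else
    return __ocml_native_recip_f32(x); // workaround: call the device library directly
#endif
}

__global__ void reciprocal_kernel(const float* in, float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n) { out[i] = reciprocal(in[i]); }
}
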
From c390260d871bfd851e2874181320dd1292e14350 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 16 Feb 2023 13:18:48 -0500
Subject: [PATCH 05/10] clean

---
 .../gpu/element/binary_element_wise_operation.hpp | 14 ++++++++++++++
 .../gpu/element/element_wise_operation.hpp        | 14 ++++++++++++++
 .../gpu/element/unary_element_wise_operation.hpp  |  1 -
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 09cce54e60f..79ab1e264d0 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -304,6 +304,20 @@ struct AddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()(half_t& e, const float& c, const half_t& d) const
+    {
+        const float x0_f = c + d;
+
+        float x1_f;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<half_t>(x1_f);
+    }
 };
 
 } // namespace element_wise
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 2f60dfd72d6..1586247d089 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -221,6 +221,20 @@ struct AddAddFastGelu
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<half_t, half_t>(e, x);
     }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        half_t& e, const float& c, const half_t& d0, const half_t& d1) const
+    {
+        const float x0_f = c + d0 + d1;
+
+        float x1_f;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<half_t>(x1_f);
+    }
 };
 
 struct Normalize
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 803aedad396..ef9764979ed 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -253,7 +253,6 @@ struct FastGelu
         y = x * cdf;
     }
 
-    // device code, use lower precision "__expf" and "rcp"
     template <>
    __device__ void operator()(half_t& y, const half_t& x) const
     {

From 90132e3da529f9fd744b99e373547d2a6130bd6a Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 16 Feb 2023 14:08:36 -0500
Subject: [PATCH 06/10] clean

---
 .../gpu/element/binary_element_wise_operation.hpp              | 2 +-
 .../ck/tensor_operation/gpu/element/element_wise_operation.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
index 79ab1e264d0..136017c6d17 100644
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -311,7 +311,7 @@ struct AddFastGelu
     {
         const float x0_f = c + d;
 
-        float x1_f;
+        float x1_f = 0;
 
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
                                                                                          x0_f);
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 1586247d089..7021b1ccfbb 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -228,7 +228,7 @@ struct AddAddFastGelu
     {
         const float x0_f = c + d0 + d1;
 
-        float x1_f;
+        float x1_f = 0;
 
         ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
                                                                                          x0_f);
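
The overloads added in patches 5 and 6 exist because the GEMM hands the epilogue a float accumulator while the residuals arrive as half_t: the op widens everything to float, applies FastGelu once, and narrows once at the end (patch 6's `= 0` initializer just keeps `x1_f` defined on every path). The data flow, as a host-only sketch in which `float` stands in for ck::half_t and `static_cast` for ck::type_convert:

#include <cmath>
#include <cstdio>

// float -> float FastGelu, same polynomial as the patch
float fast_gelu_f32(float x)
{
    const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
    const float emu = std::exp(-u);
    return x * (0.5f + 0.5f * (2.f / (1.f + emu) - 1.f));
}

using half_stub = float; // placeholder for ck::half_t in this sketch

// Mirrors AddAddFastGelu::operator()(half_t&, const float&, const half_t&, const half_t&)
half_stub add_add_fastgelu(float c, half_stub d0, half_stub d1)
{
    // widen once, do all arithmetic in float
    const float x0_f = c + static_cast<float>(d0) + static_cast<float>(d1);

    float x1_f = 0; // initialized, as in patch 6

    x1_f = fast_gelu_f32(x0_f);

    return static_cast<half_stub>(x1_f); // narrow exactly once at the end
}

int main() { std::printf("%f\n", add_add_fastgelu(0.7f, 0.1f, 0.2f)); }
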
From 6fd75022f1b0218a2d32636c0f6791e2800657df Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 16 Feb 2023 17:46:11 -0500
Subject: [PATCH 07/10] clean

---
 .../gpu/element/unary_element_wise_operation.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index ef9764979ed..4da4b31ad71 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -262,6 +262,16 @@ struct FastGelu
         y = type_convert<half_t>(y_f);
     }
+
+    template <>
+    __device__ void operator()(half_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, x);
+
+        y = type_convert<half_t>(y_f);
+    }
 };
 
 // https://paperswithcode.com/method/gelu

From d8552699d88d40600adf105012222050f58c245f Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Wed, 22 Feb 2023 14:42:40 -0500
Subject: [PATCH 08/10] fix compilation

---
 .../gemm_add_add_fastgelu_xdl_bf16.cpp        | 11 +++----
 .../gemm_add_add_fastgelu_xdl_fp16.cpp        |  4 +--
 .../gemm_add_add_fastgelu_xdl_fp32.cpp        | 12 ++++----
 .../gemm_add_add_fastgelu_xdl_int4.cpp        | 11 +++----
 .../gemm_add_add_fastgelu_xdl_int8.cpp        | 11 +++----
 .../gpu/element/element_wise_operation.hpp    | 29 +++++++++++++++++++
 6 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
index 5e50c14dc2b..ba0476b9b9e 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
@@ -7,10 +7,11 @@ using ADataType        = BF16;
 using BDataType        = BF16;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
-using D0DataType       = BF16;
-using D1DataType       = BF16;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = BF16;
+using CDataType        = F32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = BF16;
+using D1DataType       = BF16;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = BF16;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +37,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
index 1ef266f23df..fa651a34ea8 100644
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp
@@ -1,4 +1,3 @@
-// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -7,10 +6,11 @@
 using ADataType        = F32;
 using BDataType        = F32;
 using AccDataType      = F32;
 using CShuffleDataType = F32;
-using D0DataType       = F32;
-using D1DataType       = F32;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = F32;
+using CDataType        = F32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = F32;
+using D1DataType       = F32;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = F32;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +36,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp
@@ -11,7 +11,8 @@
-using D0DataType       = I4;
-using D1DataType       = I4;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = I4;
+using CDataType        = I32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = I4;
+using D1DataType       = I4;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = I4;
 
 using KernelADataType = I8;
 using KernelBDataType = I8;
@@ -47,7 +48,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
--- a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
+++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp
@@ -10,7 +10,8 @@
-using D0DataType       = I8;
-using D1DataType       = I8;
-using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
-using EDataType        = I8;
+using CDataType        = I32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using D0DataType       = I8;
+using D1DataType       = I8;
+using DsDataType       = ck::Tuple<D0DataType, D1DataType>;
+using EDataType        = I8;
 
 using ALayout = Row;
 using BLayout = Col;
@@ -36,7 +37,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
-                                                                        EDataType,
+                                                                        CDataType,
                                                                         AccDataType,
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -228,6 +228,35 @@ struct AddAddFastGelu
 
         e = type_convert<half_t>(x1_f);
     }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        bhalf_t& e, const float& c, const bhalf_t& d0, const bhalf_t& d1) const
+    {
+        const float x0_f = c + type_convert<float>(d0) + type_convert<float>(d1);
+
+        float x1_f = 0;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<bhalf_t>(x1_f);
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()(
+        int8_t& e, const int32_t& c, const int8_t& d0, const int8_t& d1) const
+    {
+        const float x0_f =
+            type_convert<float>(c) + type_convert<float>(d0) + type_convert<float>(d1);
+
+        float x1_f = 0;
+
+        ck::tensor_operation::element_wise::FastGelu{}.template operator()<float, float>(x1_f,
+                                                                                         x0_f);
+
+        e = type_convert<int8_t>(x1_f);
+    }
 };
 
 struct Normalize
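
The int8 specialization added above follows the same recipe with an int32 accumulator: widen everything to float, one FastGelu, one narrowing conversion at the end. Worked numerically as a host-side sketch (`static_cast` stands in for ck::type_convert here; real int8 GEMMs also apply quantization scales, which this op leaves to the caller):

#include <cmath>
#include <cstdint>
#include <cstdio>

float fast_gelu_f32(float x)
{
    const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
    const float emu = std::exp(-u);
    return x * (0.5f + 0.5f * (2.f / (1.f + emu) - 1.f));
}

// Mirrors AddAddFastGelu::operator()(int8_t&, const int32_t&, const int8_t&, const int8_t&)
int8_t add_add_fastgelu_i8(int32_t c, int8_t d0, int8_t d1)
{
    const float x0_f =
        static_cast<float>(c) + static_cast<float>(d0) + static_cast<float>(d1);

    float x1_f = 0;

    x1_f = fast_gelu_f32(x0_f);

    return static_cast<int8_t>(x1_f);
}

int main()
{
    // gelu(5 + 2 - 1) = gelu(6) ~= 5.99999..., so static_cast's
    // truncation toward zero makes this print 5
    std::printf("%d\n", add_add_fastgelu_i8(5, 2, -1));
}
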
From a4772890ca4cf116bbde2e5e818f861be0369740 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 23 Feb 2023 16:09:08 -0500
Subject: [PATCH 09/10] clean

---
 .../element/unary_element_wise_operation.hpp | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 4da4b31ad71..baaac1b47a8 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -212,10 +212,7 @@ struct Relu
 struct FastGelu
 {
     template <typename Y, typename X>
-    __host__ void operator()(Y& y, const X& x) const;
-
-    template <typename Y, typename X>
-    __device__ void operator()(Y& y, const X& x) const;
+    __host__ __device__ void operator()(Y& y, const X& x) const;
 
     template <>
     __host__ void operator()(float& y, const float& x) const
@@ -227,16 +224,6 @@ struct FastGelu
         y = x * cdf;
     }
 
-    template <>
-    __host__ void operator()(half_t& y, const half_t& x) const
-    {
-        float y_f;
-
-        this->operator()(y_f, type_convert<float>(x));
-
-        y = type_convert<half_t>(y_f);
-    }
-
     // device code, use lower precision "__expf" and "rcp"
     template <>
     __device__ void operator()(float& y, const float& x) const
@@ -254,7 +241,7 @@ struct FastGelu
     }
 
     template <>
-    __device__ void operator()(half_t& y, const half_t& x) const
+    __host__ __device__ void operator()(half_t& y, const half_t& x) const
     {
         float y_f;
 
@@ -264,7 +251,7 @@ struct FastGelu
     }
 
     template <>
-    __device__ void operator()(half_t& y, const float& x) const
+    __host__ __device__ void operator()(half_t& y, const float& x) const
     {
         float y_f;
 

From 867a75c2b93c737e2c254a3fe9efd095b1e817c1 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Fri, 24 Feb 2023 13:07:37 -0500
Subject: [PATCH 10/10] clean

---
 .../element/unary_element_wise_operation.hpp | 29 +++++++++++++++++--
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index baaac1b47a8..6b4df3b60e3 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -212,7 +212,10 @@ struct Relu
 struct FastGelu
 {
     template <typename Y, typename X>
-    __host__ __device__ void operator()(Y& y, const X& x) const;
+    __host__ void operator()(Y& y, const X& x) const;
+
+    template <typename Y, typename X>
+    __device__ void operator()(Y& y, const X& x) const;
 
     template <>
     __host__ void operator()(float& y, const float& x) const
@@ -241,7 +244,17 @@ struct FastGelu
     }
 
     template <>
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const
+    __host__ void operator()(half_t& y, const half_t& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, type_convert<float>(x));
+
+        y = type_convert<half_t>(y_f);
+    }
+
+    template <>
+    __device__ void operator()(half_t& y, const half_t& x) const
     {
         float y_f;
 
@@ -251,7 +264,17 @@ struct FastGelu
     }
 
     template <>
-    __host__ __device__ void operator()(half_t& y, const float& x) const
+    __host__ void operator()(half_t& y, const float& x) const
+    {
+        float y_f;
+
+        this->operator()(y_f, x);
+
+        y = type_convert<half_t>(y_f);
+    }
+
+    template <>
+    __device__ void operator()(half_t& y, const float& x) const
    {
         float y_f;
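
Patch 10 ends where patch 9 started: a single __host__ __device__ specialization cannot work here, because its body would have to call either __expf (device-only) or the accurate libm exp (host-only). Splitting the primary template into a __host__ set and a __device__ set lets each specialization commit to one side. The idiom in isolation (HIP-style sketch; in-class specializations written the way CK writes them, which hipcc/clang accepts):

#include <hip/hip_runtime.h>
#include <cmath>

struct FastGeluSketch
{
    // Two primaries that differ only in execution space.
    template <typename Y, typename X>
    __host__ void operator()(Y& y, const X& x) const;

    template <typename Y, typename X>
    __device__ void operator()(Y& y, const X& x) const;

    // Host side: accurate libm exp.
    template <>
    __host__ void operator()(float& y, const float& x) const
    {
        const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
        y = x * (0.5f + 0.5f * (2.f / (1.f + std::exp(-u)) - 1.f));
    }

    // Device side: fast __expf intrinsic, unavailable in host code.
    template <>
    __device__ void operator()(float& y, const float& x) const
    {
        const float u = 2.f * x * (0.035677f * x * x + 0.797885f);
        y = x * (0.5f + 0.5f * (2.f / (1.f + __expf(-u)) - 1.f));
    }
};
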