From c19ea8e37fc8b4cfa9430279b7974ecb588d489f Mon Sep 17 00:00:00 2001 From: ltqin Date: Tue, 19 Apr 2022 15:34:19 +0800 Subject: [PATCH 01/32] add intrin_mfma_f64_16x16x4f64 --- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 23 +++++++++++++++++++ include/ck/utility/amd_xdlops.hpp | 13 +++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 9d72abb72ea..cc993e012ba 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -25,6 +25,7 @@ enum struct MfmaInstr mfma_f32_16x16x8bf16, mfma_i32_32x32x8i8, mfma_i32_16x16x16i8, + mfma_f64_16x16x4f64, }; template @@ -383,6 +384,28 @@ struct mfma_type } }; +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = group_size * num_groups_per_blk; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_per_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f64_16x16x4f64::Run(a, b, reg_c); + } +}; + template struct MfmaSelector { diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 91d109bae10..09f10dd32f3 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -294,5 +294,18 @@ struct intrin_mfma_i32_16x16x16i8<16, 16> } }; +template +struct intrin_mfma_f64_16x16x4f64; + +template <> +struct intrin_mfma_f64_16x16x4f64<16, 16> +{ + template + __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; } // namespace ck #endif From bf5af9f91cf2ab0970bd272a5329001188a9f8be Mon Sep 17 00:00:00 2001 From: ltqin Date: Wed, 20 Apr 2022 15:42:48 +0800 Subject: [PATCH 02/32] add example --- example/01_gemm/CMakeLists.txt | 1 + example/01_gemm/gemm_xdl_fp64.cpp | 205 ++++++++++++++++++ .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 16 +- include/ck/utility/amd_xdlops.hpp | 2 +- 4 files changed, 218 insertions(+), 6 deletions(-) create mode 100644 example/01_gemm/gemm_xdl_fp64.cpp diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 696d3bac42d..6da2ecb0c52 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -1,3 +1,4 @@ add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp) add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp) +add_example_executable(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp new file mode 100644 index 00000000000..f62c787d2e5 --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include 
"host_tensor_generator.hpp" +#include "device_tensor.hpp" +#include "device_gemm_xdl.hpp" +#include "device_gemm_xdl_c_shuffle.hpp" +#include "device_gemm_xdl_cshuffle.hpp" +#include "element_wise_operation.hpp" +#include "reference_gemm.hpp" +#include "gemm_specialization.hpp" + +template +using S = ck::Sequence; + +using F64 = double; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = double; +using BDataType = double; +using CDataType = double; +using AccDataType = double; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl +//##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| +//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| +//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| +//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: run kernel # of times (>1)\n"); + printf("arg4 to 
9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1, stride})); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + } + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, nrepeat); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + if(do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + } + + return 0; +} diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index cc993e012ba..cb3ca891324 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -389,14 +389,14 @@ struct mfma_type { static constexpr index_t group_size = 4; static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = group_size * num_groups_per_blk; + static constexpr index_t num_regs_per_blk = 4; //group_size * num_groups_per_blk; static constexpr index_t num_threads_per_blk = 16; static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = wave_size / num_threads_per_blk; + static constexpr index_t num_input_blks = 4; //wave_size / num_threads_per_blk; static constexpr index_t num_output_blks = 1; static constexpr index_t m_per_blk = 16; static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 4; + static constexpr index_t k_per_blk = 1; static constexpr bool is_k_reduction = true; template @@ -412,6 +412,12 @@ struct MfmaSelector template static constexpr auto GetMfma(); + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f64_16x16x4f64; + } + template <> static constexpr auto GetMfma() { @@ -684,9 +690,9 @@ struct XdlopsGemm template __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { - static_assert(is_same::value || is_same::value || + static_assert(is_same::value ||is_same::value || is_same::value || is_same::value || is_same::value, - "base base_type must be float, half, bfloat16, and int8_t!"); + "base base_type must be double, float, half, bfloat16, and int8_t!"); static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { mfma_instr.template run(p_a_wave[k], p_b_wave[k], p_c_thread); diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 09f10dd32f3..7ab6f7f7cce 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -298,7 +298,7 @@ template struct intrin_mfma_f64_16x16x4f64; template <> -struct intrin_mfma_f64_16x16x4f64<16, 16> +struct intrin_mfma_f64_16x16x4f64<32, 32> { template __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c) From 6c8ca54bb6ed2d2bbc2853ed20e0d377cf7c390d Mon Sep 17 00:00:00 2001 From: ltqin Date: Wed, 20 Apr 2022 22:52:16 +0800 Subject: [PATCH 03/32] gemm reference add double data type --- 
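[Editorial note, not part of the original series: this patch makes the host-side ReferenceGemm take an explicit AccDataType template parameter (and updates all of its callers), so that the fp64 kernel is verified against a host result that also accumulates in double rather than in the previously hard-coded float. A minimal sketch of the idea, with a hypothetical free-function name:

    // Hypothetical illustration only; the real ReferenceGemm in CK additionally
    // applies the A/B/C element-wise operators and works on Tensor objects.
    template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
    void reference_gemm_mk_kn_mn(const ADataType* a, const BDataType* b, CDataType* c,
                                 int M, int N, int K)
    {
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
            {
                AccDataType v_acc = 0; // was 'float v_acc = 0;' before this patch
                for(int k = 0; k < K; ++k)
                    v_acc += static_cast<AccDataType>(a[m * K + k]) *
                             static_cast<AccDataType>(b[k * N + n]);
                c[m * N + n] = static_cast<CDataType>(v_acc);
            }
    }

With AccDataType = double the reference keeps full fp64 precision, whereas a float accumulator would make large-K fp64 problems fail verification on rounding error alone.]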
example/01_gemm/gemm_xdl_bf16.cpp | 2 +- example/01_gemm/gemm_xdl_fp16.cpp | 2 +- example/01_gemm/gemm_xdl_fp64.cpp | 2 +- example/01_gemm/gemm_xdl_int8.cpp | 9 ++++++-- .../conv2d_bwd_weight_xdl.cpp | 9 ++++++-- .../gemm_xdl_requant_relu_requant_int8.cpp | 9 ++++++-- .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 2 +- .../16_gemm_reduce/gemm_reduce_xdl_fp16.cpp | 2 +- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 23 ++++++++++--------- .../cpu/reference_gemm.hpp | 9 ++++---- profiler/include/profile_gemm_impl.hpp | 12 ++++++++-- profiler/include/profile_gemm_reduce_impl.hpp | 9 ++++++-- .../include/profile_grouped_gemm_impl.hpp | 2 ++ profiler/src/profile_gemm.cpp | 16 +++++++++++++ test/gemm/gemm_fp16.cpp | 11 ++++++--- test/gemm/gemm_fp32.cpp | 15 ++++++++---- test/gemm/gemm_int8.cpp | 15 ++++++++---- test/gemm/gemm_util.hpp | 3 +++ test/grouped_gemm/grouped_gemm_fp16.cpp | 9 ++++++-- 19 files changed, 116 insertions(+), 45 deletions(-) diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp index 8f0631c1cec..fd24d516222 100644 --- a/example/01_gemm/gemm_xdl_bf16.cpp +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -81,7 +81,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index 2d5a95e400c..1e5c1adb715 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index f62c787d2e5..579dd186ef8 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp index 724757565ea..3d92490bdcf 100644 --- a/example/01_gemm/gemm_xdl_int8.cpp +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -80,8 +80,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle 4>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp index 7b74b40d328..bf78cc87e06 100644 --- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp +++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp @@ -72,8 +72,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device:: 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -using ReferenceConvBwdWeightInstance = ck::tensor_operation::host:: - ReferenceConvBwdWeight; +using ReferenceConvBwdWeightInstance = + ck::tensor_operation::host::ReferenceConvBwdWeight; int main(int argc, char* argv[]) { diff --git 
a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp index ca3b58bd00a..c125d6710d4 100644 --- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp +++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp @@ -83,8 +83,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle 16>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 4e9bdbb2f5b..30d852bccda 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -56,7 +56,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp index 0346075c368..7142e8a1235 100644 --- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp +++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp @@ -52,7 +52,7 @@ using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_ // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index cb3ca891324..6728e2b89ac 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -387,17 +387,17 @@ struct mfma_type template <> struct mfma_type { - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; //group_size * num_groups_per_blk; + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk; static constexpr index_t num_threads_per_blk = 16; static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; //wave_size / num_threads_per_blk; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = true; + static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = true; template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const @@ -690,8 +690,9 @@ struct XdlopsGemm template __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { - static_assert(is_same::value ||is_same::value || is_same::value || - is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || 
is_same::value || + is_same::value, "base base_type must be double, float, half, bfloat16, and int8_t!"); static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 3601fafc281..e659f2d0e82 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -13,6 +13,7 @@ namespace host { template @@ -55,12 +56,12 @@ struct ReferenceGemm : public device::BaseOperator auto f_mk_kn_mn = [&](auto m, auto n) { const int K = arg.a_m_k_.mDesc.GetLengths()[1]; - float v_acc = 0; + AccDataType v_acc = 0; for(int k = 0; k < K; ++k) { - float v_a; - float v_b; + AccDataType v_a; + AccDataType v_b; arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k))); arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n))); @@ -68,7 +69,7 @@ struct ReferenceGemm : public device::BaseOperator v_acc += v_a * v_b; } - float v_c; + AccDataType v_c; arg.c_element_op_(v_c, v_acc); diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index f2661888442..0faac583753 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -85,6 +85,7 @@ namespace profiler { template @@ -457,8 +458,14 @@ void profile_gemm_impl(int do_verification, bf16_to_f32_(b_k_n, b_f32_k_n); bf16_to_f32_(c_m_n_device_result, c_m_n_device_f32_result); - using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + using ReferenceGemmInstance = + ck::tensor_operation::host::ReferenceGemm; auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); @@ -490,6 +497,7 @@ void profile_gemm_impl(int do_verification, ck::tensor_operation::host::ReferenceGemm; diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index e103aeff99e..ca4f4f63fad 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -127,8 +127,13 @@ bool profile_gemm_reduce_impl(int do_verification, if(do_verification) { - using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index cced480c36c..8bb87f97632 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -43,6 +43,7 @@ namespace profiler { template @@ -270,6 +271,7 @@ void profile_grouped_gemm_impl(int do_verification, ck::tensor_operation::host::ReferenceGemm; diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp index 7a72be2d8e9..ec70bb49727 100644 --- a/profiler/src/profile_gemm.cpp +++ b/profiler/src/profile_gemm.cpp @@ -68,6 +68,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -88,6 +89,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -108,6 +110,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -128,6 +131,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -146,6 +150,7 @@ int profile_gemm(int argc, char* argv[]) else if(data_type == 
GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) { ck::profiler::profile_gemm_impl( @@ -248,6 +257,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -268,6 +278,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -288,6 +299,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -308,6 +320,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -328,6 +341,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -348,6 +362,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -368,6 +383,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp index d7669bb2425..b2c15804ccb 100644 --- a/test/gemm/gemm_fp16.cpp +++ b/test/gemm/gemm_fp16.cpp @@ -52,9 +52,10 @@ void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( int main() { - using ADataType = ck::half_t; - using BDataType = ck::half_t; - using CDataType = ck::half_t; + using ADataType = ck::half_t; + using BDataType = ck::half_t; + using CDataType = ck::half_t; + using AccDataType = float; using RowMajor = ck::tensor_layout::gemm::RowMajor; using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; @@ -74,6 +75,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, RowMajor, RowMajor, @@ -96,6 +98,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, ColumnMajor, RowMajor, @@ -118,6 +121,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, RowMajor, RowMajor, @@ -142,6 +146,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, ColumnMajor, RowMajor, diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp index 6c86085f3b8..72ff08f427f 100644 --- a/test/gemm/gemm_fp32.cpp +++ b/test/gemm/gemm_fp32.cpp @@ -53,12 +53,13 @@ void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector gemmPtrs; @@ -75,6 +76,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, RowMajor, RowMajor, @@ -97,6 +99,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, ColumnMajor, RowMajor, @@ -119,6 +122,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, RowMajor, RowMajor, @@ -141,6 +145,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, ColumnMajor, RowMajor, diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp index 864fca8df4d..cc38da757a0 100644 --- a/test/gemm/gemm_int8.cpp +++ b/test/gemm/gemm_int8.cpp @@ -46,12 +46,13 @@ void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances( int main() { - using ADataType = int8_t; - using BDataType = int8_t; - using CDataType = int8_t; + using ADataType = int8_t; + using BDataType = int8_t; + using CDataType = int8_t; + using AccDataType = int32_t, - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; + using RowMajor = ck::tensor_layout::gemm::RowMajor; + using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; std::vector gemmPtrs; bool res = true; @@ -65,6 +66,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, RowMajor, RowMajor, @@ -83,6 +85,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, ColumnMajor, RowMajor, @@ -101,6 +104,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, 
RowMajor,
 RowMajor,
 RowMajor,
@@ -119,6 +123,7 @@ int main()
 ADataType,
 BDataType,
 CDataType,
+ AccDataType,
 RowMajor,
 ColumnMajor,
 RowMajor,
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index 08c8edfb94b..f8de85ea883 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -106,6 +106,7 @@ template ;
@@ -306,6 +308,7 @@ struct TestGemmBF16
 // use fp32 host kernel to verify bf16 device kernel
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmFromDevice(c_device_tensors[i].mData.data());
- using ReferenceGemmInstance = ck::tensor_operation::host::
- ReferenceGemm;
+ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
 auto ref_gemm = ReferenceGemmInstance{};
 auto ref_invoker = ref_gemm.MakeInvoker();

From 10a2ae2f76405bd63c58e451da867ce87766ca2d Mon Sep 17 00:00:00 2001
From: qinletao
Date: Thu, 21 Apr 2022 02:11:29 +0000
Subject: [PATCH 04/32] change init data

---
 example/01_gemm/gemm_xdl_fp64.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 579dd186ef8..b19dc6266c2 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -135,8 +135,8 @@ int main(int argc, char* argv[])
 b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
 break;
 default:
- a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
- b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+ a_m_k.GenerateTensorValue(GeneratorTensor_1{1});
+ b_k_n.GenerateTensorValue(GeneratorTensor_1{1});
 }

 DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());

From 873d09585a0572bdf11ddbb7794e493d632a5034 Mon Sep 17 00:00:00 2001
From: ltqin
Date: Thu, 21 Apr 2022 13:21:24 +0800
Subject: [PATCH 05/32] fix M N PerXdlops

---
 example/01_gemm/gemm_xdl_fp64.cpp | 9 ++++++++-
 .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 20 +++++++++----------
 include/ck/utility/amd_xdlops.hpp | 7 ++++++-
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index b19dc6266c2..6a323e4fc25 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -50,7 +50,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
+ < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
 // clang-format on
 using ReferenceGemmInstance =
ck::tensor_operation::host:: @@ -198,6 +198,13 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); + if(0) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + } ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 6728e2b89ac..4d9a40e9547 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -387,17 +387,17 @@ struct mfma_type template <> struct mfma_type { - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk; + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk; static constexpr index_t num_threads_per_blk = 16; static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = true; + static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = true; template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const @@ -413,7 +413,7 @@ struct MfmaSelector static constexpr auto GetMfma(); template <> - static constexpr auto GetMfma() + static constexpr auto GetMfma() { return MfmaInstr::mfma_f64_16x16x4f64; } diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 7ab6f7f7cce..00014dff2ad 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -298,13 +298,18 @@ template struct intrin_mfma_f64_16x16x4f64; template <> -struct intrin_mfma_f64_16x16x4f64<32, 32> +struct intrin_mfma_f64_16x16x4f64<16, 16> { template __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c) { +#ifdef __gxf90a__ reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); +#else + reg_c.template AsType()(Number<0>{}) = {reg_a, reg_a, reg_b, reg_b}; + +#endif } }; } // namespace ck From eff586ac3a27b0c4ced4bf69aa39119638f5bb96 Mon Sep 17 00:00:00 2001 From: qinletao Date: Thu, 21 Apr 2022 14:58:36 +0000 Subject: [PATCH 06/32] fix ifdef --- example/01_gemm/gemm_xdl_fp64.cpp | 12 ++++++++---- include/ck/utility/amd_xdlops.hpp | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index 6a323e4fc25..fe2bab01cd3 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -50,7 +50,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| 
Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
+ < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
 using ADataType = double;
 using BDataType = double;
 using CDataType = double;
 using AccDataType = double;
@@ -135,8 +135,10 @@ int main(int argc, char* argv[])
 b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
 break;
 default:
+ //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
 a_m_k.GenerateTensorValue(GeneratorTensor_1{1});
- b_k_n.GenerateTensorValue(GeneratorTensor_1{1});
+ //b_k_n.GenerateTensorValue(GeneratorTensor_1{1});
 }

 DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
@@ -198,13 +200,15 @@ int main(int argc, char* argv[])

 ref_invoker.Run(ref_argument);

- if(0)
+ #if 1
 {
 LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
 LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
- LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+ LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl;
+ LogRangeAsType(std::cout << "c_host: ", c_m_n_host_result.mData, ",") << std::endl;
 }
+#endif
 ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
 }

diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index 00014dff2ad..e4f24ee2d5a 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -303,7 +303,7 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
 template
 __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
 {
-#ifdef __gxf90a__
+#ifdef __gfx90a__
 reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
 reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0);
 #else

From d443a7a675b1c84a88e15e9fd6cce8d30f76f0d1 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Thu, 21 Apr 2022 16:16:42 +0000
Subject: [PATCH 07/32] add comparison config

---
 example/01_gemm/gemm_xdl_fp64.cpp | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index fe2bab01cd3..d3a42f45f76 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -23,16 +23,13 @@ using S = ck::Sequence;

 using F64 = double;
 using F32 = float;
+using F16 = ck::half_t;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-using ADataType = double;
-using BDataType = double;
-using CDataType = double;
-using AccDataType = double; using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; @@ -50,7 +47,19 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; +#if 1 + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; +using ADataType = double; +using BDataType = double; +using CDataType = double; +using AccDataType = double; +#else + < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; +using ADataType = float; +using BDataType = float; +using CDataType = float; +using AccDataType = float; +#endif // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: @@ -135,8 +144,8 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); //b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } @@ -205,7 +214,7 @@ int main(int argc, char* argv[]) LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_host: ", c_m_n_host_result.mData, ",") + LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl; } #endif From dcdbed2a47a7cff9ded53df524965d26aaa5a5af Mon Sep 17 00:00:00 2001 From: qinletao Date: Sat, 23 Apr 2022 04:32:16 +0000 Subject: [PATCH 08/32] add conv fwd example --- example/01_gemm/gemm_xdl_fp64.cpp | 19 +- example/09_convnd_fwd/CMakeLists.txt | 1 + example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 346 ++++++++++++++++++ 3 files changed, 356 insertions(+), 10 deletions(-) create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index d3a42f45f76..625d11d9459 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -30,7 +30,6 @@ using Col = ck::tensor_layout::gemm::ColumnMajor; using 
PassThrough = ck::tensor_operation::element_wise::PassThrough; - using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; @@ -48,7 +47,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #if 1 - < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; using ADataType = double; using BDataType = double; using CDataType = double; @@ -144,10 +143,10 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + // a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - //b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + // b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); @@ -209,12 +208,12 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - #if 1 +#if 0 { - LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl; } #endif diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index fd6d11d9ff2..a142074696c 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -1,3 +1,4 @@ add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp) add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp) add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp) +add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp) diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp new file mode 100644 index 00000000000..e933dd3dc56 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "conv_fwd_util.hpp" +#include "device.hpp" +#include "device_tensor.hpp" +#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "element_wise_operation.hpp" +#include 
"host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "reference_conv_fwd.hpp" +#include "tensor_layout.hpp" + +namespace { + +using InDataType = double; +using WeiDataType = double; +using OutDataType = double; +using AccDataType = double; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +using DeviceConvFwdBasePtr = + ck::tensor_operation::device::DeviceConvFwdPtr; + +template +using DeviceConvNDFwdInstance = ck::tensor_operation::device:: + DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + // clang-format off + InDataType, // + WeiDataType, // + OutDataType, // + AccDataType, // + InElementOp, // Input Elementwise Operation + WeiElementOp, // Weights Elementwise Operation + OutElementOp, // Output Elementwise Operation + ConvFwdDefault, // ConvForwardSpecialization + NumDimSpatial, // NumDimSpatial + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 4, // K0PerBlock + 2, // K1 + 16, // MPerXDL + 16, // NPerXDL + 4, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 2, // ABlockTransferSrcScalarPerVector + 2, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 2, // BBlockTransferSrcScalarPerVector + 2, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockTransferAddExtraN + 7, // CThreadTransferSrcDstVectorDim + 1>; // CThreadTransferDstScalarPerVector +// clang-format on + +template +using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 3: { + return std::make_unique>(); + } + case 2: { + return std::make_unique>(); + } + case 1: { + return std::make_unique>(); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +void print_use_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4: N spatial dimensions (default 2)\n" + << "Following arguments (depending on number of spatial dims):\n" + << " N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" + << std::endl; +} + +ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[]) +{ + // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) + int conv_args = 3 + num_dim_spatial * 6; + int cmdline_nargs = conv_args + 5; + if(cmdline_nargs != argc) + { + print_use_msg(); + exit(0); + } + + ck::utils::conv::ConvParams params; + int arg_idx = 5; + + params.num_dim_spatial = num_dim_spatial; + params.N = std::stoi(argv[arg_idx++]); + params.K = 
std::stoi(argv[arg_idx++]); + params.C = std::stoi(argv[arg_idx++]); + + params.filter_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.input_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_strides.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_dilations.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + params.input_left_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + params.input_right_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + } + + return params; +} + +} // anonymous namespace + +int main(int argc, char* argv[]) +{ + using namespace ck::utils::conv; + + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + int num_dim_spatial = 2; + + ck::utils::conv::ConvParams params; + + if(argc >= 5) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + num_dim_spatial = std::stoi(argv[4]); + } + + if(argc >= 6) + { + params = parse_conv_params(num_dim_spatial, argc, argv); + } + + std::vector input_dims{static_cast(params.N), + static_cast(params.C)}; + input_dims.insert(std::end(input_dims), + std::begin(params.input_spatial_lengths), + std::end(params.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params.K), + static_cast(params.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params.filter_spatial_lengths), + std::end(params.filter_spatial_lengths)); + + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + std::vector output_dims{static_cast(params.N), + static_cast(params.K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + Tensor weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + Tensor host_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + Tensor device_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weights: " << weights.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + // input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + input.GenerateTensorValue(GeneratorTensor_1{1}); + // weights.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem 
out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + + // do GEMM + auto conv = get_conv_instance(num_dim_spatial); + auto invoker = conv->MakeInvokerPointer(); + auto argument = + conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv->IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float ave_time = invoker->Run(argument.get(), nrepeat); + + std::size_t flop = get_flops( + params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = + get_btype(params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(do_verification) + { + auto verify_f = [&input, &weights, &host_output, ¶ms, &out_device_buf, &device_output]( + const auto& ref_conv) { + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weights, + host_output, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + out_device_buf.FromDevice(device_output.mData.data()); + ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + }; + + switch(num_dim_spatial) + { + case 3: { + auto ref_conv = ReferenceConvNDFwdInstance<3>(); + verify_f(ref_conv); + break; + } + case 2: { + auto ref_conv = ReferenceConvNDFwdInstance<2>(); + verify_f(ref_conv); + break; + } + case 1: { + auto ref_conv = ReferenceConvNDFwdInstance<1>(); + verify_f(ref_conv); + break; + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } + } +} From ef77a1cacf25da5d385e2cff92eb1eea8c59f19f Mon Sep 17 00:00:00 2001 From: qinletao Date: Sun, 24 Apr 2022 10:54:02 +0000 Subject: [PATCH 09/32] format log out --- example/01_gemm/gemm_xdl_fp64.cpp | 34 ++++++++++++++----- example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 8 ++--- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 2 +- .../cpu/reference_gemm.hpp | 4 +-- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index 625d11d9459..bed6b9b1575 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -47,13 +47,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| 
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #if 1 - < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; using ADataType = double; using BDataType = double; using CDataType = double; using AccDataType = double; #else - < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; + < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; using ADataType = float; using BDataType = float; using CDataType = float; @@ -64,6 +64,23 @@ using AccDataType = float; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +template +std::ostream& void show_2d_matrix(std::ostream& os, Tensor& matrix) +{ + os << "[" << std::endl; + for(int x = 0; x < matrix.mDesc.GetLengths()[0]; x++) + { + os << "["; + for(int y = 0; y < matrix.mDesc.GetLengths()[1]; y++) + { + os << std::setw(4) << static_cast(matrix(x, y)); + } + os << "]" << std::endl; + } + os << "]"; + return os; +} + int main(int argc, char* argv[]) { bool do_verification = 0; @@ -144,8 +161,8 @@ int main(int argc, char* argv[]) break; default: // a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); // b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } @@ -208,13 +225,12 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); -#if 0 +#if 1 { - LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") - << std::endl; + show_2d_matrix(std::cout << "a : ", a_m_k) << std::endl; + show_2d_matrix(std::cout << "b: ", b_k_n) << std::endl; + show_2d_matrix(std::cout << "c_device: ", c_m_n_device_result) << std::endl; + show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl; } #endif ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index e933dd3dc56..8c4008b36d1 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -49,14 +49,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device:: ConvFwdDefault, // ConvForwardSpecialization NumDimSpatial, // NumDimSpatial 256, // BlockSize - 128, // MPerBlock - 128, // NPerBlock + 64, // MPerBlock + 64, // NPerBlock 4, // K0PerBlock 2, // K1 16, // MPerXDL 16, // NPerXDL - 4, // MXdlPerWave - 4, // NXdlPerWave + 2, // MXdlPerWave + 2, // NXdlPerWave S<4, 64, 1>, // 
ABlockTransferThreadClusterLengths_K0_M_K1
 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
 S<1, 0, 2>, // ABlockTransferSrcAccessOrder
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 4d9a40e9547..54916c2b433 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -25,7 +25,7 @@ enum struct MfmaInstr
     mfma_f32_16x16x8bf16,
     mfma_i32_32x32x8i8,
     mfma_i32_16x16x16i8,
-    mfma_f64_16x16x4f64,
+    mfma_f64_16x16x4f64
 };

 template
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index e659f2d0e82..1326583bf62 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -63,8 +63,8 @@ struct ReferenceGemm : public device::BaseOperator
                     AccDataType v_a;
                     AccDataType v_b;

-                    arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k)));
-                    arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n)));
+                    arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k)));
+                    arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n)));

                     v_acc += v_a * v_b;
                 }

From 7e8e54dead53f981b7e562c1eed6e80cb8702aac Mon Sep 17 00:00:00 2001
From: qinletao
Date: Sun, 24 Apr 2022 12:36:26 +0000
Subject: [PATCH 10/32] change rc matrix register layout

---
 example/01_gemm/gemm_xdl_fp64.cpp                    | 2 +-
 include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index bed6b9b1575..54a64d6eb5b 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -65,7 +65,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm;

 template
-std::ostream& void show_2d_matrix(std::ostream& os, Tensor& matrix)
+std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix)
 {
     os << "[" << std::endl;
     for(int x = 0; x < matrix.mDesc.GetLengths()[0]; x++)
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 54916c2b433..a39b795818e 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -387,8 +387,8 @@ struct mfma_type
 template <>
 struct mfma_type
 {
-    static constexpr index_t group_size          = 4;
-    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t group_size          = 1;
+    static constexpr index_t num_groups_per_blk  = 4;
     static constexpr index_t num_regs_per_blk    = 4; // group_size * num_groups_per_blk;
     static constexpr index_t num_threads_per_blk = 16;
     static constexpr index_t wave_size           = 64;

From 3991a1c132cd2cd51bbe074a02b0b5a4da6bc6b2 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Mon, 25 Apr 2022 03:22:03 +0000
Subject: [PATCH 11/32] reorganize example

---
 example/01_gemm/gemm_xdl_fp64.cpp             | 32 +++++++++++++--------
 example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 12 +++----
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 54a64d6eb5b..94cad6c4756 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -46,7 +46,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise|
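
On the layout change in PATCH 10 above: the fp64 16x16x4 xdlops block still yields four accumulator values per lane, but swapping group_size and num_groups_per_blk changes which C rows those four registers land on. A standalone sketch of the index math, mirroring one reading of CK's k-reduction mapping (illustrative, not authoritative):

#include <cstdio>

int main()
{
    constexpr int group_size          = 1;  // after PATCH 10 (was 4)
    constexpr int num_groups_per_blk  = 4;  // after PATCH 10 (was 1)
    constexpr int num_threads_per_blk = 16; // columns covered by one block
    constexpr int num_input_blks      = 64 / num_threads_per_blk; // 4 K-slices per wave

    for(int lane = 0; lane < 64; lane += 21) // a few sample lanes
    {
        const int blk_id = lane / num_threads_per_blk;
        const int blk_td = lane % num_threads_per_blk;
        for(int g = 0; g < num_groups_per_blk; ++g)
            for(int i = 0; i < group_size; ++i)
            {
                const int row = g * num_input_blks * group_size + blk_id * group_size + i;
                std::printf("lane %2d reg %d -> C(%2d, %2d)\n",
                            lane, g * group_size + i, row, blk_td);
            }
    }
    return 0;
}

With the old constants this mapping gives row = 4 * blk_id + i (rows contiguous per lane group); with the new ones it gives row = 4 * g + blk_id, i.e. each register steps four rows, which appears to be the point of re-describing the C-matrix register layout.
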
Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if 1 +#if 0 < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; using ADataType = double; using BDataType = double; @@ -59,10 +59,15 @@ using BDataType = float; using CDataType = float; using AccDataType = float; #endif -// clang-format on + // clang-format on -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; template std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix) @@ -88,13 +93,13 @@ int main(int argc, char* argv[]) int nrepeat = 5; // GEMM shape - ck::index_t M = 3840; - ck::index_t N = 4096; - ck::index_t K = 4096; + ck::index_t M = 32; + ck::index_t N = 32; + ck::index_t K = 4; - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; + ck::index_t StrideA = 4; + ck::index_t StrideB = 4; + ck::index_t StrideC = 32; if(argc == 4) { @@ -144,6 +149,7 @@ int main(int argc, char* argv[]) Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + std::cout << "data type: " << typeid(ADataType{}).name() << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; @@ -160,10 +166,8 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - // a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - // b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); @@ -225,7 +229,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); -#if 1 +#if 0 { show_2d_matrix(std::cout << "a : ", a_m_k) << std::endl; show_2d_matrix(std::cout << "b: ", b_k_n) << std::endl; diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index 8c4008b36d1..cdda09db42e 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -49,14 +49,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device:: ConvFwdDefault, // ConvForwardSpecialization NumDimSpatial, // NumDimSpatial 256, // BlockSize - 64, // MPerBlock - 64, // NPerBlock + 128, // MPerBlock + 128, // NPerBlock 4, // K0PerBlock 2, // K1 16, // MPerXDL 16, // NPerXDL - 2, // MXdlPerWave - 2, // NXdlPerWave + 4, // MXdlPerWave + 4, // NXdlPerWave S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<1, 0, 2>, // 
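
The shrunken defaults above (M = N = 32, K = 4, both operands filled by GeneratorTensor_1{1} in the default init path) make a convenient bring-up check: every element of C must come out equal to K. A host-side statement of that expectation:

#include <cassert>

int main()
{
    const int K = 4; // depth of the tiny example shape
    double c = 0.0;
    for(int k = 0; k < K; ++k)
        c += 1.0 * 1.0; // a(m, k) * b(k, n) with the all-ones generators
    assert(c == K);     // expected value of every c(m, n)
    return 0;
}
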
ABlockTransferThreadClusterArrangeOrder
 S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -241,10 +241,8 @@ int main(int argc, char* argv[])
         weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
         break;
     default:
-        // input.GenerateTensorValue(GeneratorTensor_2{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2{-5, 5});
         input.GenerateTensorValue(GeneratorTensor_1{1});
-        // weights.GenerateTensorValue(GeneratorTensor_1{1});
+        weights.GenerateTensorValue(GeneratorTensor_1{1});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());

From ea970541d0e125308a068d712974e71afce25d81 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Mon, 25 Apr 2022 03:56:53 +0000
Subject: [PATCH 12/32] reorganize example 2

---
 example/01_gemm/gemm_xdl_fp64.cpp | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 94cad6c4756..303c0e3b3b2 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -25,6 +25,11 @@ using F64 = double;
 using F32 = float;
 using F16 = ck::half_t;

+using ADataType   = double;
+using BDataType   = double;
+using CDataType   = double;
+using AccDataType = double;
+
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

@@ -46,28 +51,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-#if 0
     < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
-using ADataType = double;
-using BDataType = double;
-using CDataType = double;
-using AccDataType = double;
-#else
-    < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
-using ADataType = float;
-using BDataType = float;
-using CDataType = float;
-using AccDataType = float;
-#endif
-    // clang-format on
+// clang-format on

-    using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm;

 template
 std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix)

From b615d65336d6c48a39234697da5afd38ccd68c9f Mon Sep 17 00:00:00 2001
From: qinletao
Date: Mon, 25 Apr 2022 09:22:51 +0000
Subject: [PATCH 13/32] format after merging develop

---
 library/src/utility/conv_fwd_util.cpp | 35 +++++++++++++--------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/library/src/utility/conv_fwd_util.cpp b/library/src/utility/conv_fwd_util.cpp
index fde2caa56b3..16584503887 100644
---
a/library/src/utility/conv_fwd_util.cpp
+++ b/library/src/utility/conv_fwd_util.cpp
@@ -37,16 +37,16 @@ std::size_t get_flops(ck::index_t N,
 }

 ConvParams::ConvParams()
-    : num_dim_spatial(2),
-      N(128),
-      K(256),
-      C(192),
-      filter_spatial_lengths(2, 3),
-      input_spatial_lengths(2, 71),
-      conv_filter_strides(2, 2),
-      conv_filter_dilations(2, 1),
-      input_left_pads(2, 1),
-      input_right_pads(2, 1)
+    : num_dim_spatial(2),
+      N(128),
+      K(256),
+      C(192),
+      filter_spatial_lengths(2, 3),
+      input_spatial_lengths(2, 71),
+      conv_filter_strides(2, 2),
+      conv_filter_dilations(2, 1),
+      input_left_pads(2, 1),
+      input_right_pads(2, 1)
 {
 }

@@ -77,9 +77,9 @@ ConvParams::ConvParams(ck::index_t n_dim,
        conv_filter_dilations.size() != num_dim_spatial ||
        input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
     {
-        throw(std::runtime_error(
-            "ConvParams::GetOutputSpatialLengths: "
-            "parameter size is different from number of declared dimensions!"));
+        throw(
+            std::runtime_error("ConvParams::GetOutputSpatialLengths: "
+                               "parameter size is different from number of declared dimensions!"));
     }
 }

@@ -91,9 +91,9 @@ std::vector ConvParams::GetOutputSpatialLengths() const
        conv_filter_dilations.size() != num_dim_spatial ||
        input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
     {
-        throw(std::runtime_error(
-            "ConvParams::GetOutputSpatialLengths: "
-            "parameter size is different from number of declared dimensions!"));
+        throw(
+            std::runtime_error("ConvParams::GetOutputSpatialLengths: "
+                               "parameter size is different from number of declared dimensions!"));
     }

     std::vector out_spatial_len(num_dim_spatial, 0);
     {
         // XEff = (X - 1) * conv_dilation_w + 1;
         // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-        const ck::index_t idx_eff =
-            (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
+        const ck::index_t idx_eff = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
         out_spatial_len[i] =
             (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
                 conv_filter_strides[i] +

From 6bf08bb91a7f681e6d2f8c74161fa8b7a9921128 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Tue, 26 Apr 2022 00:16:35 +0000
Subject: [PATCH 14/32] fix impl call by adding acc data type

---
 profiler/src/profile_grouped_gemm.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 88a2a8f855d..efa4b9e0f89 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         ck::profiler::profile_grouped_gemm_impl Date: Tue, 26 Apr 2022 02:29:58 +0000
Subject: [PATCH 15/32] fix lost ';'

---
 test/gemm/gemm_fp32.cpp | 6 +++---
 test/gemm/gemm_int8.cpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp
index 72ff08f427f..89701511dc6 100644
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -56,10 +56,10 @@ int main()
     using ADataType   = float;
     using BDataType   = float;
     using CDataType   = float;
-    using AccDataType = float,
+    using AccDataType = float;

-    using RowMajor = ck::tensor_layout::gemm::RowMajor;
-    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor
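
A worked instance of the output-length formula kept in the reflowed code above, using the ConvParams defaults from the same file (Wi = 71, X = 3, stride 2, dilation 1, pads 1):

#include <cstdio>

int main()
{
    const int Wi = 71, X = 3, stride = 2, dilation = 1, left_pad = 1, right_pad = 1;
    const int x_eff = (X - 1) * dilation + 1;                           // XEff = 3
    const int Wo    = (Wi + left_pad + right_pad - x_eff) / stride + 1; // Wo = 36
    std::printf("Wo = %d\n", Wo);
    return 0;
}
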
= ck::tensor_layout::gemm::ColumnMajor;

     bool res = true;
     std::vector gemmPtrs;
diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp
index cc38da757a0..de18dee94ff 100644
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -49,10 +49,10 @@ int main()
     using ADataType   = int8_t;
     using BDataType   = int8_t;
     using CDataType   = int8_t;
-    using AccDataType = int32_t,
+    using AccDataType = int32_t;

-    using RowMajor = ck::tensor_layout::gemm::RowMajor;
-    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;

     std::vector gemmPtrs;
     bool res = true;

From bc7b53348b81a25b1122480739de63778a4cda08 Mon Sep 17 00:00:00 2001
From: ltqin
Date: Tue, 26 Apr 2022 16:25:33 +0800
Subject: [PATCH 16/32] add compiler warning

---
 include/ck/utility/amd_xdlops.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index e4f24ee2d5a..5f305a49297 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -307,8 +307,10 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
         reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
             reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0);
 #else
-        reg_c.template AsType()(Number<0>{}) = {reg_a, reg_a, reg_b, reg_b};
-
+#pragma message "this GPU card doesn't support the mfma_f64_16x16x4f64 instruction!"
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
 #endif
     }
 };

From 85ef3f285864bd6843e32ae7badd1a66c37dca17 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Fri, 29 Apr 2022 04:40:54 +0000
Subject: [PATCH 17/32] change example tuning parameters

---
 example/01_gemm/gemm_xdl_fp64.cpp | 27 ++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 303c0e3b3b2..eb0776b3620 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -51,11 +51,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+#if 0
     < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
-// clang-format on
+#else
+    < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
+#endif
+    // clang-format on

-using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm;
+    using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;

 template
 std::ostream&
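
The #else branch added in PATCH 16 above follows a common device-code pattern: compile the builtin only for targets that have it, and consume the operands otherwise so unused-parameter warnings stay quiet (CK's `ignore` is assumed here to behave like std::ignore). A host-compilable sketch of the same shape:

#include <tuple>

template <typename A, typename B, typename C>
void run_mfma(const A& a, const B& b, C& c)
{
#if defined(__gfx90a__)
    c += a * b; // stand-in for the __builtin_amdgcn_mfma_f64_16x16x4f64 path
#else
    std::ignore = a; // consume the operands, emit nothing
    std::ignore = b;
    std::ignore = c;
#endif
}

int main()
{
    double a = 2.0, b = 3.0, c = 0.0;
    run_mfma(a, b, c);
    return 0;
}
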
show_2d_matrix(std::ostream& os, Tensor& matrix) @@ -81,13 +90,13 @@ int main(int argc, char* argv[]) int nrepeat = 5; // GEMM shape - ck::index_t M = 32; - ck::index_t N = 32; - ck::index_t K = 4; + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; - ck::index_t StrideA = 4; - ck::index_t StrideB = 4; - ck::index_t StrideC = 32; + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; if(argc == 4) { From 4a77f4536cdf07883a2d0c6edbe19193bc685dba Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 17:52:15 +0800 Subject: [PATCH 18/32] add test for fp64 --- .../gpu/gemm/CMakeLists.txt | 4 ++++ test/gemm/CMakeLists.txt | 4 ++++ test/gemm/gemm_util.hpp | 14 ++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index 5f057adcc5f..74527b1e265 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -1,5 +1,9 @@ # device_gemm_instance set(DEVICE_GEMM_INSTANCE_SOURCE + device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp; + device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp; + device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp; + device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp; device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp; device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp; device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp; diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt index 83b3c1e2e30..6bb4d3c723d 100644 --- a/test/gemm/CMakeLists.txt +++ b/test/gemm/CMakeLists.txt @@ -1,3 +1,7 @@ +add_test_executable(test_gemm_fp64 gemm_fp64.cpp) +target_link_libraries(test_gemm_fp64 PRIVATE host_tensor) +target_link_libraries(test_gemm_fp64 PRIVATE device_gemm_instance) + add_test_executable(test_gemm_fp32 gemm_fp32.cpp) target_link_libraries(test_gemm_fp32 PRIVATE host_tensor) target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index f8de85ea883..1be24e0fba5 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -140,12 +140,12 @@ struct TestGemm Tensor c_m_n_device_result( f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - auto f_generate_tensor_value = [](auto desc, auto type) { + auto f_generate_tensor_value = [](auto& desc, auto type) { using dataType = decltype(type); - if(std::is_same::value) + if(std::is_same::value || std::is_same::value) { - desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); } else { @@ -161,6 +161,7 @@ struct TestGemm auto operator()(DeviceGemmPtr_& gemmPtr) { + std::cout << "data type: " << typeid(ADataType{}).name() << std::endl; std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name << ", CLayout = " << CLayout{}.name << std::endl; std::cout << gemmPtr->GetTypeString() << std::endl; @@ -202,7 +203,12 @@ struct TestGemm // Assert bool res = false; - if(std::is_same::value) + if(std::is_same::value) + { + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; + } + else if(std::is_same::value) { res = ck::utils::check_err(c_device.mData, c_host.mData); std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; From 75ef75beb96db75735046978572b5b9654f7e16d Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 17:55:12 +0800 Subject: [PATCH 19/32] add instance --- ...gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp | 50 +++++++++++++++++ ...gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp | 50 +++++++++++++++++ ...gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp | 50 +++++++++++++++++ ...gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp | 55 +++++++++++++++++++ 4 files changed, 205 insertions(+) create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp new file mode 100644 index 00000000000..7b19e7c38c2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp @@ -0,0 +1,50 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_f64_f64_f64_km_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 
4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_kn_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp new file mode 100644 index 00000000000..d903030cc49 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp @@ -0,0 +1,50 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_f64_f64_f64_km_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + 
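
Reading the file names in these instance sources: the two letters per operand give its storage order, so km_kn_mn is a[k, m] * b[k, n] = c[m, n] (A column-major, B row-major), matching each file's leading comment. A small sketch of the addressing that implies, assuming the usual packed-stride convention (StrideA = M for a[k, m]); this is illustrative only:

#include <cstdio>

int main()
{
    const int M = 4, K = 3, StrideA = M; // a[k, m]: K-major storage of an MxK matrix
    auto at = [&](int m, int k) { return m + k * StrideA; };
    std::printf("a(1, 2) -> linear index %d of %d\n", at(1, 2), M * K);
    return 0;
}
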
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_nk_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp new file mode 100644 index 00000000000..f021f018da0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp @@ -0,0 +1,50 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | 
XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp new file mode 100644 index 00000000000..e34022fca3e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp @@ -0,0 +1,55 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| 
BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, 
PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From 94fad45b0b21b60f420e55113fb335be7caea355 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 17:56:53 +0800 Subject: [PATCH 20/32] add test/gemm/gemm_fp64.cpp --- test/gemm/gemm_fp64.cpp | 136 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 test/gemm/gemm_fp64.cpp diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp new file mode 100644 index 00000000000..fa95d1d6380 --- /dev/null +++ b/test/gemm/gemm_fp64.cpp @@ -0,0 +1,136 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "gemm_util.hpp" +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_gemm.hpp" +#include "device_tensor.hpp" +#include "device_gemm_xdl.hpp" +#include "device_gemm_xdl_c_shuffle.hpp" +#include "element_wise_operation.hpp" +#include "reference_gemm.hpp" +#include "gemm_specialization.hpp" + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using DeviceGemmNoOpPtr = + ck::tensor_operation::device::DeviceGemmPtr; + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { +void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(std::vector&); +void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(std::vector&); +void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(std::vector&); +void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(std::vector&); + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +int main() +{ +#ifdef __gfx90a__ + using ADataType = double; + using BDataType = double; + using CDataType = double; + using AccDataType = double; + + using RowMajor = ck::tensor_layout::gemm::RowMajor; + using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; + + bool res = true; + std::vector gemmPtrs; + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } + + gemmPtrs.clear(); + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } + + gemmPtrs.clear(); + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } + + gemmPtrs.clear(); + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } +#else + bool res = true; +#endif + std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + return res ? 
0 : 1; +} From 1286072ae2a3a5fbfd887af7887ed8b873713cc3 Mon Sep 17 00:00:00 2001 From: qinletao Date: Fri, 29 Apr 2022 11:07:25 +0000 Subject: [PATCH 21/32] fix get name issue --- test/gemm/gemm_fp64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp index 28e5a453976..7714290b87d 100644 --- a/test/gemm/gemm_fp64.cpp +++ b/test/gemm/gemm_fp64.cpp @@ -63,7 +63,7 @@ inline std::string get_device_name() int main() { - if(get_device_name() != "gfx90a") + if(get_device_name().find("gfx90a") == std::string::npos) { std::cout << "TestGemm ..... SUCCESS" << std::endl; return 0; From 04397fa0736f7ae20c9b94b85a818a6528630e29 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 19:11:22 +0800 Subject: [PATCH 22/32] remove some tunning parameter --- include/ck/utility/amd_xdlops.hpp | 1 - ...gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp | 1 - ...gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp | 1 - ...gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp | 1 - ...gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp | 1 - test/gemm/gemm_fp64.cpp | 29 ++++++++++++++++--- 6 files changed, 25 insertions(+), 9 deletions(-) diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 5f305a49297..47ab8b3b4d9 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -307,7 +307,6 @@ struct intrin_mfma_f64_16x16x4f64<16, 16> reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); #else -#pragma message "this GPU card don't support mfma_f64_16x16x4f64 instruction!" ignore = reg_a; ignore = reg_b; ignore = reg_c; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp index 7b19e7c38c2..fdc85dfc710 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_km_kn_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, 
S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp index d903030cc49..e400cd9bbba 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_km_nk_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp index f021f018da0..2f9241b93b3 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp index e34022fca3e..537fe2bdae7 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp index fa95d1d6380..28e5a453976 100644 --- a/test/gemm/gemm_fp64.cpp +++ b/test/gemm/gemm_fp64.cpp @@ -41,9 +41,33 @@ 
void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(std::vector{}(gemmPtr); } -#else - bool res = true; -#endif std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 0 : 1; } From 345acac1e974881d3e8fc34ef0be850fe1bab1b0 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 19:47:56 +0800 Subject: [PATCH 23/32] fix conflict --- test/gemm/gemm_util.hpp | 343 ---------------------------------------- 1 file changed, 343 deletions(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 75f01a35a8e..1be24e0fba5 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -1,4 +1,3 @@ -<<<<<<< HEAD #ifndef GEMM_UTILS_HPP #define GEMM_UTILS_HPP @@ -348,345 +347,3 @@ struct TestGemmBF16 } // namespace gemm_util } // namespace ck #endif -======= -#ifndef GEMM_UTILS_HPP -#define GEMM_UTILS_HPP - -#include "check_err.hpp" -#include "config.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "reference_gemm.hpp" -#include "tensor_layout.hpp" - -namespace ck { -namespace gemm_util { - -struct GemmParams -{ - GemmParams() - : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0) - { - } - - ck::index_t M; - ck::index_t N; - ck::index_t K; - - ck::index_t StrideA; - ck::index_t StrideB; - ck::index_t StrideC; - - float alpha; - float beta; -}; - -template -void RunHostGEMM(const Tensor& A, - const Tensor& B, - Tensor& C, - AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) -{ - auto ref_gemm = GemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op); - - ref_invoker.Run(ref_argument); -} - -template -void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, - const ck::gemm_util::GemmParams& params, - const Tensor& A, - const Tensor& B, - Tensor& C, - AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) -{ - DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace()); - DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace()); - - a_m_k_device_buf.ToDevice(A.mData.data()); - b_k_n_device_buf.ToDevice(B.mData.data()); - - auto invoker_ptr = gemmPtr->MakeInvokerPointer(); - auto argument_ptr = - gemmPtr->MakeArgumentPointer(static_cast(a_m_k_device_buf.GetDeviceBuffer()), - static_cast(b_k_n_device_buf.GetDeviceBuffer()), - static_cast(c_m_n_device_buf.GetDeviceBuffer()), - params.M, - params.N, - params.K, - params.StrideA, - params.StrideB, - params.StrideC, - a_element_op, - b_element_op, - c_element_op); - - if(!gemmPtr->IsSupportedArgument(argument_ptr.get())) - { - throw std::runtime_error( - "wrong! 
device_gemm with the specified compilation parameters does " - "not support this GEMM problem"); - } - - invoker_ptr->Run(argument_ptr.get()); - c_m_n_device_buf.FromDevice(C.mData.data()); -} - -template -struct TestGemm -{ - auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params) - { - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); - } - }; - - Tensor a_m_k( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_k_n( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_m_n_host_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - Tensor c_m_n_device_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - auto f_generate_tensor_value = [](auto& desc, auto type) { - using dataType = decltype(type); - - if(std::is_same::value) - { - desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - } - else - { - desc.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - }; - - f_generate_tensor_value(a_m_k, ADataType{}); - f_generate_tensor_value(b_k_n, BDataType{}); - - return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result); - } - - auto operator()(DeviceGemmPtr_& gemmPtr) - { - std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name - << ", CLayout = " << CLayout{}.name << std::endl; - std::cout << gemmPtr->GetTypeString() << std::endl; - - // Arrange - ck::gemm_util::GemmParams params; - params.M = 1024; - params.N = 1024; - params.K = 1024; - params.StrideA = 1024; - params.StrideB = 1024; - params.StrideC = 1024; - - auto host_tensors = PrepareGemmTensor(params); - - const Tensor& a = std::get<0>(host_tensors); - const Tensor& b = std::get<1>(host_tensors); - Tensor& c_host = std::get<2>(host_tensors); - Tensor& c_device = std::get<3>(host_tensors); - - auto a_element_op = AElementwiseOperation{}; - auto b_element_op = BElementwiseOperation{}; - auto c_element_op = CElementwiseOperation{}; - - using ReferenceGemmInstance = - ck::tensor_operation::host::ReferenceGemm; - ck::gemm_util::RunHostGEMM( - a, b, c_host, a_element_op, b_element_op, c_element_op); - - // Act - ck::gemm_util::RunDeviceGEMM( - gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op); - - // Assert - bool res = false; - if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } - else if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } - else if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; - } - - return res; - } -}; - -template -struct TestGemmBF16 -{ - using BF16 = ck::bhalf_t; - - auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params) - { - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); - } - }; - - // use fp32 host kernel to verify bf16 device kernel - Tensor a_m_k_bf16( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_k_n_bf16( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_m_n_device_bf16( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - Tensor a_m_k_fp32( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_k_n_fp32( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_m_n_host_fp32( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - Tensor c_m_n_device_fp32( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - - bf16_to_f32_(a_m_k_bf16, a_m_k_fp32); - bf16_to_f32_(b_k_n_bf16, b_k_n_fp32); - - return std::make_tuple(a_m_k_bf16, - b_k_n_bf16, - c_m_n_device_bf16, - a_m_k_fp32, - b_k_n_fp32, - c_m_n_host_fp32, - c_m_n_device_fp32); - } - - auto operator()(DeviceGemmPtr_& gemmPtr) - { - // Arrange - ck::gemm_util::GemmParams params; - params.M = 1024; - params.N = 1024; - params.K = 1024; - params.StrideA = 1024; - params.StrideB = 1024; - params.StrideC = 1024; - - auto host_tensors = PrepareGemmTensorBF16(params); - const Tensor& a_bf16 = std::get<0>(host_tensors); - const Tensor& b_bf16 = std::get<1>(host_tensors); - Tensor& c_device_bf16 = std::get<2>(host_tensors); - Tensor& a_fp32 = std::get<3>(host_tensors); - Tensor& b_fp32 = std::get<4>(host_tensors); - Tensor& c_host_fp32 = std::get<5>(host_tensors); - Tensor& c_device_fp32 = std::get<6>(host_tensors); - - auto a_element_op = AElementwiseOperation{}; - auto b_element_op = BElementwiseOperation{}; - auto c_element_op = CElementwiseOperation{}; - - // use fp32 host kernel to verify bf16 device kernel - using ReferenceGemmInstance = - ck::tensor_operation::host::ReferenceGemm; - ck::gemm_util::RunHostGEMM( - a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op); - - // Act - ck::gemm_util::RunDeviceGEMM(gemmPtr, - params, - a_bf16, - b_bf16, - c_device_bf16, - a_element_op, - b_element_op, - c_element_op); - - bf16_to_f32_(c_device_bf16, c_device_fp32); - - // Assert - bool res = ck::utils::check_err( - c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f); - std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; - - return res; - }; -}; - -} // namespace gemm_util -} // namespace ck -#endif ->>>>>>> develop From 58f4d82119e72da7765940b8b0a47a8c1a068344 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 20:32:23 +0800 Subject: [PATCH 24/32] format --- .../tensor_operation/gpu/device/device_batched_gemm_xdl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp index 56ec5a7f2c9..eda68234248 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp @@ -385,8 +385,8 @@ struct DeviceBatchedGemmXdl c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)}, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, compute_ptr_offset_of_batch_{a_grid_desc_k0_m_k1_.GetElementSpaceSize(), - b_grid_desc_k0_n_k1_.GetElementSpaceSize(), - c_grid_desc_m_n_.GetElementSpaceSize()}, + b_grid_desc_k0_n_k1_.GetElementSpaceSize(), + c_grid_desc_m_n_.GetElementSpaceSize()}, block_2_ctile_map_{}, M01_{M01}, N01_{N01}, From 579e8e7691256d66955f9fb8417a12289a2325ae Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 30 Apr 2022 00:16:31 +0000 Subject: [PATCH 25/32] use integer value for GEMM test --- test/gemm/gemm_util.hpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 5f657a543c3..17e954b7f2c 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -139,17 +139,10 @@ struct TestGemm Tensor c_m_n_device_result( f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - auto f_generate_tensor_value = [](auto& desc, auto type) { + auto f_generate_tensor_value = [](auto& tensor, auto type) { using dataType = decltype(type); - if(std::is_same::value) - { - desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - } - else - { - desc.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } + tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); }; f_generate_tensor_value(a_m_k, ADataType{}); From 9e8cb769bfd72c4c043c5cd9b799e4fa935d3eb5 Mon Sep 17 00:00:00 2001 From: qinletao Date: Sat, 30 Apr 2022 09:00:04 +0000 Subject: [PATCH 26/32] add acc data type --- test/gemm/gemm_util.hpp | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 17e954b7f2c..937ba0571f6 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -106,6 +106,7 @@ template ::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } - else if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; return res; } @@ -299,6 +289,7 @@ struct TestGemmBF16 // use fp32 host kernel to verify bf16 device kernel using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm Date: Sat, 30 Apr 2022 10:36:08 +0000 Subject: [PATCH 27/32] remove typeid because fp16 --- test/gemm/gemm_util.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 937ba0571f6..fee069b4ddd 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -154,7 +154,6 @@ struct TestGemm auto operator()(DeviceGemmPtr_& gemmPtr) { - std::cout << "Data type: " << typeid(CDataType{}).name() << std::endl; std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name << ", CLayout = " << CLayout{}.name << std::endl; std::cout << gemmPtr->GetTypeString() << std::endl; From 42b0321e113155fc4de20f12c03c92ed60e79ef1 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 02:56:02 +0000 Subject: [PATCH 28/32] fix streamconfig etc bug from merging develop --- example/01_gemm/gemm_xdl_fp64.cpp | 11 +-- example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 98 +++++++++---------- test/gemm/CMakeLists.txt | 6 +- test/gemm/gemm_util.hpp | 6 ++ .../gemm/{gemm_fp64.cpp => gemm_xdl_fp64.cpp} | 1 - 5 files changed, 63 insertions(+), 59 deletions(-) rename test/gemm/{gemm_fp64.cpp => gemm_xdl_fp64.cpp} (96%) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index eb0776b3620..ff05c3d5417 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -12,7 +12,6 @@ #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "device_gemm_xdl.hpp" -#include "device_gemm_xdl_c_shuffle.hpp" #include "device_gemm_xdl_cshuffle.hpp" #include "element_wise_operation.hpp" #include "reference_gemm.hpp" @@ -86,8 +85,8 @@ std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix) int main(int argc, char* argv[]) { bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + int init_method = 0; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -102,13 +101,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -201,7 +200,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index cdda09db42e..7149c4f2779 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -5,7 +5,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_tensor.hpp" #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" @@ -134,40 +134,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha ck::utils::conv::ConvParams params; int arg_idx = 5; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - 
-    params.C = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_ = std::stoi(argv[arg_idx++]);
+    params.K_ = std::stoi(argv[arg_idx++]);
+    params.C_ = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -181,7 +181,7 @@ int main(int argc, char* argv[])
 
     bool do_verification = 0;
     int init_method = 0;
-    int nrepeat = 5;
+    bool time_kernel = false;
 
     int num_dim_spatial = 2;
 
     ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
+        time_kernel = std::stoi(argv[3]);
 
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -199,21 +199,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
 
-    std::vector<ck::index_t> input_dims{static_cast<ck::index_t>(params.N),
-                                        static_cast<ck::index_t>(params.C)};
+    std::vector<ck::index_t> input_dims{static_cast<ck::index_t>(params.N_),
+                                        static_cast<ck::index_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<ck::index_t> filter_dims{static_cast<ck::index_t>(params.K),
-                                         static_cast<ck::index_t>(params.C)};
+    std::vector<ck::index_t> filter_dims{static_cast<ck::index_t>(params.K_),
+                                         static_cast<ck::index_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<ck::index_t> output_dims{static_cast<ck::index_t>(params.N),
-                                         static_cast<ck::index_t>(params.K)};
+    std::vector<ck::index_t> output_dims{static_cast<ck::index_t>(params.N_),
+                                         static_cast<ck::index_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
 
@@ -259,16 +259,16 @@ int main(int argc, char* argv[])
         conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                  params.N,
-                                  params.K,
-                                  params.C,
-                                  params.input_spatial_lengths,
-                                  params.filter_spatial_lengths,
+                                  params.N_,
+                                  params.K_,
+                                  params.C_,
+                                  params.input_spatial_lengths_,
+                                  params.filter_spatial_lengths_,
                                   output_spatial_lengths,
-                                  params.conv_filter_strides,
-                                  params.conv_filter_dilations,
-                                  params.input_left_pads,
-                                  params.input_right_pads,
+                                  params.conv_filter_strides_,
+                                  params.conv_filter_dilations_,
+                                  params.input_left_pads_,
+                                  params.input_right_pads_,
                                   InElementOp{},
                                   WeiElementOp{},
                                   OutElementOp{});
@@ -280,16 +280,16 @@ int main(int argc, char* argv[])
                                 "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
 
     std::size_t num_btype =
-        get_btype(params.N,
-                  params.C,
-                  params.K,
-                  params.input_spatial_lengths,
-                  params.filter_spatial_lengths,
+        get_btype(params.N_,
+                  params.C_,
+                  params.K_,
+                  params.input_spatial_lengths_,
+                  params.filter_spatial_lengths_,
                   output_spatial_lengths);
 
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -305,10 +305,10 @@ int main(int argc, char* argv[])
         auto ref_argument = ref_conv.MakeArgument(input,
                                                   weights,
                                                   host_output,
-                                                  params.conv_filter_strides,
-                                                  params.conv_filter_dilations,
-                                                  params.input_left_pads,
-                                                  params.input_right_pads,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
                                                   InElementOp{},
                                                   WeiElementOp{},
                                                   OutElementOp{});
diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt
index 014b9e5e0cf..6716bbb1b5c 100644
--- a/test/gemm/CMakeLists.txt
+++ b/test/gemm/CMakeLists.txt
@@ -1,7 +1,7 @@
 # GEMM XDL
-add_test_executable(test_gemm_fp64 gemm_fp64.cpp)
-target_link_libraries(test_gemm_fp64 PRIVATE host_tensor)
-target_link_libraries(test_gemm_fp64 PRIVATE device_gemm_instance)
+add_test_executable(test_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
+target_link_libraries(test_gemm_xdl_fp64 PRIVATE host_tensor)
+target_link_libraries(test_gemm_xdl_fp64 PRIVATE device_gemm_instance)
 
 add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp)
 target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index e95118351b1..94151f9eed7 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -217,6 +217,12 @@ struct TestGemm
             res = ck::utils::check_err(c_device.mData, c_host.mData);
             std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
         }
+        else if(std::is_same<CDataType, double>::value)
+        {
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
+            std::cout << (res ?
"SUCCESS" : "FAILURE") << std::endl; + } + return res; } diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_xdl_fp64.cpp similarity index 96% rename from test/gemm/gemm_fp64.cpp rename to test/gemm/gemm_xdl_fp64.cpp index 7714290b87d..db37211505d 100644 --- a/test/gemm/gemm_fp64.cpp +++ b/test/gemm/gemm_xdl_fp64.cpp @@ -15,7 +15,6 @@ #include "host_gemm.hpp" #include "device_tensor.hpp" #include "device_gemm_xdl.hpp" -#include "device_gemm_xdl_c_shuffle.hpp" #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" From 1df7eba2e083a24d84b4ffc736e03ad3d3e52cc5 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 02:58:23 +0000 Subject: [PATCH 29/32] format --- example/01_gemm/gemm_xdl_fp64.cpp | 6 +++--- example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 2 +- test/gemm/gemm_util.hpp | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index ff05c3d5417..150d547264e 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -85,7 +85,7 @@ std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix) int main(int argc, char* argv[]) { bool do_verification = 0; - int init_method = 0; + int init_method = 0; bool time_kernel = false; // GEMM shape @@ -101,13 +101,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index 7149c4f2779..52440e0d5f1 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -190,7 +190,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 94151f9eed7..a3cafa6df16 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -223,7 +223,6 @@ struct TestGemm std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; } - return res; } else From bb4b9f088a5405cdb63ac6bb5d46e26fa3f432c3 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 03:05:01 +0000 Subject: [PATCH 30/32] remove test_gemm_xdl_fp64 --- test/gemm/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt index 6716bbb1b5c..b8679e37157 100644 --- a/test/gemm/CMakeLists.txt +++ b/test/gemm/CMakeLists.txt @@ -1,8 +1,4 @@ # GEMM XDL -add_test_executable(test_gemm_xdl_fp64 gemm_xdl_fp64.cpp) -target_link_libraries(test_gemm_xdl_fp64 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_fp64 PRIVATE device_gemm_instance) - add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp) target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor) target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance) From 2b6d3e4c62604327b62b2dcb0500a75cfe6cefd5 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 06:58:32 +0000 Subject: [PATCH 31/32] add AccDataType --- test/gemm/gemm_dl_fp16.cpp | 11 ++++++++--- test/gemm/gemm_dl_fp32.cpp | 11 ++++++++--- test/gemm/gemm_dl_int8.cpp | 11 ++++++++--- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/test/gemm/gemm_dl_fp16.cpp b/test/gemm/gemm_dl_fp16.cpp index 6165355ec41..8a539372bad 100644 --- a/test/gemm/gemm_dl_fp16.cpp +++ b/test/gemm/gemm_dl_fp16.cpp @@ -43,9 +43,10 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(std::vector Date: Wed, 25 May 2022 08:05:35 +0000 Subject: [PATCH 32/32] AccDataType problem --- example/01_gemm/gemm_dl_fp16.cpp | 2 +- example/01_gemm/gemm_dl_fp32.cpp | 2 +- example/01_gemm/gemm_dl_int8.cpp | 2 +- .../gemm_xdl_requant_relu_requant_int8.cpp | 2 +- example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp | 3 ++- example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp | 1 + 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp index 6e8e04f9e51..63d96a8e991 100644 --- a/example/01_gemm/gemm_dl_fp16.cpp +++ b/example/01_gemm/gemm_dl_fp16.cpp @@ -52,7 +52,7 @@ using DeviceGemmInstance = ck::tensor_operation::device:: // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp index 65c806bf07e..20ca1a4d3d0 100644 --- a/example/01_gemm/gemm_dl_fp32.cpp +++ b/example/01_gemm/gemm_dl_fp32.cpp @@ -51,7 +51,7 @@ using DeviceGemmInstance = ck::tensor_operation::device:: // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp index a9590030c7f..caedb22537b 100644 --- a/example/01_gemm/gemm_dl_int8.cpp +++ b/example/01_gemm/gemm_dl_int8.cpp @@ -49,7 +49,7 @@ using DeviceGemmInstance = ck::tensor_operation::device:: // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp index c4bbe3cf4d2..a42df2b7f06 100644 --- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp +++ 
b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
@@ -103,7 +103,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
                                                                         CDataType,
+                                                                        AccDataType,
                                                                         AElementOp,
                                                                         BElementOp,
                                                                         CElementOp>;
 
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
index 4d837c4675c..ef3dc03ebc7 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
@@ -32,6 +32,7 @@ using CDataType = F16;
 using ReduceAccDataType = F32;
 using DDataType = F64;
 using DPtrsGlobal = ck::Tuple<DDataType*>;
+using AccDataType = F32;
 
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
@@ -59,7 +60,7 @@ using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
 
 int main(int argc, char* argv[])
 {
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp
index f818059f4a6..2b58eb20880 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp
@@ -32,6 +32,7 @@ using CDataType = F16;
 using ReduceAccDataType = F32;
 using DDataType = F32;
 using DPtrsGlobal = ck::Tuple<DDataType*>;
+using AccDataType = F32;
 
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
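The common thread in the last several patches is an explicit AccDataType template
parameter threaded through ReferenceGemm and the GEMM tests, so the host reference
accumulates in a type chosen independently of the input and output types. As a
standalone illustration only (not part of the patch series, and deliberately free of
Composable Kernel headers), a minimal reference GEMM with that parameterization might
look like the sketch below; the function and variable names here are hypothetical.

#include <cstddef>
#include <vector>

// Hypothetical sketch of the AccDataType pattern: accumulate A*B in
// AccDataType, convert to CDataType only once per output element.
// Row-major A (M x K), row-major B (K x N), row-major C (M x N).
template <typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AccDataType>
void reference_gemm(const std::vector<ADataType>& a,
                    const std::vector<BDataType>& b,
                    std::vector<CDataType>& c,
                    std::size_t M,
                    std::size_t N,
                    std::size_t K)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        for(std::size_t n = 0; n < N; ++n)
        {
            // Wider accumulator, e.g. float (or double) for fp16/bf16 inputs.
            AccDataType acc = 0;
            for(std::size_t k = 0; k < K; ++k)
            {
                acc += static_cast<AccDataType>(a[m * K + k]) *
                       static_cast<AccDataType>(b[k * N + n]);
            }
            c[m * N + n] = static_cast<CDataType>(acc);
        }
    }
}

// For the fp64 path this series adds, all four types are double:
// reference_gemm<double, double, double, double>(a, b, c, M, N, K);

For the low-precision tests, accumulating the host reference in a wider type keeps it
from losing precision that the device MFMA path retains. The earlier switch of the test
initializer to small integer values (GeneratorTensor_2 over [-5, 5]) plausibly serves
the same end: products and sums of small integers are exact in floating point, so
device and host results can be compared without loose tolerances.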