From c19ea8e37fc8b4cfa9430279b7974ecb588d489f Mon Sep 17 00:00:00 2001 From: ltqin Date: Tue, 19 Apr 2022 15:34:19 +0800 Subject: [PATCH 01/32] add intrin_mfma_f64_16x16x4f64 --- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 23 +++++++++++++++++++ include/ck/utility/amd_xdlops.hpp | 13 +++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 9d72abb72ea..cc993e012ba 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -25,6 +25,7 @@ enum struct MfmaInstr mfma_f32_16x16x8bf16, mfma_i32_32x32x8i8, mfma_i32_16x16x16i8, + mfma_f64_16x16x4f64, }; template @@ -383,6 +384,28 @@ struct mfma_type } }; +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = group_size * num_groups_per_blk; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = wave_size / num_threads_per_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f64_16x16x4f64::Run(a, b, reg_c); + } +}; + template struct MfmaSelector { diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 91d109bae10..09f10dd32f3 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -294,5 +294,18 @@ struct intrin_mfma_i32_16x16x16i8<16, 16> } }; +template +struct intrin_mfma_f64_16x16x4f64; + +template <> +struct intrin_mfma_f64_16x16x4f64<16, 16> +{ + template + __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; } // namespace ck #endif From bf5af9f91cf2ab0970bd272a5329001188a9f8be Mon Sep 17 00:00:00 2001 From: ltqin Date: Wed, 20 Apr 2022 15:42:48 +0800 Subject: [PATCH 02/32] add example --- example/01_gemm/CMakeLists.txt | 1 + example/01_gemm/gemm_xdl_fp64.cpp | 205 ++++++++++++++++++ .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 16 +- include/ck/utility/amd_xdlops.hpp | 2 +- 4 files changed, 218 insertions(+), 6 deletions(-) create mode 100644 example/01_gemm/gemm_xdl_fp64.cpp diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 696d3bac42d..6da2ecb0c52 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -1,3 +1,4 @@ add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp) add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp) +add_example_executable(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp new file mode 100644 index 00000000000..f62c787d2e5 --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include 
"host_tensor_generator.hpp" +#include "device_tensor.hpp" +#include "device_gemm_xdl.hpp" +#include "device_gemm_xdl_c_shuffle.hpp" +#include "device_gemm_xdl_cshuffle.hpp" +#include "element_wise_operation.hpp" +#include "reference_gemm.hpp" +#include "gemm_specialization.hpp" + +template +using S = ck::Sequence; + +using F64 = double; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = double; +using BDataType = double; +using CDataType = double; +using AccDataType = double; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl +//##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| +//##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| +//##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| +//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: run kernel # of times (>1)\n"); + printf("arg4 to 
9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1, stride})); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + } + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, nrepeat); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + if(do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + } + + return 0; +} diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index cc993e012ba..cb3ca891324 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -389,14 +389,14 @@ struct mfma_type { static constexpr index_t group_size = 4; static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = group_size * num_groups_per_blk; + static constexpr index_t num_regs_per_blk = 4; //group_size * num_groups_per_blk; static constexpr index_t num_threads_per_blk = 16; static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = wave_size / num_threads_per_blk; + static constexpr index_t num_input_blks = 4; //wave_size / num_threads_per_blk; static constexpr index_t num_output_blks = 1; static constexpr index_t m_per_blk = 16; static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 4; + static constexpr index_t k_per_blk = 1; static constexpr bool is_k_reduction = true; template @@ -412,6 +412,12 @@ struct MfmaSelector template static constexpr auto GetMfma(); + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f64_16x16x4f64; + } + template <> static constexpr auto GetMfma() { @@ -684,9 +690,9 @@ struct XdlopsGemm template __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { - static_assert(is_same::value || is_same::value || + static_assert(is_same::value ||is_same::value || is_same::value || is_same::value || is_same::value, - "base base_type must be float, half, bfloat16, and int8_t!"); + "base base_type must be double, float, half, bfloat16, and int8_t!"); static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { mfma_instr.template run(p_a_wave[k], p_b_wave[k], p_c_thread); diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 09f10dd32f3..7ab6f7f7cce 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -298,7 +298,7 @@ template struct intrin_mfma_f64_16x16x4f64; template <> -struct intrin_mfma_f64_16x16x4f64<16, 16> +struct intrin_mfma_f64_16x16x4f64<32, 32> { template __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c) From 6c8ca54bb6ed2d2bbc2853ed20e0d377cf7c390d Mon Sep 17 00:00:00 2001 From: ltqin Date: Wed, 20 Apr 2022 22:52:16 +0800 Subject: [PATCH 03/32] gemm reference add double data type --- 
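[Editorial note, not part of the original series: this patch makes the host-side ReferenceGemm take an explicit AccDataType template parameter (and updates all of its callers), so that the fp64 kernel is verified against a host result that also accumulates in double rather than in the previously hard-coded float. A minimal sketch of the idea, with a hypothetical free-function name:

    // Hypothetical illustration only; the real ReferenceGemm in CK additionally
    // applies the A/B/C element-wise operators and works on Tensor objects.
    template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
    void reference_gemm_mk_kn_mn(const ADataType* a, const BDataType* b, CDataType* c,
                                 int M, int N, int K)
    {
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
            {
                AccDataType v_acc = 0; // was 'float v_acc = 0;' before this patch
                for(int k = 0; k < K; ++k)
                    v_acc += static_cast<AccDataType>(a[m * K + k]) *
                             static_cast<AccDataType>(b[k * N + n]);
                c[m * N + n] = static_cast<CDataType>(v_acc);
            }
    }

With AccDataType = double the reference keeps full fp64 precision, whereas a float accumulator would make large-K fp64 problems fail verification on rounding error alone.]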
example/01_gemm/gemm_xdl_bf16.cpp | 2 +- example/01_gemm/gemm_xdl_fp16.cpp | 2 +- example/01_gemm/gemm_xdl_fp64.cpp | 2 +- example/01_gemm/gemm_xdl_int8.cpp | 9 ++++++-- .../conv2d_bwd_weight_xdl.cpp | 9 ++++++-- .../gemm_xdl_requant_relu_requant_int8.cpp | 9 ++++++-- .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 2 +- .../16_gemm_reduce/gemm_reduce_xdl_fp16.cpp | 2 +- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 23 ++++++++++--------- .../cpu/reference_gemm.hpp | 9 ++++---- profiler/include/profile_gemm_impl.hpp | 12 ++++++++-- profiler/include/profile_gemm_reduce_impl.hpp | 9 ++++++-- .../include/profile_grouped_gemm_impl.hpp | 2 ++ profiler/src/profile_gemm.cpp | 16 +++++++++++++ test/gemm/gemm_fp16.cpp | 11 ++++++--- test/gemm/gemm_fp32.cpp | 15 ++++++++---- test/gemm/gemm_int8.cpp | 15 ++++++++---- test/gemm/gemm_util.hpp | 3 +++ test/grouped_gemm/grouped_gemm_fp16.cpp | 9 ++++++-- 19 files changed, 116 insertions(+), 45 deletions(-) diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp index 8f0631c1cec..fd24d516222 100644 --- a/example/01_gemm/gemm_xdl_bf16.cpp +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -81,7 +81,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index 2d5a95e400c..1e5c1adb715 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index f62c787d2e5..579dd186ef8 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -54,7 +54,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp index 724757565ea..3d92490bdcf 100644 --- a/example/01_gemm/gemm_xdl_int8.cpp +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -80,8 +80,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle 4>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp index 7b74b40d328..bf78cc87e06 100644 --- a/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp +++ b/example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp @@ -72,8 +72,13 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device:: 8>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -using ReferenceConvBwdWeightInstance = ck::tensor_operation::host:: - ReferenceConvBwdWeight; +using ReferenceConvBwdWeightInstance = + ck::tensor_operation::host::ReferenceConvBwdWeight; int main(int argc, char* argv[]) { diff --git 
a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp index ca3b58bd00a..c125d6710d4 100644 --- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp +++ b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp @@ -83,8 +83,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle 16>; // CBlockTransferScalarPerVector_NWaveNPerXdl // clang-format on -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index 4e9bdbb2f5b..30d852bccda 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -56,7 +56,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp index 0346075c368..7142e8a1235 100644 --- a/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp +++ b/example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp @@ -52,7 +52,7 @@ using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_ // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index cb3ca891324..6728e2b89ac 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -387,17 +387,17 @@ struct mfma_type template <> struct mfma_type { - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; //group_size * num_groups_per_blk; + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk; static constexpr index_t num_threads_per_blk = 16; static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; //wave_size / num_threads_per_blk; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = true; + static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = true; template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const @@ -690,8 +690,9 @@ struct XdlopsGemm template __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { - static_assert(is_same::value ||is_same::value || is_same::value || - is_same::value || is_same::value, + static_assert(is_same::value || is_same::value || + is_same::value || 
is_same::value || + is_same::value, "base base_type must be double, float, half, bfloat16, and int8_t!"); static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 3601fafc281..e659f2d0e82 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -13,6 +13,7 @@ namespace host { template @@ -55,12 +56,12 @@ struct ReferenceGemm : public device::BaseOperator auto f_mk_kn_mn = [&](auto m, auto n) { const int K = arg.a_m_k_.mDesc.GetLengths()[1]; - float v_acc = 0; + AccDataType v_acc = 0; for(int k = 0; k < K; ++k) { - float v_a; - float v_b; + AccDataType v_a; + AccDataType v_b; arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k))); arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n))); @@ -68,7 +69,7 @@ struct ReferenceGemm : public device::BaseOperator v_acc += v_a * v_b; } - float v_c; + AccDataType v_c; arg.c_element_op_(v_c, v_acc); diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index f2661888442..0faac583753 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -85,6 +85,7 @@ namespace profiler { template @@ -457,8 +458,14 @@ void profile_gemm_impl(int do_verification, bf16_to_f32_(b_k_n, b_f32_k_n); bf16_to_f32_(c_m_n_device_result, c_m_n_device_f32_result); - using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + using ReferenceGemmInstance = + ck::tensor_operation::host::ReferenceGemm; auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); @@ -490,6 +497,7 @@ void profile_gemm_impl(int do_verification, ck::tensor_operation::host::ReferenceGemm; diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index e103aeff99e..ca4f4f63fad 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -127,8 +127,13 @@ bool profile_gemm_reduce_impl(int do_verification, if(do_verification) { - using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index cced480c36c..8bb87f97632 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -43,6 +43,7 @@ namespace profiler { template @@ -270,6 +271,7 @@ void profile_grouped_gemm_impl(int do_verification, ck::tensor_operation::host::ReferenceGemm; diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp index 7a72be2d8e9..ec70bb49727 100644 --- a/profiler/src/profile_gemm.cpp +++ b/profiler/src/profile_gemm.cpp @@ -68,6 +68,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -88,6 +89,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -108,6 +110,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -128,6 +131,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -146,6 +150,7 @@ int profile_gemm(int argc, char* argv[]) else if(data_type == 
GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) { ck::profiler::profile_gemm_impl( @@ -248,6 +257,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -268,6 +278,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -288,6 +299,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -308,6 +320,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -328,6 +341,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -348,6 +362,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( @@ -368,6 +383,7 @@ int profile_gemm(int argc, char* argv[]) ck::profiler::profile_gemm_impl( diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp index d7669bb2425..b2c15804ccb 100644 --- a/test/gemm/gemm_fp16.cpp +++ b/test/gemm/gemm_fp16.cpp @@ -52,9 +52,10 @@ void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( int main() { - using ADataType = ck::half_t; - using BDataType = ck::half_t; - using CDataType = ck::half_t; + using ADataType = ck::half_t; + using BDataType = ck::half_t; + using CDataType = ck::half_t; + using AccDataType = float; using RowMajor = ck::tensor_layout::gemm::RowMajor; using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; @@ -74,6 +75,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, RowMajor, RowMajor, @@ -96,6 +98,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, ColumnMajor, RowMajor, @@ -118,6 +121,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, RowMajor, RowMajor, @@ -142,6 +146,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, ColumnMajor, RowMajor, diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp index 6c86085f3b8..72ff08f427f 100644 --- a/test/gemm/gemm_fp32.cpp +++ b/test/gemm/gemm_fp32.cpp @@ -53,12 +53,13 @@ void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector gemmPtrs; @@ -75,6 +76,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, RowMajor, RowMajor, @@ -97,6 +99,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, ColumnMajor, RowMajor, @@ -119,6 +122,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, RowMajor, RowMajor, @@ -141,6 +145,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, RowMajor, ColumnMajor, RowMajor, diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp index 864fca8df4d..cc38da757a0 100644 --- a/test/gemm/gemm_int8.cpp +++ b/test/gemm/gemm_int8.cpp @@ -46,12 +46,13 @@ void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances( int main() { - using ADataType = int8_t; - using BDataType = int8_t; - using CDataType = int8_t; + using ADataType = int8_t; + using BDataType = int8_t; + using CDataType = int8_t; + using AccDataType = int32_t, - using RowMajor = ck::tensor_layout::gemm::RowMajor; - using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; + using RowMajor = ck::tensor_layout::gemm::RowMajor; + using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; std::vector gemmPtrs; bool res = true; @@ -65,6 +66,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, RowMajor, RowMajor, @@ -83,6 +85,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, ColumnMajor, ColumnMajor, RowMajor, @@ -101,6 +104,7 @@ int main() ADataType, BDataType, CDataType, + AccDataType, 
RowMajor,
 RowMajor,
 RowMajor,
@@ -119,6 +123,7 @@ int main()
 ADataType,
 BDataType,
 CDataType,
+ AccDataType,
 RowMajor,
 ColumnMajor,
 RowMajor,
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index 08c8edfb94b..f8de85ea883 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -106,6 +106,7 @@ template ;
@@ -306,6 +308,7 @@ struct TestGemmBF16
 // use fp32 host kernel to verify bf16 device kernel
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmFromDevice(c_device_tensors[i].mData.data());
- using ReferenceGemmInstance = ck::tensor_operation::host::
- ReferenceGemm;
+ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
 auto ref_gemm = ReferenceGemmInstance{};
 auto ref_invoker = ref_gemm.MakeInvoker();

From 10a2ae2f76405bd63c58e451da867ce87766ca2d Mon Sep 17 00:00:00 2001
From: qinletao
Date: Thu, 21 Apr 2022 02:11:29 +0000
Subject: [PATCH 04/32] change init data

---
 example/01_gemm/gemm_xdl_fp64.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 579dd186ef8..b19dc6266c2 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -135,8 +135,8 @@ int main(int argc, char* argv[])
 b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
 break;
 default:
- a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{});
- b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+ a_m_k.GenerateTensorValue(GeneratorTensor_1{1});
+ b_k_n.GenerateTensorValue(GeneratorTensor_1{1});
 }

 DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());

From 873d09585a0572bdf11ddbb7794e493d632a5034 Mon Sep 17 00:00:00 2001
From: ltqin
Date: Thu, 21 Apr 2022 13:21:24 +0800
Subject: [PATCH 05/32] fix M N PerXdlops

---
 example/01_gemm/gemm_xdl_fp64.cpp | 9 ++++++++-
 .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 20 +++++++++----------
 include/ck/utility/amd_xdlops.hpp | 7 ++++++-
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index b19dc6266c2..6a323e4fc25 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -50,7 +50,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
+ < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
 // clang-format on
 using ReferenceGemmInstance =
ck::tensor_operation::host:: @@ -198,6 +198,13 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); + if(0) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + } ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); } diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 6728e2b89ac..4d9a40e9547 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -387,17 +387,17 @@ struct mfma_type template <> struct mfma_type { - static constexpr index_t group_size = 4; - static constexpr index_t num_groups_per_blk = 1; - static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk; + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk; static constexpr index_t num_threads_per_blk = 16; static constexpr index_t wave_size = 64; - static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk; - static constexpr index_t num_output_blks = 1; - static constexpr index_t m_per_blk = 16; - static constexpr index_t n_per_blk = 16; - static constexpr index_t k_per_blk = 1; - static constexpr bool is_k_reduction = true; + static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = true; template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const @@ -413,7 +413,7 @@ struct MfmaSelector static constexpr auto GetMfma(); template <> - static constexpr auto GetMfma() + static constexpr auto GetMfma() { return MfmaInstr::mfma_f64_16x16x4f64; } diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 7ab6f7f7cce..00014dff2ad 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -298,13 +298,18 @@ template struct intrin_mfma_f64_16x16x4f64; template <> -struct intrin_mfma_f64_16x16x4f64<32, 32> +struct intrin_mfma_f64_16x16x4f64<16, 16> { template __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c) { +#ifdef __gxf90a__ reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); +#else + reg_c.template AsType()(Number<0>{}) = {reg_a, reg_a, reg_b, reg_b}; + +#endif } }; } // namespace ck From eff586ac3a27b0c4ced4bf69aa39119638f5bb96 Mon Sep 17 00:00:00 2001 From: qinletao Date: Thu, 21 Apr 2022 14:58:36 +0000 Subject: [PATCH 06/32] fix ifdef --- example/01_gemm/gemm_xdl_fp64.cpp | 12 ++++++++---- include/ck/utility/amd_xdlops.hpp | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index 6a323e4fc25..fe2bab01cd3 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -50,7 +50,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| 
Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
+ < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
 using ADataType = double;
 using BDataType = double;
 using CDataType = double;
 using AccDataType = double;
@@ -135,8 +135,10 @@ int main(int argc, char* argv[])
 b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
 break;
 default:
+ //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5});
+ b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5});
 a_m_k.GenerateTensorValue(GeneratorTensor_1{1});
- b_k_n.GenerateTensorValue(GeneratorTensor_1{1});
+ //b_k_n.GenerateTensorValue(GeneratorTensor_1{1});
 }

 DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
@@ -198,13 +200,15 @@ int main(int argc, char* argv[])

 ref_invoker.Run(ref_argument);

- if(0)
+ #if 1
 {
 LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
 LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
- LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+ LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl;
+ LogRangeAsType(std::cout << "c_host: ", c_m_n_host_result.mData, ",") << std::endl;
 }
+#endif
 ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
 }

diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index 00014dff2ad..e4f24ee2d5a 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -303,7 +303,7 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
 template
 __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
 {
-#ifdef __gxf90a__
+#ifdef __gfx90a__
 reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
 reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0);
 #else

From d443a7a675b1c84a88e15e9fd6cce8d30f76f0d1 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Thu, 21 Apr 2022 16:16:42 +0000
Subject: [PATCH 07/32] add comparison config

---
 example/01_gemm/gemm_xdl_fp64.cpp | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index fe2bab01cd3..d3a42f45f76 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -23,16 +23,13 @@ using S = ck::Sequence;

 using F64 = double;
 using F32 = float;
+using F16 = ck::half_t;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

 using PassThrough = ck::tensor_operation::element_wise::PassThrough;

-using ADataType = double;
-using BDataType = double;
-using CDataType = double;
-using AccDataType = double; using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; @@ -50,7 +47,19 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; +#if 1 + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; +using ADataType = double; +using BDataType = double; +using CDataType = double; +using AccDataType = double; +#else + < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; +using ADataType = float; +using BDataType = float; +using CDataType = float; +using AccDataType = float; +#endif // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: @@ -135,8 +144,8 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); //b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } @@ -205,7 +214,7 @@ int main(int argc, char* argv[]) LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_host: ", c_m_n_host_result.mData, ",") + LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl; } #endif From dcdbed2a47a7cff9ded53df524965d26aaa5a5af Mon Sep 17 00:00:00 2001 From: qinletao Date: Sat, 23 Apr 2022 04:32:16 +0000 Subject: [PATCH 08/32] add conv fwd example --- example/01_gemm/gemm_xdl_fp64.cpp | 19 +- example/09_convnd_fwd/CMakeLists.txt | 1 + example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 346 ++++++++++++++++++ 3 files changed, 356 insertions(+), 10 deletions(-) create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index d3a42f45f76..625d11d9459 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -30,7 +30,6 @@ using Col = ck::tensor_layout::gemm::ColumnMajor; using 
PassThrough = ck::tensor_operation::element_wise::PassThrough; - using ALayout = ck::tensor_layout::gemm::RowMajor; using BLayout = ck::tensor_layout::gemm::ColumnMajor; using CLayout = ck::tensor_layout::gemm::RowMajor; @@ -48,7 +47,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #if 1 - < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; using ADataType = double; using BDataType = double; using CDataType = double; @@ -144,10 +143,10 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - //a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + // a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - //b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + // b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); @@ -209,12 +208,12 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - #if 1 +#if 0 { - LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl; } #endif diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index fd6d11d9ff2..a142074696c 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -1,3 +1,4 @@ add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp) add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp) add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp) +add_example_executable(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp) diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp new file mode 100644 index 00000000000..e933dd3dc56 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include + +#include "check_err.hpp" +#include "config.hpp" +#include "conv_fwd_util.hpp" +#include "device.hpp" +#include "device_tensor.hpp" +#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "element_wise_operation.hpp" +#include 
"host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "reference_conv_fwd.hpp" +#include "tensor_layout.hpp" + +namespace { + +using InDataType = double; +using WeiDataType = double; +using OutDataType = double; +using AccDataType = double; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +using DeviceConvFwdBasePtr = + ck::tensor_operation::device::DeviceConvFwdPtr; + +template +using DeviceConvNDFwdInstance = ck::tensor_operation::device:: + DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< + // clang-format off + InDataType, // + WeiDataType, // + OutDataType, // + AccDataType, // + InElementOp, // Input Elementwise Operation + WeiElementOp, // Weights Elementwise Operation + OutElementOp, // Output Elementwise Operation + ConvFwdDefault, // ConvForwardSpecialization + NumDimSpatial, // NumDimSpatial + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 4, // K0PerBlock + 2, // K1 + 16, // MPerXDL + 16, // NPerXDL + 4, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 2, // ABlockTransferSrcScalarPerVector + 2, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 2, // BBlockTransferSrcScalarPerVector + 2, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockTransferAddExtraN + 7, // CThreadTransferSrcDstVectorDim + 1>; // CThreadTransferDstScalarPerVector +// clang-format on + +template +using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 3: { + return std::make_unique>(); + } + case 2: { + return std::make_unique>(); + } + case 1: { + return std::make_unique>(); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +void print_use_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4: N spatial dimensions (default 2)\n" + << "Following arguments (depending on number of spatial dims):\n" + << " N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" + << std::endl; +} + +ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[]) +{ + // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) + int conv_args = 3 + num_dim_spatial * 6; + int cmdline_nargs = conv_args + 5; + if(cmdline_nargs != argc) + { + print_use_msg(); + exit(0); + } + + ck::utils::conv::ConvParams params; + int arg_idx = 5; + + params.num_dim_spatial = num_dim_spatial; + params.N = std::stoi(argv[arg_idx++]); + params.K = 
std::stoi(argv[arg_idx++]); + params.C = std::stoi(argv[arg_idx++]); + + params.filter_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.input_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_strides.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_dilations.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + params.input_left_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + params.input_right_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + } + + return params; +} + +} // anonymous namespace + +int main(int argc, char* argv[]) +{ + using namespace ck::utils::conv; + + bool do_verification = 0; + int init_method = 0; + int nrepeat = 5; + int num_dim_spatial = 2; + + ck::utils::conv::ConvParams params; + + if(argc >= 5) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + nrepeat = std::stoi(argv[3]); + num_dim_spatial = std::stoi(argv[4]); + } + + if(argc >= 6) + { + params = parse_conv_params(num_dim_spatial, argc, argv); + } + + std::vector input_dims{static_cast(params.N), + static_cast(params.C)}; + input_dims.insert(std::end(input_dims), + std::begin(params.input_spatial_lengths), + std::end(params.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params.K), + static_cast(params.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params.filter_spatial_lengths), + std::end(params.filter_spatial_lengths)); + + const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); + std::vector output_dims{static_cast(params.N), + static_cast(params.K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial)); + Tensor weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial)); + Tensor host_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + Tensor device_output( + get_output_host_tensor_descriptor(output_dims, num_dim_spatial)); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weights: " << weights.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + // input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + input.GenerateTensorValue(GeneratorTensor_1{1}); + // weights.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem 
out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + + // do GEMM + auto conv = get_conv_instance(num_dim_spatial); + auto invoker = conv->MakeInvokerPointer(); + auto argument = + conv->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv->IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float ave_time = invoker->Run(argument.get(), nrepeat); + + std::size_t flop = get_flops( + params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = + get_btype(params.N, + params.C, + params.K, + params.input_spatial_lengths, + params.filter_spatial_lengths, + output_spatial_lengths); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(do_verification) + { + auto verify_f = [&input, &weights, &host_output, ¶ms, &out_device_buf, &device_output]( + const auto& ref_conv) { + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weights, + host_output, + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + out_device_buf.FromDevice(device_output.mData.data()); + ck::utils::check_err( + host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + }; + + switch(num_dim_spatial) + { + case 3: { + auto ref_conv = ReferenceConvNDFwdInstance<3>(); + verify_f(ref_conv); + break; + } + case 2: { + auto ref_conv = ReferenceConvNDFwdInstance<2>(); + verify_f(ref_conv); + break; + } + case 1: { + auto ref_conv = ReferenceConvNDFwdInstance<1>(); + verify_f(ref_conv); + break; + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } + } +} From ef77a1cacf25da5d385e2cff92eb1eea8c59f19f Mon Sep 17 00:00:00 2001 From: qinletao Date: Sun, 24 Apr 2022 10:54:02 +0000 Subject: [PATCH 09/32] format log out --- example/01_gemm/gemm_xdl_fp64.cpp | 34 ++++++++++++++----- example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 8 ++--- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 2 +- .../cpu/reference_gemm.hpp | 4 +-- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index 625d11d9459..bed6b9b1575 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -47,13 +47,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| 
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #if 1 - < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; + < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; using ADataType = double; using BDataType = double; using CDataType = double; using AccDataType = double; #else - < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>; + < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; using ADataType = float; using BDataType = float; using CDataType = float; @@ -64,6 +64,23 @@ using AccDataType = float; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +template +std::ostream& void show_2d_matrix(std::ostream& os, Tensor& matrix) +{ + os << "[" << std::endl; + for(int x = 0; x < matrix.mDesc.GetLengths()[0]; x++) + { + os << "["; + for(int y = 0; y < matrix.mDesc.GetLengths()[1]; y++) + { + os << std::setw(4) << static_cast(matrix(x, y)); + } + os << "]" << std::endl; + } + os << "]"; + return os; +} + int main(int argc, char* argv[]) { bool do_verification = 0; @@ -144,8 +161,8 @@ int main(int argc, char* argv[]) break; default: // a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); // b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } @@ -208,13 +225,12 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); -#if 0 +#if 1 { - LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") << std::endl; - LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") - << std::endl; + show_2d_matrix(std::cout << "a : ", a_m_k) << std::endl; + show_2d_matrix(std::cout << "b: ", b_k_n) << std::endl; + show_2d_matrix(std::cout << "c_device: ", c_m_n_device_result) << std::endl; + show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl; } #endif ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index e933dd3dc56..8c4008b36d1 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -49,14 +49,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device:: ConvFwdDefault, // ConvForwardSpecialization NumDimSpatial, // NumDimSpatial 256, // BlockSize - 128, // MPerBlock - 128, // NPerBlock + 64, // MPerBlock + 64, // NPerBlock 4, // K0PerBlock 2, // K1 16, // MPerXDL 16, // NPerXDL - 4, // MXdlPerWave - 4, // NXdlPerWave + 2, // MXdlPerWave + 2, // NXdlPerWave S<4, 64, 1>, // 
ABlockTransferThreadClusterLengths_K0_M_K1
 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
 S<1, 0, 2>, // ABlockTransferSrcAccessOrder
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 4d9a40e9547..54916c2b433 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -25,7 +25,7 @@ enum struct MfmaInstr
     mfma_f32_16x16x8bf16,
     mfma_i32_32x32x8i8,
     mfma_i32_16x16x16i8,
-    mfma_f64_16x16x4f64,
+    mfma_f64_16x16x4f64
 };

 template
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index e659f2d0e82..1326583bf62 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -63,8 +63,8 @@ struct ReferenceGemm : public device::BaseOperator
                     AccDataType v_a;
                     AccDataType v_b;

-                    arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k)));
-                    arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n)));
+                    arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k)));
+                    arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n)));

                     v_acc += v_a * v_b;
                 }

From 7e8e54dead53f981b7e562c1eed6e80cb8702aac Mon Sep 17 00:00:00 2001
From: qinletao
Date: Sun, 24 Apr 2022 12:36:26 +0000
Subject: [PATCH 10/32] change rc matrix register layout

---
 example/01_gemm/gemm_xdl_fp64.cpp                    | 2 +-
 include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index bed6b9b1575..54a64d6eb5b 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -65,7 +65,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
     ReferenceGemm;

 template
-std::ostream& void show_2d_matrix(std::ostream& os, Tensor& matrix)
+std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix)
 {
     os << "[" << std::endl;
     for(int x = 0; x < matrix.mDesc.GetLengths()[0]; x++)
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 54916c2b433..a39b795818e 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -387,8 +387,8 @@ struct mfma_type
 template <>
 struct mfma_type
 {
-    static constexpr index_t group_size          = 4;
-    static constexpr index_t num_groups_per_blk  = 1;
+    static constexpr index_t group_size          = 1;
+    static constexpr index_t num_groups_per_blk  = 4;
     static constexpr index_t num_regs_per_blk    = 4; // group_size * num_groups_per_blk;
     static constexpr index_t num_threads_per_blk = 16;
     static constexpr index_t wave_size           = 64;

From 3991a1c132cd2cd51bbe074a02b0b5a4da6bc6b2 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Mon, 25 Apr 2022 03:22:03 +0000
Subject: [PATCH 11/32] reorganize example

---
 example/01_gemm/gemm_xdl_fp64.cpp             | 32 +++++++++++++--------
 example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 12 +++----
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 54a64d6eb5b..94cad6c4756 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -46,7 +46,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise|
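
On the layout change in PATCH 10 above: the fp64 16x16x4 xdlops block still yields four accumulator values per lane, but swapping group_size and num_groups_per_blk changes which C rows those four registers land on. A standalone sketch of the index math, mirroring one reading of CK's k-reduction mapping (illustrative, not authoritative):

#include <cstdio>

int main()
{
    constexpr int group_size          = 1;  // after PATCH 10 (was 4)
    constexpr int num_groups_per_blk  = 4;  // after PATCH 10 (was 1)
    constexpr int num_threads_per_blk = 16; // columns covered by one block
    constexpr int num_input_blks      = 64 / num_threads_per_blk; // 4 K-slices per wave

    for(int lane = 0; lane < 64; lane += 21) // a few sample lanes
    {
        const int blk_id = lane / num_threads_per_blk;
        const int blk_td = lane % num_threads_per_blk;
        for(int g = 0; g < num_groups_per_blk; ++g)
            for(int i = 0; i < group_size; ++i)
            {
                const int row = g * num_input_blks * group_size + blk_id * group_size + i;
                std::printf("lane %2d reg %d -> C(%2d, %2d)\n",
                            lane, g * group_size + i, row, blk_td);
            }
    }
    return 0;
}

With the old constants this mapping gives row = 4 * blk_id + i (rows contiguous per lane group); with the new ones it gives row = 4 * g + blk_id, i.e. each register steps four rows, which appears to be the point of re-describing the C-matrix register layout.
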
Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | -#if 1 +#if 0 < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; using ADataType = double; using BDataType = double; @@ -59,10 +59,15 @@ using BDataType = float; using CDataType = float; using AccDataType = float; #endif -// clang-format on + // clang-format on -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; template std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix) @@ -88,13 +93,13 @@ int main(int argc, char* argv[]) int nrepeat = 5; // GEMM shape - ck::index_t M = 3840; - ck::index_t N = 4096; - ck::index_t K = 4096; + ck::index_t M = 32; + ck::index_t N = 32; + ck::index_t K = 4; - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; + ck::index_t StrideA = 4; + ck::index_t StrideB = 4; + ck::index_t StrideC = 32; if(argc == 4) { @@ -144,6 +149,7 @@ int main(int argc, char* argv[]) Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + std::cout << "data type: " << typeid(ADataType{}).name() << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; @@ -160,10 +166,8 @@ int main(int argc, char* argv[]) b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); break; default: - // a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); - b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - // b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); } DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); @@ -225,7 +229,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); -#if 1 +#if 0 { show_2d_matrix(std::cout << "a : ", a_m_k) << std::endl; show_2d_matrix(std::cout << "b: ", b_k_n) << std::endl; diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index 8c4008b36d1..cdda09db42e 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -49,14 +49,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device:: ConvFwdDefault, // ConvForwardSpecialization NumDimSpatial, // NumDimSpatial 256, // BlockSize - 64, // MPerBlock - 64, // NPerBlock + 128, // MPerBlock + 128, // NPerBlock 4, // K0PerBlock 2, // K1 16, // MPerXDL 16, // NPerXDL - 2, // MXdlPerWave - 2, // NXdlPerWave + 4, // MXdlPerWave + 4, // NXdlPerWave S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 S<1, 0, 2>, // 
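
The shrunken defaults above (M = N = 32, K = 4, both operands filled by GeneratorTensor_1{1} in the default init path) make a convenient bring-up check: every element of C must come out equal to K. A host-side statement of that expectation:

#include <cassert>

int main()
{
    const int K = 4; // depth of the tiny example shape
    double c = 0.0;
    for(int k = 0; k < K; ++k)
        c += 1.0 * 1.0; // a(m, k) * b(k, n) with the all-ones generators
    assert(c == K);     // expected value of every c(m, n)
    return 0;
}
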
ABlockTransferThreadClusterArrangeOrder
 S<1, 0, 2>, // ABlockTransferSrcAccessOrder
@@ -241,10 +241,8 @@ int main(int argc, char* argv[])
         weights.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5});
         break;
     default:
-        // input.GenerateTensorValue(GeneratorTensor_2{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2{-5, 5});
         input.GenerateTensorValue(GeneratorTensor_1{1});
-        // weights.GenerateTensorValue(GeneratorTensor_1{1});
+        weights.GenerateTensorValue(GeneratorTensor_1{1});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());

From ea970541d0e125308a068d712974e71afce25d81 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Mon, 25 Apr 2022 03:56:53 +0000
Subject: [PATCH 12/32] reorganize example 2

---
 example/01_gemm/gemm_xdl_fp64.cpp | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 94cad6c4756..303c0e3b3b2 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -25,6 +25,11 @@ using F64 = double;
 using F32 = float;
 using F16 = ck::half_t;

+using ADataType   = double;
+using BDataType   = double;
+using CDataType   = double;
+using AccDataType = double;
+
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

@@ -46,28 +51,11 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-#if 0
     < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
-using ADataType = double;
-using BDataType = double;
-using CDataType = double;
-using AccDataType = double;
-#else
-    < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
-using ADataType = float;
-using BDataType = float;
-using CDataType = float;
-using AccDataType = float;
-#endif
-    // clang-format on
+// clang-format on

-    using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;
+using ReferenceGemmInstance = ck::tensor_operation::host::
+    ReferenceGemm;

 template
 std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix)

From b615d65336d6c48a39234697da5afd38ccd68c9f Mon Sep 17 00:00:00 2001
From: qinletao
Date: Mon, 25 Apr 2022 09:22:51 +0000
Subject: [PATCH 13/32] format after merging develop

---
 library/src/utility/conv_fwd_util.cpp | 35 +++++++++++++--------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/library/src/utility/conv_fwd_util.cpp b/library/src/utility/conv_fwd_util.cpp
index fde2caa56b3..16584503887 100644
---
a/library/src/utility/conv_fwd_util.cpp
+++ b/library/src/utility/conv_fwd_util.cpp
@@ -37,16 +37,16 @@ std::size_t get_flops(ck::index_t N,
 }

 ConvParams::ConvParams()
-    : num_dim_spatial(2),
-      N(128),
-      K(256),
-      C(192),
-      filter_spatial_lengths(2, 3),
-      input_spatial_lengths(2, 71),
-      conv_filter_strides(2, 2),
-      conv_filter_dilations(2, 1),
-      input_left_pads(2, 1),
-      input_right_pads(2, 1)
+    : num_dim_spatial(2),
+      N(128),
+      K(256),
+      C(192),
+      filter_spatial_lengths(2, 3),
+      input_spatial_lengths(2, 71),
+      conv_filter_strides(2, 2),
+      conv_filter_dilations(2, 1),
+      input_left_pads(2, 1),
+      input_right_pads(2, 1)
 {
 }

@@ -77,9 +77,9 @@ ConvParams::ConvParams(ck::index_t n_dim,
        conv_filter_dilations.size() != num_dim_spatial ||
        input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
     {
-        throw(std::runtime_error(
-            "ConvParams::GetOutputSpatialLengths: "
-            "parameter size is different from number of declared dimensions!"));
+        throw(
+            std::runtime_error("ConvParams::GetOutputSpatialLengths: "
+                               "parameter size is different from number of declared dimensions!"));
     }
 }

@@ -91,9 +91,9 @@ std::vector ConvParams::GetOutputSpatialLengths() const
        conv_filter_dilations.size() != num_dim_spatial ||
        input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial)
     {
-        throw(std::runtime_error(
-            "ConvParams::GetOutputSpatialLengths: "
-            "parameter size is different from number of declared dimensions!"));
+        throw(
+            std::runtime_error("ConvParams::GetOutputSpatialLengths: "
+                               "parameter size is different from number of declared dimensions!"));
     }

     std::vector out_spatial_len(num_dim_spatial, 0);
     {
         // XEff = (X - 1) * conv_dilation_w + 1;
         // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-        const ck::index_t idx_eff =
-            (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
+        const ck::index_t idx_eff = (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1;
         out_spatial_len[i] =
             (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) /
                 conv_filter_strides[i] +

From 6bf08bb91a7f681e6d2f8c74161fa8b7a9921128 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Tue, 26 Apr 2022 00:16:35 +0000
Subject: [PATCH 14/32] fix impl call by adding acc data type

---
 profiler/src/profile_grouped_gemm.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp
index 88a2a8f855d..efa4b9e0f89 100644
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
     if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
         ck::profiler::profile_grouped_gemm_impl Date: Tue, 26 Apr 2022 02:29:58 +0000
Subject: [PATCH 15/32] fix lost ';'

---
 test/gemm/gemm_fp32.cpp | 6 +++---
 test/gemm/gemm_int8.cpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp
index 72ff08f427f..89701511dc6 100644
--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -56,10 +56,10 @@ int main()
     using ADataType   = float;
     using BDataType   = float;
     using CDataType   = float;
-    using AccDataType = float,
+    using AccDataType = float;

-    using RowMajor = ck::tensor_layout::gemm::RowMajor;
-    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor
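
A worked instance of the output-length formula kept in the reflowed code above, using the ConvParams defaults from the same file (Wi = 71, X = 3, stride 2, dilation 1, pads 1):

#include <cstdio>

int main()
{
    const int Wi = 71, X = 3, stride = 2, dilation = 1, left_pad = 1, right_pad = 1;
    const int x_eff = (X - 1) * dilation + 1;                           // XEff = 3
    const int Wo    = (Wi + left_pad + right_pad - x_eff) / stride + 1; // Wo = 36
    std::printf("Wo = %d\n", Wo);
    return 0;
}
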
= ck::tensor_layout::gemm::ColumnMajor;

     bool res = true;
     std::vector gemmPtrs;
diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp
index cc38da757a0..de18dee94ff 100644
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
@@ -49,10 +49,10 @@ int main()
     using ADataType   = int8_t;
     using BDataType   = int8_t;
     using CDataType   = int8_t;
-    using AccDataType = int32_t,
+    using AccDataType = int32_t;

-    using RowMajor = ck::tensor_layout::gemm::RowMajor;
-    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;

     std::vector gemmPtrs;
     bool res = true;

From bc7b53348b81a25b1122480739de63778a4cda08 Mon Sep 17 00:00:00 2001
From: ltqin
Date: Tue, 26 Apr 2022 16:25:33 +0800
Subject: [PATCH 16/32] add compiler warning

---
 include/ck/utility/amd_xdlops.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index e4f24ee2d5a..5f305a49297 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -307,8 +307,10 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
         reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
             reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0);
 #else
-        reg_c.template AsType()(Number<0>{}) = {reg_a, reg_a, reg_b, reg_b};
-
+#pragma message "this GPU card doesn't support the mfma_f64_16x16x4f64 instruction!"
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
 #endif
     }
 };

From 85ef3f285864bd6843e32ae7badd1a66c37dca17 Mon Sep 17 00:00:00 2001
From: qinletao
Date: Fri, 29 Apr 2022 04:40:54 +0000
Subject: [PATCH 17/32] change example tuning parameters

---
 example/01_gemm/gemm_xdl_fp64.cpp | 27 ++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp
index 303c0e3b3b2..eb0776b3620 100644
--- a/example/01_gemm/gemm_xdl_fp64.cpp
+++ b/example/01_gemm/gemm_xdl_fp64.cpp
@@ -51,11 +51,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl
 //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+#if 0
     < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>;
-// clang-format on
+#else
+    < F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>;
+#endif
+    // clang-format on

-using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm;
+    using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm;

 template
 std::ostream&
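
The #else branch added in PATCH 16 above follows a common device-code pattern: compile the builtin only for targets that have it, and consume the operands otherwise so unused-parameter warnings stay quiet (CK's `ignore` is assumed here to behave like std::ignore). A host-compilable sketch of the same shape:

#include <tuple>

template <typename A, typename B, typename C>
void run_mfma(const A& a, const B& b, C& c)
{
#if defined(__gfx90a__)
    c += a * b; // stand-in for the __builtin_amdgcn_mfma_f64_16x16x4f64 path
#else
    std::ignore = a; // consume the operands, emit nothing
    std::ignore = b;
    std::ignore = c;
#endif
}

int main()
{
    double a = 2.0, b = 3.0, c = 0.0;
    run_mfma(a, b, c);
    return 0;
}
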
show_2d_matrix(std::ostream& os, Tensor& matrix) @@ -81,13 +90,13 @@ int main(int argc, char* argv[]) int nrepeat = 5; // GEMM shape - ck::index_t M = 32; - ck::index_t N = 32; - ck::index_t K = 4; + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; - ck::index_t StrideA = 4; - ck::index_t StrideB = 4; - ck::index_t StrideC = 32; + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; if(argc == 4) { From 4a77f4536cdf07883a2d0c6edbe19193bc685dba Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 17:52:15 +0800 Subject: [PATCH 18/32] add test for fp64 --- .../gpu/gemm/CMakeLists.txt | 4 ++++ test/gemm/CMakeLists.txt | 4 ++++ test/gemm/gemm_util.hpp | 14 ++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index 5f057adcc5f..74527b1e265 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -1,5 +1,9 @@ # device_gemm_instance set(DEVICE_GEMM_INSTANCE_SOURCE + device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp; + device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp; + device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp; + device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp; device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp; device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp; device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp; diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt index 83b3c1e2e30..6bb4d3c723d 100644 --- a/test/gemm/CMakeLists.txt +++ b/test/gemm/CMakeLists.txt @@ -1,3 +1,7 @@ +add_test_executable(test_gemm_fp64 gemm_fp64.cpp) +target_link_libraries(test_gemm_fp64 PRIVATE host_tensor) +target_link_libraries(test_gemm_fp64 PRIVATE device_gemm_instance) + add_test_executable(test_gemm_fp32 gemm_fp32.cpp) target_link_libraries(test_gemm_fp32 PRIVATE host_tensor) target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index f8de85ea883..1be24e0fba5 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -140,12 +140,12 @@ struct TestGemm Tensor c_m_n_device_result( f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - auto f_generate_tensor_value = [](auto desc, auto type) { + auto f_generate_tensor_value = [](auto& desc, auto type) { using dataType = decltype(type); - if(std::is_same::value) + if(std::is_same::value || std::is_same::value) { - desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); } else { @@ -161,6 +161,7 @@ struct TestGemm auto operator()(DeviceGemmPtr_& gemmPtr) { + std::cout << "data type: " << typeid(ADataType{}).name() << std::endl; std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name << ", CLayout = " << CLayout{}.name << std::endl; std::cout << gemmPtr->GetTypeString() << std::endl; @@ -202,7 +203,12 @@ struct TestGemm // Assert bool res = false; - if(std::is_same::value) + if(std::is_same::value) + { + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; + } + else if(std::is_same::value) { res = ck::utils::check_err(c_device.mData, c_host.mData); std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; From 75ef75beb96db75735046978572b5b9654f7e16d Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 17:55:12 +0800 Subject: [PATCH 19/32] add instance --- ...gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp | 50 +++++++++++++++++ ...gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp | 50 +++++++++++++++++ ...gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp | 50 +++++++++++++++++ ...gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp | 55 +++++++++++++++++++ 4 files changed, 205 insertions(+) create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp new file mode 100644 index 00000000000..7b19e7c38c2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp @@ -0,0 +1,50 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_f64_f64_f64_km_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 
4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_kn_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp new file mode 100644 index 00000000000..d903030cc49 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp @@ -0,0 +1,50 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_f64_f64_f64_km_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + 
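
Reading the file names in these instance sources: the two letters per operand give its storage order, so km_kn_mn is a[k, m] * b[k, n] = c[m, n] (A column-major, B row-major), matching each file's leading comment. A small sketch of the addressing that implies, assuming the usual packed-stride convention (StrideA = M for a[k, m]); this is illustrative only:

#include <cstdio>

int main()
{
    const int M = 4, K = 3, StrideA = M; // a[k, m]: K-major storage of an MxK matrix
    auto at = [&](int m, int k) { return m + k * StrideA; };
    std::printf("a(1, 2) -> linear index %d of %d\n", at(1, 2), M * K);
    return 0;
}
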
//##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_nk_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp new file mode 100644 index 00000000000..f021f018da0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp @@ -0,0 +1,50 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | 
XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp new file mode 100644 index 00000000000..e34022fca3e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp @@ -0,0 +1,55 @@ +#include +#include "config.hpp" +#include "device_gemm_xdl.hpp" +#include "element_wise_operation.hpp" +#include "device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| 
BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, 
PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances{}); +} + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From 94fad45b0b21b60f420e55113fb335be7caea355 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 17:56:53 +0800 Subject: [PATCH 20/32] add test/gemm/gemm_fp64.cpp --- test/gemm/gemm_fp64.cpp | 136 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 test/gemm/gemm_fp64.cpp diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp new file mode 100644 index 00000000000..fa95d1d6380 --- /dev/null +++ b/test/gemm/gemm_fp64.cpp @@ -0,0 +1,136 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "gemm_util.hpp" +#include "config.hpp" +#include "print.hpp" +#include "device.hpp" +#include "host_tensor.hpp" +#include "host_tensor_generator.hpp" +#include "host_gemm.hpp" +#include "device_tensor.hpp" +#include "device_gemm_xdl.hpp" +#include "device_gemm_xdl_c_shuffle.hpp" +#include "element_wise_operation.hpp" +#include "reference_gemm.hpp" +#include "gemm_specialization.hpp" + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using DeviceGemmNoOpPtr = + ck::tensor_operation::device::DeviceGemmPtr; + +namespace ck { +namespace tensor_operation { +namespace device { +namespace device_gemm_instance { +void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(std::vector&); +void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(std::vector&); +void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(std::vector&); +void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(std::vector&); + +} // namespace device_gemm_instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +int main() +{ +#ifdef __gfx90a__ + using ADataType = double; + using BDataType = double; + using CDataType = double; + using AccDataType = double; + + using RowMajor = ck::tensor_layout::gemm::RowMajor; + using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; + + bool res = true; + std::vector gemmPtrs; + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } + + gemmPtrs.clear(); + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } + + gemmPtrs.clear(); + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } + + gemmPtrs.clear(); + ck::tensor_operation::device::device_gemm_instance:: + add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(gemmPtrs); + + for(auto& gemmPtr : gemmPtrs) + { + res &= ck::gemm_util::TestGemm{}(gemmPtr); + } +#else + bool res = true; +#endif + std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; + return res ? 
0 : 1; +} From 1286072ae2a3a5fbfd887af7887ed8b873713cc3 Mon Sep 17 00:00:00 2001 From: qinletao Date: Fri, 29 Apr 2022 11:07:25 +0000 Subject: [PATCH 21/32] fix get name issue --- test/gemm/gemm_fp64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp index 28e5a453976..7714290b87d 100644 --- a/test/gemm/gemm_fp64.cpp +++ b/test/gemm/gemm_fp64.cpp @@ -63,7 +63,7 @@ inline std::string get_device_name() int main() { - if(get_device_name() != "gfx90a") + if(get_device_name().find("gfx90a") == std::string::npos) { std::cout << "TestGemm ..... SUCCESS" << std::endl; return 0; From 04397fa0736f7ae20c9b94b85a818a6528630e29 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 19:11:22 +0800 Subject: [PATCH 22/32] remove some tunning parameter --- include/ck/utility/amd_xdlops.hpp | 1 - ...gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp | 1 - ...gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp | 1 - ...gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp | 1 - ...gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp | 1 - test/gemm/gemm_fp64.cpp | 29 ++++++++++++++++--- 6 files changed, 25 insertions(+), 9 deletions(-) diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 5f305a49297..47ab8b3b4d9 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -307,7 +307,6 @@ struct intrin_mfma_f64_16x16x4f64<16, 16> reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64( reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); #else -#pragma message "this GPU card don't support mfma_f64_16x16x4f64 instruction!" ignore = reg_a; ignore = reg_b; ignore = reg_c; diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp index 7b19e7c38c2..fdc85dfc710 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_km_kn_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, 
S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp index d903030cc49..e400cd9bbba 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_km_nk_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp index f021f018da0..2f9241b93b3 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp index e34022fca3e..537fe2bdae7 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp @@ -29,7 +29,6 @@ using device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances = //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 2, 16, 16, 8, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp index fa95d1d6380..28e5a453976 100644 --- a/test/gemm/gemm_fp64.cpp +++ b/test/gemm/gemm_fp64.cpp @@ -41,9 +41,33 @@ 
void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(std::vector{}(gemmPtr); } -#else - bool res = true; -#endif std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 0 : 1; } From 345acac1e974881d3e8fc34ef0be850fe1bab1b0 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 19:47:56 +0800 Subject: [PATCH 23/32] fix conflict --- test/gemm/gemm_util.hpp | 343 ---------------------------------------- 1 file changed, 343 deletions(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 75f01a35a8e..1be24e0fba5 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -1,4 +1,3 @@ -<<<<<<< HEAD #ifndef GEMM_UTILS_HPP #define GEMM_UTILS_HPP @@ -348,345 +347,3 @@ struct TestGemmBF16 } // namespace gemm_util } // namespace ck #endif -======= -#ifndef GEMM_UTILS_HPP -#define GEMM_UTILS_HPP - -#include "check_err.hpp" -#include "config.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "reference_gemm.hpp" -#include "tensor_layout.hpp" - -namespace ck { -namespace gemm_util { - -struct GemmParams -{ - GemmParams() - : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0) - { - } - - ck::index_t M; - ck::index_t N; - ck::index_t K; - - ck::index_t StrideA; - ck::index_t StrideB; - ck::index_t StrideC; - - float alpha; - float beta; -}; - -template -void RunHostGEMM(const Tensor& A, - const Tensor& B, - Tensor& C, - AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) -{ - auto ref_gemm = GemmInstance{}; - auto ref_invoker = ref_gemm.MakeInvoker(); - - auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op); - - ref_invoker.Run(ref_argument); -} - -template -void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, - const ck::gemm_util::GemmParams& params, - const Tensor& A, - const Tensor& B, - Tensor& C, - AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, - CElementwiseOperation c_element_op) -{ - DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace()); - DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace()); - DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace()); - - a_m_k_device_buf.ToDevice(A.mData.data()); - b_k_n_device_buf.ToDevice(B.mData.data()); - - auto invoker_ptr = gemmPtr->MakeInvokerPointer(); - auto argument_ptr = - gemmPtr->MakeArgumentPointer(static_cast(a_m_k_device_buf.GetDeviceBuffer()), - static_cast(b_k_n_device_buf.GetDeviceBuffer()), - static_cast(c_m_n_device_buf.GetDeviceBuffer()), - params.M, - params.N, - params.K, - params.StrideA, - params.StrideB, - params.StrideC, - a_element_op, - b_element_op, - c_element_op); - - if(!gemmPtr->IsSupportedArgument(argument_ptr.get())) - { - throw std::runtime_error( - "wrong! 
device_gemm with the specified compilation parameters does " - "not support this GEMM problem"); - } - - invoker_ptr->Run(argument_ptr.get()); - c_m_n_device_buf.FromDevice(C.mData.data()); -} - -template -struct TestGemm -{ - auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params) - { - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); - } - }; - - Tensor a_m_k( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_k_n( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_m_n_host_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - Tensor c_m_n_device_result( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - auto f_generate_tensor_value = [](auto& desc, auto type) { - using dataType = decltype(type); - - if(std::is_same::value) - { - desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - } - else - { - desc.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - }; - - f_generate_tensor_value(a_m_k, ADataType{}); - f_generate_tensor_value(b_k_n, BDataType{}); - - return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result); - } - - auto operator()(DeviceGemmPtr_& gemmPtr) - { - std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name - << ", CLayout = " << CLayout{}.name << std::endl; - std::cout << gemmPtr->GetTypeString() << std::endl; - - // Arrange - ck::gemm_util::GemmParams params; - params.M = 1024; - params.N = 1024; - params.K = 1024; - params.StrideA = 1024; - params.StrideB = 1024; - params.StrideC = 1024; - - auto host_tensors = PrepareGemmTensor(params); - - const Tensor& a = std::get<0>(host_tensors); - const Tensor& b = std::get<1>(host_tensors); - Tensor& c_host = std::get<2>(host_tensors); - Tensor& c_device = std::get<3>(host_tensors); - - auto a_element_op = AElementwiseOperation{}; - auto b_element_op = BElementwiseOperation{}; - auto c_element_op = CElementwiseOperation{}; - - using ReferenceGemmInstance = - ck::tensor_operation::host::ReferenceGemm; - ck::gemm_util::RunHostGEMM( - a, b, c_host, a_element_op, b_element_op, c_element_op); - - // Act - ck::gemm_util::RunDeviceGEMM( - gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op); - - // Assert - bool res = false; - if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } - else if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } - else if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; - } - - return res; - } -}; - -template -struct TestGemmBF16 -{ - using BF16 = ck::bhalf_t; - - auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params) - { - auto f_host_tensor_descriptor = - [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { - if(std::is_same::value) - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); - } - else - { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); - } - }; - - // use fp32 host kernel to verify bf16 device kernel - Tensor a_m_k_bf16( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_k_n_bf16( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_m_n_device_bf16( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - Tensor a_m_k_fp32( - f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); - Tensor b_k_n_fp32( - f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); - Tensor c_m_n_host_fp32( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - Tensor c_m_n_device_fp32( - f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - - a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - - bf16_to_f32_(a_m_k_bf16, a_m_k_fp32); - bf16_to_f32_(b_k_n_bf16, b_k_n_fp32); - - return std::make_tuple(a_m_k_bf16, - b_k_n_bf16, - c_m_n_device_bf16, - a_m_k_fp32, - b_k_n_fp32, - c_m_n_host_fp32, - c_m_n_device_fp32); - } - - auto operator()(DeviceGemmPtr_& gemmPtr) - { - // Arrange - ck::gemm_util::GemmParams params; - params.M = 1024; - params.N = 1024; - params.K = 1024; - params.StrideA = 1024; - params.StrideB = 1024; - params.StrideC = 1024; - - auto host_tensors = PrepareGemmTensorBF16(params); - const Tensor& a_bf16 = std::get<0>(host_tensors); - const Tensor& b_bf16 = std::get<1>(host_tensors); - Tensor& c_device_bf16 = std::get<2>(host_tensors); - Tensor& a_fp32 = std::get<3>(host_tensors); - Tensor& b_fp32 = std::get<4>(host_tensors); - Tensor& c_host_fp32 = std::get<5>(host_tensors); - Tensor& c_device_fp32 = std::get<6>(host_tensors); - - auto a_element_op = AElementwiseOperation{}; - auto b_element_op = BElementwiseOperation{}; - auto c_element_op = CElementwiseOperation{}; - - // use fp32 host kernel to verify bf16 device kernel - using ReferenceGemmInstance = - ck::tensor_operation::host::ReferenceGemm; - ck::gemm_util::RunHostGEMM( - a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op); - - // Act - ck::gemm_util::RunDeviceGEMM(gemmPtr, - params, - a_bf16, - b_bf16, - c_device_bf16, - a_element_op, - b_element_op, - c_element_op); - - bf16_to_f32_(c_device_bf16, c_device_fp32); - - // Assert - bool res = ck::utils::check_err( - c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f); - std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; - - return res; - }; -}; - -} // namespace gemm_util -} // namespace ck -#endif ->>>>>>> develop From 58f4d82119e72da7765940b8b0a47a8c1a068344 Mon Sep 17 00:00:00 2001 From: ltqin Date: Fri, 29 Apr 2022 20:32:23 +0800 Subject: [PATCH 24/32] format --- .../tensor_operation/gpu/device/device_batched_gemm_xdl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp index 56ec5a7f2c9..eda68234248 100644 --- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp @@ -385,8 +385,8 @@ struct DeviceBatchedGemmXdl c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)}, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, compute_ptr_offset_of_batch_{a_grid_desc_k0_m_k1_.GetElementSpaceSize(), - b_grid_desc_k0_n_k1_.GetElementSpaceSize(), - c_grid_desc_m_n_.GetElementSpaceSize()}, + b_grid_desc_k0_n_k1_.GetElementSpaceSize(), + c_grid_desc_m_n_.GetElementSpaceSize()}, block_2_ctile_map_{}, M01_{M01}, N01_{N01}, From 579e8e7691256d66955f9fb8417a12289a2325ae Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 30 Apr 2022 00:16:31 +0000 Subject: [PATCH 25/32] use integer value for GEMM test --- test/gemm/gemm_util.hpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 5f657a543c3..17e954b7f2c 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -139,17 +139,10 @@ struct TestGemm Tensor c_m_n_device_result( f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); - auto f_generate_tensor_value = [](auto& desc, auto type) { + auto f_generate_tensor_value = [](auto& tensor, auto type) { using dataType = decltype(type); - if(std::is_same::value) - { - desc.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - } - else - { - desc.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } + tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); }; f_generate_tensor_value(a_m_k, ADataType{}); From 9e8cb769bfd72c4c043c5cd9b799e4fa935d3eb5 Mon Sep 17 00:00:00 2001 From: qinletao Date: Sat, 30 Apr 2022 09:00:04 +0000 Subject: [PATCH 26/32] add acc data type --- test/gemm/gemm_util.hpp | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 17e954b7f2c..937ba0571f6 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -106,6 +106,7 @@ template ::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } - else if(std::is_same::value) - { - res = ck::utils::check_err(c_device.mData, c_host.mData); - std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; - } + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; return res; } @@ -299,6 +289,7 @@ struct TestGemmBF16 // use fp32 host kernel to verify bf16 device kernel using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm Date: Sat, 30 Apr 2022 10:36:08 +0000 Subject: [PATCH 27/32] remove typeid because fp16 --- test/gemm/gemm_util.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 937ba0571f6..fee069b4ddd 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -154,7 +154,6 @@ struct TestGemm auto operator()(DeviceGemmPtr_& gemmPtr) { - std::cout << "Data type: " << typeid(CDataType{}).name() << std::endl; std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name << ", CLayout = " << CLayout{}.name << std::endl; std::cout << gemmPtr->GetTypeString() << std::endl; From 42b0321e113155fc4de20f12c03c92ed60e79ef1 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 02:56:02 +0000 Subject: [PATCH 28/32] fix streamconfig etc bug from merging develop --- example/01_gemm/gemm_xdl_fp64.cpp | 11 +-- example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 98 +++++++++---------- test/gemm/CMakeLists.txt | 6 +- test/gemm/gemm_util.hpp | 6 ++ .../gemm/{gemm_fp64.cpp => gemm_xdl_fp64.cpp} | 1 - 5 files changed, 63 insertions(+), 59 deletions(-) rename test/gemm/{gemm_fp64.cpp => gemm_xdl_fp64.cpp} (96%) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index eb0776b3620..ff05c3d5417 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -12,7 +12,6 @@ #include "host_tensor_generator.hpp" #include "device_tensor.hpp" #include "device_gemm_xdl.hpp" -#include "device_gemm_xdl_c_shuffle.hpp" #include "device_gemm_xdl_cshuffle.hpp" #include "element_wise_operation.hpp" #include "reference_gemm.hpp" @@ -86,8 +85,8 @@ std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix) int main(int argc, char* argv[]) { bool do_verification = 0; - int init_method = 0; - int nrepeat = 5; + int init_method = 0; + bool time_kernel = false; // GEMM shape ck::index_t M = 3840; @@ -102,13 +101,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - nrepeat = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); @@ -201,7 +200,7 @@ int main(int argc, char* argv[]) "not support this GEMM problem"); } - float ave_time = invoker.Run(argument, nrepeat); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); std::size_t flop = std::size_t(2) * M * N * K; std::size_t num_btype = diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index cdda09db42e..7149c4f2779 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -5,7 +5,7 @@ #include "check_err.hpp" #include "config.hpp" -#include "conv_fwd_util.hpp" +#include "conv_util.hpp" #include "device.hpp" #include "device_tensor.hpp" #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp" @@ -134,40 +134,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha ck::utils::conv::ConvParams params; int arg_idx = 5; - params.num_dim_spatial = num_dim_spatial; - params.N = std::stoi(argv[arg_idx++]); - 
-    params.C = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_ = std::stoi(argv[arg_idx++]);
+    params.K_ = std::stoi(argv[arg_idx++]);
+    params.C_ = std::stoi(argv[arg_idx++]);
 
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
    }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
+    }
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
 
     return params;
@@ -181,7 +181,7 @@ int main(int argc, char* argv[])
 
     bool do_verification = 0;
     int init_method = 0;
-    int nrepeat = 5;
+    bool time_kernel = false;
 
     int num_dim_spatial = 2;
 
     ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method = std::stoi(argv[2]);
-        nrepeat = std::stoi(argv[3]);
+        time_kernel = std::stoi(argv[3]);
 
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -199,21 +199,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
 
-    std::vector<ck::index_t> input_dims{static_cast<ck::index_t>(params.N),
-                                        static_cast<ck::index_t>(params.C)};
+    std::vector<ck::index_t> input_dims{static_cast<ck::index_t>(params.N_),
+                                        static_cast<ck::index_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
 
-    std::vector<ck::index_t> filter_dims{static_cast<ck::index_t>(params.K),
-                                         static_cast<ck::index_t>(params.C)};
+    std::vector<ck::index_t> filter_dims{static_cast<ck::index_t>(params.K_),
+                                         static_cast<ck::index_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
 
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<ck::index_t> output_dims{static_cast<ck::index_t>(params.N),
-                                         static_cast<ck::index_t>(params.K)};
+    std::vector<ck::index_t> output_dims{static_cast<ck::index_t>(params.N_),
+                                         static_cast<ck::index_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
 
@@ -259,16 +259,16 @@ int main(int argc, char* argv[])
         conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                   static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                   static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                  params.N,
-                                  params.K,
-                                  params.C,
-                                  params.input_spatial_lengths,
-                                  params.filter_spatial_lengths,
+                                  params.N_,
+                                  params.K_,
+                                  params.C_,
+                                  params.input_spatial_lengths_,
+                                  params.filter_spatial_lengths_,
                                   output_spatial_lengths,
-                                  params.conv_filter_strides,
-                                  params.conv_filter_dilations,
-                                  params.input_left_pads,
-                                  params.input_right_pads,
+                                  params.conv_filter_strides_,
+                                  params.conv_filter_dilations_,
+                                  params.input_left_pads_,
+                                  params.input_right_pads_,
                                   InElementOp{},
                                   WeiElementOp{},
                                   OutElementOp{});
@@ -280,16 +280,16 @@ int main(int argc, char* argv[])
                                 "not support this Conv problem");
     }
 
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
 
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
 
     std::size_t num_btype =
-        get_btype(params.N,
-                  params.C,
-                  params.K,
-                  params.input_spatial_lengths,
-                  params.filter_spatial_lengths,
+        get_btype(params.N_,
+                  params.C_,
+                  params.K_,
+                  params.input_spatial_lengths_,
+                  params.filter_spatial_lengths_,
                   output_spatial_lengths);
 
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -305,10 +305,10 @@ int main(int argc, char* argv[])
         auto ref_argument = ref_conv.MakeArgument(input,
                                                   weights,
                                                   host_output,
-                                                  params.conv_filter_strides,
-                                                  params.conv_filter_dilations,
-                                                  params.input_left_pads,
-                                                  params.input_right_pads,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
                                                   InElementOp{},
                                                   WeiElementOp{},
                                                   OutElementOp{});
diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt
index 014b9e5e0cf..6716bbb1b5c 100644
--- a/test/gemm/CMakeLists.txt
+++ b/test/gemm/CMakeLists.txt
@@ -1,7 +1,7 @@
 # GEMM XDL
-add_test_executable(test_gemm_fp64 gemm_fp64.cpp)
-target_link_libraries(test_gemm_fp64 PRIVATE host_tensor)
-target_link_libraries(test_gemm_fp64 PRIVATE device_gemm_instance)
+add_test_executable(test_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
+target_link_libraries(test_gemm_xdl_fp64 PRIVATE host_tensor)
+target_link_libraries(test_gemm_xdl_fp64 PRIVATE device_gemm_instance)
 
 add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp)
 target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
index e95118351b1..94151f9eed7 100644
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -217,6 +217,12 @@ struct TestGemm
             res = ck::utils::check_err(c_device.mData, c_host.mData);
             std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
         }
+        else if(std::is_same<CDataType, double>::value)
+        {
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
+            std::cout << (res ?
"SUCCESS" : "FAILURE") << std::endl; + } + return res; } diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_xdl_fp64.cpp similarity index 96% rename from test/gemm/gemm_fp64.cpp rename to test/gemm/gemm_xdl_fp64.cpp index 7714290b87d..db37211505d 100644 --- a/test/gemm/gemm_fp64.cpp +++ b/test/gemm/gemm_xdl_fp64.cpp @@ -15,7 +15,6 @@ #include "host_gemm.hpp" #include "device_tensor.hpp" #include "device_gemm_xdl.hpp" -#include "device_gemm_xdl_c_shuffle.hpp" #include "element_wise_operation.hpp" #include "reference_gemm.hpp" #include "gemm_specialization.hpp" From 1df7eba2e083a24d84b4ffc736e03ad3d3e52cc5 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 02:58:23 +0000 Subject: [PATCH 29/32] format --- example/01_gemm/gemm_xdl_fp64.cpp | 6 +++--- example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 2 +- test/gemm/gemm_util.hpp | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index ff05c3d5417..150d547264e 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -85,7 +85,7 @@ std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix) int main(int argc, char* argv[]) { bool do_verification = 0; - int init_method = 0; + int init_method = 0; bool time_kernel = false; // GEMM shape @@ -101,13 +101,13 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); } else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); M = std::stoi(argv[4]); N = std::stoi(argv[5]); diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp index 7149c4f2779..52440e0d5f1 100644 --- a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -190,7 +190,7 @@ int main(int argc, char* argv[]) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); + time_kernel = std::stoi(argv[3]); num_dim_spatial = std::stoi(argv[4]); } diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 94151f9eed7..a3cafa6df16 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -223,7 +223,6 @@ struct TestGemm std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; } - return res; } else From bb4b9f088a5405cdb63ac6bb5d46e26fa3f432c3 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 03:05:01 +0000 Subject: [PATCH 30/32] remove test_gemm_xdl_fp64 --- test/gemm/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt index 6716bbb1b5c..b8679e37157 100644 --- a/test/gemm/CMakeLists.txt +++ b/test/gemm/CMakeLists.txt @@ -1,8 +1,4 @@ # GEMM XDL -add_test_executable(test_gemm_xdl_fp64 gemm_xdl_fp64.cpp) -target_link_libraries(test_gemm_xdl_fp64 PRIVATE host_tensor) -target_link_libraries(test_gemm_xdl_fp64 PRIVATE device_gemm_instance) - add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp) target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor) target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance) From 2b6d3e4c62604327b62b2dcb0500a75cfe6cefd5 Mon Sep 17 00:00:00 2001 From: qinletao Date: Wed, 25 May 2022 06:58:32 +0000 Subject: [PATCH 31/32] add AccDataType --- test/gemm/gemm_dl_fp16.cpp | 11 ++++++++--- test/gemm/gemm_dl_fp32.cpp | 11 ++++++++--- test/gemm/gemm_dl_int8.cpp | 11 ++++++++--- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/test/gemm/gemm_dl_fp16.cpp b/test/gemm/gemm_dl_fp16.cpp index 6165355ec41..8a539372bad 100644 --- a/test/gemm/gemm_dl_fp16.cpp +++ b/test/gemm/gemm_dl_fp16.cpp @@ -43,9 +43,10 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(std::vector Date: Wed, 25 May 2022 08:05:35 +0000 Subject: [PATCH 32/32] AccDataType problem --- example/01_gemm/gemm_dl_fp16.cpp | 2 +- example/01_gemm/gemm_dl_fp32.cpp | 2 +- example/01_gemm/gemm_dl_int8.cpp | 2 +- .../gemm_xdl_requant_relu_requant_int8.cpp | 2 +- example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp | 3 ++- example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp | 1 + 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp index 6e8e04f9e51..63d96a8e991 100644 --- a/example/01_gemm/gemm_dl_fp16.cpp +++ b/example/01_gemm/gemm_dl_fp16.cpp @@ -52,7 +52,7 @@ using DeviceGemmInstance = ck::tensor_operation::device:: // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp index 65c806bf07e..20ca1a4d3d0 100644 --- a/example/01_gemm/gemm_dl_fp32.cpp +++ b/example/01_gemm/gemm_dl_fp32.cpp @@ -51,7 +51,7 @@ using DeviceGemmInstance = ck::tensor_operation::device:: // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp index a9590030c7f..caedb22537b 100644 --- a/example/01_gemm/gemm_dl_int8.cpp +++ b/example/01_gemm/gemm_dl_int8.cpp @@ -49,7 +49,7 @@ using DeviceGemmInstance = ck::tensor_operation::device:: // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; + ReferenceGemm; int main(int argc, char* argv[]) { diff --git a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp index c4bbe3cf4d2..a42df2b7f06 100644 --- a/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp +++ 
b/example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
@@ -103,7 +103,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
 
 using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                         BDataType,
                                                                         CDataType,
+                                                                        AccDataType,
                                                                         AElementOp,
                                                                         BElementOp,
                                                                         CElementOp>;
 
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
index 4d837c4675c..ef3dc03ebc7 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
@@ -32,6 +32,7 @@ using CDataType = F16;
 using ReduceAccDataType = F32;
 using DDataType = F64;
 using DPtrsGlobal = ck::Tuple<DDataType*>;
+using AccDataType = F32;
 
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
@@ -59,7 +60,7 @@ using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_
 // clang-format on
 
 using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
 
 int main(int argc, char* argv[])
 {
diff --git a/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp b/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp
index f818059f4a6..2b58eb20880 100644
--- a/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp
+++ b/example/16_gemm_reduce/gemm_reduce_xdl_sum_squaresum_fp16.cpp
@@ -32,6 +32,7 @@ using CDataType = F16;
 using ReduceAccDataType = F32;
 using DDataType = F32;
 using DPtrsGlobal = ck::Tuple<DDataType*>;
+using AccDataType = F32;
 
 using ALayout = ck::tensor_layout::gemm::RowMajor;
 using BLayout = ck::tensor_layout::gemm::ColumnMajor;
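The common thread in the last several patches is an explicit AccDataType template
parameter threaded through ReferenceGemm and the GEMM tests, so the host reference
accumulates in a type chosen independently of the input and output types. As a
standalone illustration only (not part of the patch series, and deliberately free of
Composable Kernel headers), a minimal reference GEMM with that parameterization might
look like the sketch below; the function and variable names here are hypothetical.

#include <cstddef>
#include <vector>

// Hypothetical sketch of the AccDataType pattern: accumulate A*B in
// AccDataType, convert to CDataType only once per output element.
// Row-major A (M x K), row-major B (K x N), row-major C (M x N).
template <typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AccDataType>
void reference_gemm(const std::vector<ADataType>& a,
                    const std::vector<BDataType>& b,
                    std::vector<CDataType>& c,
                    std::size_t M,
                    std::size_t N,
                    std::size_t K)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        for(std::size_t n = 0; n < N; ++n)
        {
            // Wider accumulator, e.g. float (or double) for fp16/bf16 inputs.
            AccDataType acc = 0;
            for(std::size_t k = 0; k < K; ++k)
            {
                acc += static_cast<AccDataType>(a[m * K + k]) *
                       static_cast<AccDataType>(b[k * N + n]);
            }
            c[m * N + n] = static_cast<CDataType>(acc);
        }
    }
}

// For the fp64 path this series adds, all four types are double:
// reference_gemm<double, double, double, double>(a, b, c, M, N, K);

For the low-precision tests, accumulating the host reference in a wider type keeps it
from losing precision that the device MFMA path retains. The earlier switch of the test
initializer to small integer values (GeneratorTensor_2 over [-5, 5]) plausibly serves
the same end: products and sums of small integers are exact in floating point, so
device and host results can be compared without loose tolerances.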